From 204eabbbbfc84107f9990a891fb27902fc13dd93 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Mon, 22 Feb 2021 11:21:03 +0100 Subject: [PATCH 001/347] inital commit for time series classification --- .../data/time_series_feature_validator.py | 100 ++++++++ .../data/time_series_target_validator.py | 4 + autoPyTorch/data/time_series_validator.py | 53 +++++ .../TimeSeriesTransformer.py | 77 ++++++ .../time_series_preprocessing/__init__.py | 0 .../base_time_series_preprocessing.py | 36 +++ .../scaling/MaxAbsScaler.py | 33 +++ .../scaling/MinMaxScaler.py | 32 +++ .../scaling/NoScaler.py | 56 +++++ .../scaling/StandardScaler.py | 33 +++ .../scaling/__init__.py | 0 .../scaling/base_scaler.py | 31 +++ .../scaling/base_scaler_choice.py | 115 +++++++++ .../scaling/utils.py | 53 +++++ .../time_series_preprocessing/utils.py | 28 +++ .../data_loader/time_series_data_loader.py | 60 +++++ .../pipeline/time_series_classification.py | 225 ++++++++++++++++++ 17 files changed, 936 insertions(+) create mode 100644 autoPyTorch/data/time_series_feature_validator.py create mode 100644 autoPyTorch/data/time_series_target_validator.py create mode 100644 autoPyTorch/data/time_series_validator.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/__init__.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/__init__.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler_choice.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py create mode 100644 autoPyTorch/pipeline/components/training/data_loader/time_series_data_loader.py create mode 100644 autoPyTorch/pipeline/time_series_classification.py diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py new file mode 100644 index 000000000..283e98860 --- /dev/null +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -0,0 +1,100 @@ +import logging +from typing import Optional, Union + +import numpy as np + +import sklearn.utils + +from sklearn.base import BaseEstimator + +from autoPyTorch.data.base_feature_validator import BaseFeatureValidator +from autoPyTorch.utils.logging_ import PicklableClientLogger + + +class TimeSeriesFeatureValidator(BaseEstimator): + def __init__(self, + logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None) -> None: + self.logger = logger + + def fit(self, + X_train: np.ndarray, + X_test: Optional[np.ndarray] = None) -> BaseEstimator: + """ + + Arguments: + 
X_train (np.ndarray): + A set of data that are going to be validated (type and dimensionality + checks) and used for fitting + + X_test (Optional[np.ndarray]): + An optional set of data that is going to be validated + + Returns: + self: + The fitted base estimator + """ + + if not isinstance(X_train, np.ndarray): + raise ValueError(f"Time series train data must be given as a numpy array") + + if X_train.ndim != 3: + raise ValueError(f"Invalid number of dimensions for time series train data, " + f"expected 3 but got {X_train.ndim}. " + f"Time series data has to be of shape [B, T, F] where B is the " + f"batch dimension, T is the time dimension and F are the number of features.") + + _ = sklearn.utils.check_array( + X_train, + force_all_finite=True, + ensure_2d=False, + allow_nd=True, + accept_sparse=False, + accept_large_sparse=False + ) + + if X_test is not None: + if not isinstance(X_test, np.ndarray): + raise ValueError(f"Time series test data must be given as a numpy array") + + if not X_test.ndim == 3: + raise ValueError(f"Invalid number of dimensions for time series test data, " + f"expected 3 but got {X_train.ndim}. " + f"Time series data has to be of shape [B, T, F] where B is the " + f"batch dimension, T is the time dimension and F are the number of features") + + if X_train.shape[1:] != X_test.shape[1:]: + raise ValueError(f"Time series train and test data are expected to have the same shape except for " + f"the batch dimension, but got {X_train.shape} for train data and " + f"{X_test.shape} for test data") + + _ = sklearn.utils.check_array( + X_test, + force_all_finite=True, + ensure_2d=False, + allow_nd=True, + accept_sparse=False, + accept_large_sparse=False + ) + + return self + + def transform(self, X: np.ndarray) -> np.ndarray: + """ + + Arguments: + X (np.ndarray): + A set of data, that is going to be transformed + + Return: + np.ndarray: + The transformed array + """ + + return sklearn.utils.check_array( + X, + force_all_finite=True, + ensure_2d=False, + allow_nd=True, + accept_sparse=False, + accept_large_sparse=False + ) diff --git a/autoPyTorch/data/time_series_target_validator.py b/autoPyTorch/data/time_series_target_validator.py new file mode 100644 index 000000000..50fd9d213 --- /dev/null +++ b/autoPyTorch/data/time_series_target_validator.py @@ -0,0 +1,4 @@ +from autoPyTorch.data.tabular_target_validator import TabularTargetValidator + +# just define an alias for the tabular target validator +TimeSeriesTargetValidator = TabularTargetValidator diff --git a/autoPyTorch/data/time_series_validator.py b/autoPyTorch/data/time_series_validator.py new file mode 100644 index 000000000..7085af1ad --- /dev/null +++ b/autoPyTorch/data/time_series_validator.py @@ -0,0 +1,53 @@ +# -*- encoding: utf-8 -*- +import logging +import typing + +from autoPyTorch.data.base_validator import BaseInputValidator +from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator +from autoPyTorch.data.tabular_target_validator import TabularTargetValidator +from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator +from autoPyTorch.data.time_series_target_validator import TimeSeriesTargetValidator +from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger + + +class TimeSeriesInputValidator(BaseInputValidator): + """ + Makes sure the input data complies with Auto-PyTorch requirements. + + This class also perform checks for data integrity and flags the user + via informative errors. 
+ + Attributes: + is_classification (bool): + For classification task, this flag indicates that the target data + should be encoded + feature_validator (FeatureValidator): + A FeatureValidator instance used to validate and encode feature columns to match + sklearn expectations on the data + target_validator (TargetValidator): + A TargetValidator instance used to validate and encode (in case of classification) + the target values + """ + + def __init__( + self, + is_classification: bool = False, + logger_port: typing.Optional[int] = None, + ) -> None: + self.is_classification = is_classification + self.logger_port = logger_port + if self.logger_port is not None: + self.logger: typing.Union[logging.Logger, PicklableClientLogger] = get_named_client_logger( + name='Validation', + port=self.logger_port, + ) + else: + self.logger = logging.getLogger('Validation') + + self.feature_validator = TimeSeriesFeatureValidator(logger=self.logger) + self.target_validator = TimeSeriesTargetValidator( + is_classification=self.is_classification, + logger=self.logger + ) + + self._is_fitted = False diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py new file mode 100644 index 000000000..764269043 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py @@ -0,0 +1,77 @@ +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import torch +from sklearn.pipeline import make_pipeline, Pipeline + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import \ + autoPyTorchTimeSeriesPreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.utils import get_time_series_preprocessers +from autoPyTorch.utils.common import FitRequirement, subsampler + + +class TimeSeriesTransformer(autoPyTorchTimeSeriesPreprocessingComponent): + + def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): + super().__init__() + self.random_state = random_state + self.preprocessor: Optional[Pipeline] = None + self.add_fit_requirements([ + FitRequirement('numerical_features', (List,), user_defined=True, dataset_property=True), + FitRequirement('categorical_features', (List,), user_defined=True, dataset_property=True)]) + + def fit(self, X: Dict[str, Any], y: Any = None) -> "TimeSeriesTransformer": + """ + Creates a column transformer for the chosen tabular + preprocessors + Args: + X (Dict[str, Any]): fit dictionary + + Returns: + "TabularColumnTransformer": an instance of self + """ + self.check_requirements(X, y) + + preprocessors = get_time_series_preprocessers(X) + + if len(X['dataset_properties']['categorical_features']): + raise ValueError(f"Categorical features are not yet supported for time series") + + numerical_pipeline = make_pipeline(*preprocessors['numerical']) + + self.preprocessor = numerical_pipeline + + # Where to get the data -- Prioritize X_train if any else + # get from backend + if 'X_train' in X: + print(X.keys()) + X_train = subsampler(X['X_train'], X['train_indices']) + else: + X_train = X['backend'].load_datamanager().train_tensors[0] + + self.preprocessor.fit(X_train) + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the time series transformer to fit dictionary + Args: + X (Dict[str, Any]): fit dictionary + + Returns: 
+ X (Dict[str, Any]): updated fit dictionary + """ + X.update({'time_series_transformer': self}) + return X + + def __call__(self, X: Union[np.ndarray, torch.tensor]) -> Union[np.ndarray, torch.tensor]: + + if self.preprocessor is None: + raise ValueError("cant call {} without fitting the column transformer first." + .format(self.__class__.__name__)) + + if len(X.shape) == 2: + # expand batch dimension when called on a single record + X = X[np.newaxis, ...] + + return self.preprocessor.transform(X) diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py new file mode 100644 index 000000000..f1eb389a3 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py @@ -0,0 +1,36 @@ +from typing import Dict, List, Optional, Union + +from sklearn.base import BaseEstimator + +from autoPyTorch.pipeline.components.preprocessing.base_preprocessing import autoPyTorchPreprocessingComponent + + +class autoPyTorchTimeSeriesPreprocessingComponent(autoPyTorchPreprocessingComponent): + """ + Provides abstract interface for time series preprocessing algorithms in AutoPyTorch. + """ + + def __init__(self) -> None: + super().__init__() + self.preprocessor: Union[Dict[str, Optional[BaseEstimator]], BaseEstimator] = dict( + numerical=None, categorical=None) + + def get_preprocessor_dict(self) -> Dict[str, BaseEstimator]: + """ + Returns early_preprocessor dictionary containing the sklearn numerical + and categorical early_preprocessor with "numerical" and "categorical" + keys. 
May contain None for a key if early_preprocessor does not + handle the datatype defined by key + + Returns: + Dict[str, BaseEstimator]: early_preprocessor dictionary + """ + if (self.preprocessor['numerical'] and self.preprocessor['categorical']) is None: + raise AttributeError("{} can't return early_preprocessor dict without fitting first" + .format(self.__class__.__name__)) + return self.preprocessor + + def __str__(self) -> str: + """ Allow a nice understanding of what components where used """ + string = self.__class__.__name__ + return string diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py new file mode 100644 index 000000000..9b384b76a --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py @@ -0,0 +1,33 @@ +from typing import Any, Dict, Optional, Union + +import numpy as np + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import TimeSeriesScaler + + +class MaxAbsScaler(BaseScaler): + """ + Scales numerical features into range [-1, 1] + """ + + def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): + """ + Args: + random_state (Optional[Union[np.random.RandomState, int]]): Determines random number generation + """ + super().__init__() + self.random_state = random_state + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: + self.check_requirements(X, y) + + self.preprocessor['numerical'] = TimeSeriesScaler(mode="max_abs") + return self + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'MaxAbsScaler', + 'name': 'MaxAbsScaler' + } diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py new file mode 100644 index 000000000..a255d86b4 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py @@ -0,0 +1,32 @@ +from typing import Any, Dict, Optional, Tuple, Union + +import numpy as np + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import TimeSeriesScaler + + +class MinMaxScaler(BaseScaler): + """ + Scales numerical features into range [0, 1] + """ + def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): + """ + Args: + random_state (Optional[Union[np.random.RandomState, int]]): Determines random number generation + """ + super().__init__() + self.random_state = random_state + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: + + self.check_requirements(X, y) + self.preprocessor["numerical"] = TimeSeriesScaler(mode="min_max") + return self + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'MinMaxScaler', + 'name': 'MinMaxScaler' + } diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py 
b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py new file mode 100644 index 000000000..ebef7a79b --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py @@ -0,0 +1,56 @@ +from typing import Any, Dict, Optional, Union + +import numpy as np + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler + + +class NoScaler(BaseScaler): + """ + No scaling performed + """ + + def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): + """ + Args: + random_state (Optional[Union[np.random.RandomState, int]]): Determines random number generation + """ + super().__init__() + self.random_state = random_state + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: + """ + The fit function calls the fit function of the underlying model + and returns the transformed array. + Args: + X (np.ndarray): input features + y (Optional[np.ndarray]): input labels + + Returns: + instance of self + """ + + self.check_requirements(X, y) + + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + The transform function calls the transform function of the + underlying model and returns the transformed array. + + Args: + X (np.ndarray): input features + + Returns: + np.ndarray: Transformed features + """ + X.update({'scaler': self.preprocessor}) + return X + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'NoScaler', + 'name': 'No Scaler' + } diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py new file mode 100644 index 000000000..7404a6f62 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py @@ -0,0 +1,33 @@ +from typing import Any, Dict, Optional, Union + +import numpy as np + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import TimeSeriesScaler + + +class StandardScaler(BaseScaler): + """ + Standardise numerical features by removing mean and scaling to unit variance + """ + def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): + """ + Args: + random_state (Optional[Union[np.random.RandomState, int]]): Determines random number generation + """ + super().__init__() + self.random_state = random_state + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: + + self.check_requirements(X, y) + + self.preprocessor['numerical'] = TimeSeriesScaler(mode="standard") + return self + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'StandardScaler', + 'name': 'Standard Scaler' + } diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py 
b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py new file mode 100644 index 000000000..2567032b0 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py @@ -0,0 +1,31 @@ +from typing import Any, Dict, List + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( + autoPyTorchTimeSeriesPreprocessingComponent +) +from autoPyTorch.utils.common import FitRequirement + + +class BaseScaler(autoPyTorchTimeSeriesPreprocessingComponent): + """ + Provides abstract class interface for time series scalers in AutoPytorch + """ + + def __init__(self) -> None: + super().__init__() + self.add_fit_requirements([ + FitRequirement('numerical_features', (List,), user_defined=True, dataset_property=True)]) + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the fitted scalar into the 'X' dictionary and returns it. + Args: + X (Dict[str, Any]): 'X' dictionary + + Returns: + (Dict[str, Any]): the updated 'X' dictionary + """ + if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: + raise ValueError(f"can not call transform on {self.__class__.__name__} without fitting first.") + X.update({'scaler': self.preprocessor}) + return X diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler_choice.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler_choice.py new file mode 100644 index 000000000..3bd90d5e3 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler_choice.py @@ -0,0 +1,115 @@ +import os +from collections import OrderedDict +from typing import Any, Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler + +scaling_directory = os.path.split(__file__)[0] +_scalers = find_components(__package__, + scaling_directory, + BaseScaler) + +_addons = ThirdPartyComponents(BaseScaler) + + +def add_scaler(scaler: BaseScaler) -> None: + _addons.add_component(scaler) + + +class ScalerChoice(autoPyTorchChoice): + """ + Allows for dynamically choosing scaling component at runtime + """ + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available scaler components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all BaseScalers components available + as choices for scaling + """ + components = OrderedDict() + components.update(_scalers) + components.update(_addons.components) + return components + + def get_hyperparameter_search_space(self, + dataset_properties: Optional[Dict[str, Any]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None) -> ConfigurationSpace: + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = dict() + + dataset_properties = {**self.dataset_properties, **dataset_properties} + + available_scalers = self.get_available_components(dataset_properties=dataset_properties, + include=include, + exclude=exclude) + 
+ if len(available_scalers) == 0: + raise ValueError("no scalers found, please add a scaler") + + if default is None: + defaults = ['StandardScaler', 'MinMaxScaler', 'MaxAbsScaler', 'NoScaler'] + for default_ in defaults: + if default_ in available_scalers: + default = default_ + break + + # add only no scaler to choice hyperparameters in case the dataset is only categorical + if len(dataset_properties['numerical_features']) == 0: + default = 'NoScaler' + if include is not None and default not in include: + raise ValueError("Provided {} in include, however, " + "the dataset is incompatible with it".format(include)) + preprocessor = CSH.CategoricalHyperparameter('__choice__', + ['NoScaler'], + default_value=default) + else: + preprocessor = CSH.CategoricalHyperparameter('__choice__', + list(available_scalers.keys()), + default_value=default) + cs.add_hyperparameter(preprocessor) + + # add only child hyperparameters of early_preprocessor choices + for name in preprocessor.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_scalers[name].get_hyperparameter_search_space(dataset_properties, # type:ignore + **updates) + parent_hyperparameter = {'parent': preprocessor, 'value': name} + cs.add_configuration_space(name, config_space, + parent_hyperparameter=parent_hyperparameter) + + self.configuration_space = cs + self.dataset_properties = dataset_properties + return cs + + def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None: + """ + A mechanism in code to ensure the correctness of the fit dictionary + It recursively makes sure that the children and parent level requirements + are honored before fit. + Args: + dataset_properties: + + """ + super()._check_dataset_properties(dataset_properties) + print(dataset_properties) + assert 'numerical_features' in dataset_properties and \ + 'categorical_features' in dataset_properties, \ + "Dataset properties must contain information about the type of features" diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py new file mode 100644 index 000000000..f45e6faec --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -0,0 +1,53 @@ +from typing import Any + +import numpy as np +import sklearn + +from sklearn.base import BaseEstimator + + +# Similar to / inspired by +# https://github.com/tslearn-team/tslearn/blob/a3cf3bf/tslearn/preprocessing/preprocessing.py +class TimeSeriesScaler(BaseEstimator): + def __init__(self, mode: str): + self.mode = mode + + def fit(self, X: np.ndarray, y: Any = None): + """ + For time series we do not need to fit anything since each time series is scaled individually + """ + pass + + def transform(self, X: np.ndarray): + X: np.ndarray = sklearn.utils.check_array( + X, + force_all_finite=True, + ensure_2d=False, + allow_nd=True, + accept_sparse=False, + accept_large_sparse=False + ) + + if self.mode == "standard": + mean_ = np.mean(X, axis=1, keepdims=True) + std_ = np.std(X, axis=1, keepdims=True) + std_[std_ == 0.0] = 1.0 + + return (X - mean_) / std_ + + elif self.mode == "min_max": + min_ = np.min(X, axis=1, keepdims=True) + max_ = np.max(X, axis=1, keepdims=True) + diff_ = max_ - min_ + diff_[diff_ == 0.0] = 1.0 + + return (X - min_) / diff_ + + elif self.mode == "max_abs": + max_abs_ = np.max(np.abs(X), axis=1, keepdims=True) + max_abs_[max_abs_ == 0.0] = 1.0 + + 
return X / max_abs_ + + else: + raise ValueError(f"Unknown mode {self.mode} for time series scaler") diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py new file mode 100644 index 000000000..7072f001c --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py @@ -0,0 +1,28 @@ +from typing import Any, Dict, List + +from sklearn.base import BaseEstimator + + +def get_time_series_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstimator]]: + """ + Expects fit_dictionary(X) to have numerical/categorical preprocessors + (fitted numerical/categorical preprocessing nodes) that will build a pipeline in the TimeSeriesTransformer. + This function parses X and extracts such components. + Creates a dictionary with two keys, + numerical- containing list of numerical preprocessors + categorical- containing list of categorical preprocessors + Args: + X: fit dictionary + Returns: + (Dict[str, List[BaseEstimator]]): dictionary with list of numerical and categorical preprocessors + """ + preprocessor = dict(numerical=list(), categorical=list()) # type: Dict[str, List[BaseEstimator]] + for key, value in X.items(): + if isinstance(value, dict): + # as each preprocessor is child of BaseEstimator + if 'numerical' in value and isinstance(value['numerical'], BaseEstimator): + preprocessor['numerical'].append(value['numerical']) + if 'categorical' in value and isinstance(value['categorical'], BaseEstimator): + preprocessor['categorical'].append(value['categorical']) + + return preprocessor diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_data_loader.py new file mode 100644 index 000000000..5ea83b8dd --- /dev/null +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_data_loader.py @@ -0,0 +1,60 @@ +from typing import Any, Callable, Dict, List + +import torch + +import torchvision + +from autoPyTorch.pipeline.components.training.data_loader.base_data_loader import BaseDataLoaderComponent + + +class TimeSeriesDataLoader(BaseDataLoaderComponent): + """This class is an interface to the PyTorch Dataloader. + + Particularly, this data loader builds transformations for + tabular data. + + """ + + def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transforms.Compose: + """ + Method to build a transformation that can pre-process input data + + Args: + X (X: Dict[str, Any]): Dependencies needed by current component to perform fit + mode (str): train/val/test + + Returns: + A composition of transformations + """ + + if mode not in ['train', 'val', 'test']: + raise ValueError("Unsupported mode provided {}. 
".format(mode)) + + # In the case of time series data, the options currently available + # for transformations are: + # + scaler + # This transformations apply for both train/val/test, so no + # distinction is performed + candidate_transformations = [] # type: List[Callable] + + if 'test' in mode or not X['dataset_properties']['is_small_preprocess']: + candidate_transformations.extend(X['preprocess_transforms']) + + # Transform to tensor + candidate_transformations.append(torch.from_numpy) + + return torchvision.transforms.Compose(candidate_transformations) + + def _check_transform_requirements(self, X: Dict[str, Any], y: Any = None) -> None: + """ + + Makes sure that the fit dictionary contains the required transformations + that the dataset should go through + + Args: + X (Dict[str, Any]): Dictionary with fitted parameters. It is a message passing + mechanism, in which during a transform, a components adds relevant information + so that further stages can be properly fitted + """ + if not X['dataset_properties']['is_small_preprocess'] and 'preprocess_transforms' not in X: + raise ValueError("Cannot find the preprocess_transforms in the fit dictionary") diff --git a/autoPyTorch/pipeline/time_series_classification.py b/autoPyTorch/pipeline/time_series_classification.py new file mode 100644 index 000000000..04c987763 --- /dev/null +++ b/autoPyTorch/pipeline/time_series_classification.py @@ -0,0 +1,225 @@ +import warnings +from typing import Any, Dict, List, Optional, Tuple + +import numpy as np +import sklearn.preprocessing +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +from sklearn.base import ClassifierMixin + +from autoPyTorch.pipeline.base_pipeline import BasePipeline +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ + TimeSeriesTransformer +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ + ScalerChoice +from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing +from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice import SchedulerChoice +from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent +from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone_choice import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice import NetworkHeadChoice +from autoPyTorch.pipeline.components.setup.network_initializer.base_network_init_choice import ( + NetworkInitializerChoice +) +from autoPyTorch.pipeline.components.setup.optimizer.base_optimizer_choice import OptimizerChoice +from autoPyTorch.pipeline.components.training.data_loader.time_series_data_loader import TimeSeriesDataLoader +from autoPyTorch.pipeline.components.training.trainer.base_trainer_choice import ( + TrainerChoice +) +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + + +class TimeSeriesClassificationPipeline(ClassifierMixin, BasePipeline): + """This class is a proof of concept to integrate AutoSklearn Components + + It implements a pipeline, which includes as steps: + + ->One preprocessing step + ->One neural network + + Contrary to the sklearn API it is not possible to enumerate the + possible parameters in the __init__ function because we only know the + available 
classifiers at runtime. For this reason the user must + specifiy the parameters by passing an instance of + ConfigSpace.configuration_space.Configuration. + + + Args: + config (Configuration) + The configuration to evaluate. + random_state (Optional[RandomState): random_state is the random number generator + + Attributes: + Examples + """ + + def __init__( + self, + config: Optional[Configuration] = None, + steps: Optional[List[Tuple[str, autoPyTorchChoice]]] = None, + dataset_properties: Optional[Dict[str, Any]] = None, + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + random_state: Optional[np.random.RandomState] = None, + init_params: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ): + super().__init__( + config, steps, dataset_properties, include, exclude, + random_state, init_params, search_space_updates) + + def _predict_proba(self, X: np.ndarray) -> np.ndarray: + # Pre-process X + loader = self.named_steps['data_loader'].get_loader(X=X) + pred = self.named_steps['network'].predict(loader) + if isinstance(self.dataset_properties['output_shape'], int): + proba = pred[:, :self.dataset_properties['output_shape']] + normalizer = proba.sum(axis=1)[:, np.newaxis] + normalizer[normalizer == 0.0] = 1.0 + proba /= normalizer + + return proba + + else: + all_proba = [] + + for k in range(self.dataset_properties['output_shape']): + proba_k = pred[:, k, :self.dataset_properties['output_shape'][k]] + normalizer = proba_k.sum(axis=1)[:, np.newaxis] + normalizer[normalizer == 0.0] = 1.0 + proba_k /= normalizer + all_proba.append(proba_k) + + return all_proba + + def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: + """predict_proba. + + Args: + X (np.ndarray): input to the pipeline, from which to guess targets + batch_size (Optional[int]): batch_size controls whether the pipeline + will be called on small chunks of the data. Useful when calling the + predict method on the whole array X results in a MemoryError. + Returns: + np.ndarray: Probabilities of the target being certain class + """ + if batch_size is None: + y = self._predict_proba(X) + + else: + if not isinstance(batch_size, int): + raise ValueError("Argument 'batch_size' must be of type int, " + "but is '%s'" % type(batch_size)) + if batch_size <= 0: + raise ValueError("Argument 'batch_size' must be positive, " + "but is %d" % batch_size) + + else: + # Probe for the target array dimensions + target = self.predict_proba(X[0:2].copy()) + + y = np.zeros((X.shape[0], target.shape[1]), + dtype=np.float32) + + for k in range(max(1, int(np.ceil(float(X.shape[0]) / batch_size)))): + batch_from = k * batch_size + batch_to = min([(k + 1) * batch_size, X.shape[0]]) + pred_prob = self.predict_proba(X[batch_from:batch_to], batch_size=None) + y[batch_from:batch_to] = pred_prob.astype(np.float32) + + # Neural networks might not be fit to produce a [0-1] output + # For instance, after small number of epochs. + y = np.clip(y, 0, 1) + y = sklearn.preprocessing.normalize(y, axis=1, norm='l1') + + return y + + def _get_hyperparameter_search_space(self, + dataset_properties: Dict[str, Any], + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + ) -> ConfigurationSpace: + """Create the hyperparameter configuration space. + + For the given steps, and the Choices within that steps, + this procedure returns a configuration space object to + explore. 
+ + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Characteristics + of the dataset to guide the pipeline choices of components + + Returns: + cs (Configuration): The configuration space describing + the SimpleRegressionClassifier. + """ + cs = ConfigurationSpace() + + if dataset_properties is None or not isinstance(dataset_properties, dict): + if not isinstance(dataset_properties, dict): + warnings.warn('The given dataset_properties argument contains an illegal value.' + 'Proceeding with the default value') + dataset_properties = dict() + + if 'target_type' not in dataset_properties: + dataset_properties['target_type'] = 'time_series_classification' + if dataset_properties['target_type'] != 'time_series_classification': + warnings.warn('Tabular classification is being used, however the target_type' + 'is not given as "time_series_classification". Overriding it.') + dataset_properties['target_type'] = 'time_series_classification' + # get the base search space given this + # dataset properties. Then overwrite with custom + # classification requirements + cs = self._get_base_search_space( + cs=cs, dataset_properties=dataset_properties, + exclude=exclude, include=include, pipeline=self.steps) + + # Here we add custom code, like this with this + # is not a valid configuration + + self.configuration_space = cs + self.dataset_properties = dataset_properties + return cs + + def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], + ) -> List[Tuple[str, autoPyTorchChoice]]: + """ + Defines what steps a pipeline should follow. + The step itself has choices given via autoPyTorchChoice. + + Returns: + List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised + by the pipeline. + """ + steps = [] # type: List[Tuple[str, autoPyTorchChoice]] + + default_dataset_properties = {'target_type': 'time_series_classification'} + if dataset_properties is not None: + default_dataset_properties.update(dataset_properties) + + steps.extend([ + ("scaler", ScalerChoice(default_dataset_properties)), + ("preprocessing", EarlyPreprocessing()), + ("time_series_transformer", TimeSeriesTransformer()), + ("network_backbone", NetworkBackboneChoice(default_dataset_properties)), + ("network_head", NetworkHeadChoice(default_dataset_properties)), + ("network", NetworkComponent()), + ("network_init", NetworkInitializerChoice(default_dataset_properties)), + ("optimizer", OptimizerChoice(default_dataset_properties)), + ("lr_scheduler", SchedulerChoice(default_dataset_properties)), + ("data_loader", TimeSeriesDataLoader()), + ("trainer", TrainerChoice(default_dataset_properties)), + ]) + return steps + + def _get_estimator_hyperparameter_name(self) -> str: + """ + Returns the name of the current estimator. 
+ + Returns: + str: name of the pipeline type + """ + return "time_series_classifier" From a7e500a78fe656282e668b20fe225c3e882ece42 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Mon, 22 Feb 2021 11:34:25 +0100 Subject: [PATCH 002/347] intermediate mypy and flake fix --- autoPyTorch/data/time_series_feature_validator.py | 6 ++---- autoPyTorch/data/time_series_validator.py | 2 -- .../time_series_preprocessing/TimeSeriesTransformer.py | 7 ++++--- .../base_time_series_preprocessing.py | 2 +- .../time_series_preprocessing/scaling/MinMaxScaler.py | 2 +- .../scaling/base_scaler_choice.py | 1 - .../time_series_preprocessing/scaling/utils.py | 10 +++++----- autoPyTorch/pipeline/time_series_classification.py | 4 +++- 8 files changed, 16 insertions(+), 18 deletions(-) diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index 283e98860..dee284f1f 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -4,10 +4,8 @@ import numpy as np import sklearn.utils - from sklearn.base import BaseEstimator -from autoPyTorch.data.base_feature_validator import BaseFeatureValidator from autoPyTorch.utils.logging_ import PicklableClientLogger @@ -35,7 +33,7 @@ def fit(self, """ if not isinstance(X_train, np.ndarray): - raise ValueError(f"Time series train data must be given as a numpy array") + raise ValueError("Time series train data must be given as a numpy array") if X_train.ndim != 3: raise ValueError(f"Invalid number of dimensions for time series train data, " @@ -54,7 +52,7 @@ def fit(self, if X_test is not None: if not isinstance(X_test, np.ndarray): - raise ValueError(f"Time series test data must be given as a numpy array") + raise ValueError("Time series test data must be given as a numpy array") if not X_test.ndim == 3: raise ValueError(f"Invalid number of dimensions for time series test data, " diff --git a/autoPyTorch/data/time_series_validator.py b/autoPyTorch/data/time_series_validator.py index 7085af1ad..62be3318f 100644 --- a/autoPyTorch/data/time_series_validator.py +++ b/autoPyTorch/data/time_series_validator.py @@ -3,8 +3,6 @@ import typing from autoPyTorch.data.base_validator import BaseInputValidator -from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator -from autoPyTorch.data.tabular_target_validator import TabularTargetValidator from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator from autoPyTorch.data.time_series_target_validator import TimeSeriesTargetValidator from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py index 764269043..28f9b5aa0 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py @@ -1,8 +1,10 @@ from typing import Any, Dict, List, Optional, Union import numpy as np + +from sklearn.pipeline import Pipeline, make_pipeline + import torch -from sklearn.pipeline import make_pipeline, Pipeline from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import \ autoPyTorchTimeSeriesPreprocessingComponent @@ -35,7 +37,7 @@ def fit(self, X: Dict[str, 
Any], y: Any = None) -> "TimeSeriesTransformer": preprocessors = get_time_series_preprocessers(X) if len(X['dataset_properties']['categorical_features']): - raise ValueError(f"Categorical features are not yet supported for time series") + raise ValueError("Categorical features are not yet supported for time series") numerical_pipeline = make_pipeline(*preprocessors['numerical']) @@ -44,7 +46,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TimeSeriesTransformer": # Where to get the data -- Prioritize X_train if any else # get from backend if 'X_train' in X: - print(X.keys()) X_train = subsampler(X['X_train'], X['train_indices']) else: X_train = X['backend'].load_datamanager().train_tensors[0] diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py index f1eb389a3..0f8966ac0 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Union +from typing import Dict, Optional, Union from sklearn.base import BaseEstimator diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py index a255d86b4..1ae908efd 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Union import numpy as np diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler_choice.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler_choice.py index 3bd90d5e3..5825db2fa 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler_choice.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler_choice.py @@ -109,7 +109,6 @@ def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None: """ super()._check_dataset_properties(dataset_properties) - print(dataset_properties) assert 'numerical_features' in dataset_properties and \ 'categorical_features' in dataset_properties, \ "Dataset properties must contain information about the type of features" diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index f45e6faec..f8d6f5e5a 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -1,8 +1,8 @@ from typing import Any import numpy as np -import sklearn +import sklearn from sklearn.base import BaseEstimator @@ -12,14 +12,14 @@ class TimeSeriesScaler(BaseEstimator): def __init__(self, mode: str): self.mode = mode - def fit(self, X: np.ndarray, y: Any = None): + def fit(self, X: np.ndarray, y: Any = None) -> "TimeSeriesScaler": """ For time series we do not 
need to fit anything since each time series is scaled individually """ - pass + return self - def transform(self, X: np.ndarray): - X: np.ndarray = sklearn.utils.check_array( + def transform(self, X: np.ndarray) -> np.ndarray: + X = sklearn.utils.check_array( X, force_all_finite=True, ensure_2d=False, diff --git a/autoPyTorch/pipeline/time_series_classification.py b/autoPyTorch/pipeline/time_series_classification.py index 04c987763..9ace78f17 100644 --- a/autoPyTorch/pipeline/time_series_classification.py +++ b/autoPyTorch/pipeline/time_series_classification.py @@ -1,9 +1,11 @@ import warnings from typing import Any, Dict, List, Optional, Tuple +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace + import numpy as np + import sklearn.preprocessing -from ConfigSpace.configuration_space import Configuration, ConfigurationSpace from sklearn.base import ClassifierMixin from autoPyTorch.pipeline.base_pipeline import BasePipeline From 26af0a8a113a30c1c5cab0f61746cc9e041d6cc6 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Mon, 22 Feb 2021 12:41:53 +0100 Subject: [PATCH 003/347] added time series classification task and example --- autoPyTorch/api/tabular_regression.py | 7 +- autoPyTorch/api/time_series_classification.py | 268 ++++++++++++++++++ autoPyTorch/constants.py | 2 + autoPyTorch/datasets/time_series_dataset.py | 170 ++++++----- .../scaling/base_scaler_choice.py | 6 +- .../network_backbone/base_network_backbone.py | 8 +- autoPyTorch/utils/pipeline.py | 33 ++- examples/example_tabular_regression.py | 3 +- .../example_time_series_classification.py | 101 +++++++ 9 files changed, 517 insertions(+), 81 deletions(-) create mode 100644 autoPyTorch/api/time_series_classification.py create mode 100644 examples/example_time_series_classification.py diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index 394a7230f..57b9a6394 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -114,7 +114,7 @@ def search(self, budget: Optional[float] = None, total_walltime_limit: int = 100, func_eval_time_limit: int = 60, - traditional_per_total_budget: float = 0.1, + traditional_per_total_budget: float = 0., memory_limit: Optional[int] = 4096, smac_scenario_args: Optional[Dict[str, Any]] = None, get_smac_object_callback: Optional[Callable] = None, @@ -213,6 +213,11 @@ def search(self, resampling_strategy_args=self.resampling_strategy_args, ) + if traditional_per_total_budget > 0.: + self._logger.warning("Tabular regression for now does not support traditional classifiers. " + "Setting traditional_per_total_budget to 0.") + traditional_per_total_budget = 0. 
+ return self._search( dataset=self.dataset, optimize_metric=optimize_metric, diff --git a/autoPyTorch/api/time_series_classification.py b/autoPyTorch/api/time_series_classification.py new file mode 100644 index 000000000..b9597cf9e --- /dev/null +++ b/autoPyTorch/api/time_series_classification.py @@ -0,0 +1,268 @@ +import os +import uuid +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np + +import pandas as pd + +from autoPyTorch.api.base_task import BaseTask +from autoPyTorch.constants import ( + TASK_TYPES_TO_STRING, + TIMESERIES_CLASSIFICATION +) +from autoPyTorch.data.time_series_validator import TimeSeriesInputValidator +from autoPyTorch.datasets.base_dataset import BaseDataset +from autoPyTorch.datasets.resampling_strategy import ( + CrossValTypes, + HoldoutValTypes, +) +from autoPyTorch.datasets.time_series_dataset import TimeSeriesDataset +from autoPyTorch.pipeline.time_series_classification import TimeSeriesClassificationPipeline +from autoPyTorch.utils.backend import Backend +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + + +class TimeSeriesClassificationTask(BaseTask): + """ + Time Series Classification API to the pipelines. + Args: + seed (int): seed to be used for reproducibility. + n_jobs (int), (default=1): number of consecutive processes to spawn. + logging_config (Optional[Dict]): specifies configuration + for logging, if None, it is loaded from the logging.yaml + ensemble_size (int), (default=50): Number of models added to the ensemble built by + Ensemble selection from libraries of models. + Models are drawn with replacement. + ensemble_nbest (int), (default=50): only consider the ensemble_nbest + models to build the ensemble + max_models_on_disc (int), (default=50): maximum number of models saved to disc. + Also, controls the size of the ensemble as any additional models will be deleted. + Must be greater than or equal to 1. + temporary_directory (str): folder to store configuration output and log file + output_directory (str): folder to store predictions for optional test set + delete_tmp_folder_after_terminate (bool): determines whether to delete the temporary directory, + when finished + include_components (Optional[Dict]): If None, all possible components are used. + Otherwise specifies set of components to use. + exclude_components (Optional[Dict]): If None, all possible components are used. + Otherwise specifies set of components not to use. 
Incompatible with include + components + """ + def __init__( + self, + seed: int = 1, + n_jobs: int = 1, + logging_config: Optional[Dict] = None, + ensemble_size: int = 50, + ensemble_nbest: int = 50, + max_models_on_disc: int = 50, + temporary_directory: Optional[str] = None, + output_directory: Optional[str] = None, + delete_tmp_folder_after_terminate: bool = True, + delete_output_folder_after_terminate: bool = True, + include_components: Optional[Dict] = None, + exclude_components: Optional[Dict] = None, + resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + backend: Optional[Backend] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ): + super().__init__( + seed=seed, + n_jobs=n_jobs, + logging_config=logging_config, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + max_models_on_disc=max_models_on_disc, + temporary_directory=temporary_directory, + output_directory=output_directory, + delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate, + delete_output_folder_after_terminate=delete_output_folder_after_terminate, + include_components=include_components, + exclude_components=exclude_components, + backend=backend, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + search_space_updates=search_space_updates, + task_type=TASK_TYPES_TO_STRING[TIMESERIES_CLASSIFICATION], + ) + + def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]: + if not isinstance(dataset, TimeSeriesDataset): + raise ValueError("Dataset is incompatible for the given task,: {}".format( + type(dataset) + )) + return {'task_type': dataset.task_type, + 'output_type': dataset.output_type, + 'issparse': dataset.issparse, + 'numerical_features': dataset.numerical_features, + 'categorical_features': dataset.categorical_features} + + def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TimeSeriesClassificationPipeline: + return TimeSeriesClassificationPipeline(dataset_properties=dataset_properties) + + def search( + self, + optimize_metric: str, + X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + dataset_name: Optional[str] = None, + budget_type: Optional[str] = None, + budget: Optional[float] = None, + total_walltime_limit: int = 100, + func_eval_time_limit: int = 60, + traditional_per_total_budget: float = 0., + memory_limit: Optional[int] = 4096, + smac_scenario_args: Optional[Dict[str, Any]] = None, + get_smac_object_callback: Optional[Callable] = None, + all_supported_metrics: bool = True, + precision: int = 32, + disable_file_output: List = [], + load_models: bool = True, + ) -> 'BaseTask': + """ + Search for the best pipeline configuration for the given dataset. + + Fit both optimizes the machine learning models and builds an ensemble out of them. + To disable ensembling, set ensemble_size==0. + using the optimizer. + Args: + X_train, y_train, X_test, y_test: Union[np.ndarray, List, pd.DataFrame] + A pair of features (X_train) and targets (y_train) used to fit a + pipeline. Additionally, a holdout of this pairs (X_test, y_test) can + be provided to track the generalization performance of each stage. 
+ optimize_metric (str): name of the metric that is used to + evaluate a pipeline. + budget_type (Optional[str]): + Type of budget to be used when fitting the pipeline. + Either 'epochs' or 'runtime'. If not provided, uses + the default in the pipeline config ('epochs') + budget (Optional[float]): + Budget to fit a single run of the pipeline. If not + provided, uses the default in the pipeline config + total_walltime_limit (int), (default=100): Time limit + in seconds for the search of appropriate models. + By increasing this value, autopytorch has a higher + chance of finding better models. + func_eval_time_limit (int), (default=60): Time limit + for a single call to the machine learning model. + Model fitting will be terminated if the machine + learning algorithm runs over the time limit. Set + this value high enough so that typical machine + learning algorithms can be fit on the training + data. + traditional_per_total_budget (float), (default=0.1): + Percent of total walltime to be allocated for + running traditional classifiers. + memory_limit (Optional[int]), (default=4096): Memory + limit in MB for the machine learning algorithm. autopytorch + will stop fitting the machine learning algorithm if it tries + to allocate more than memory_limit MB. If None is provided, + no memory limit is set. In case of multi-processing, memory_limit + will be per job. This memory limit also applies to the ensemble + creation process. + smac_scenario_args (Optional[Dict]): Additional arguments inserted + into the scenario of SMAC. See the + [SMAC documentation] (https://automl.github.io/SMAC3/master/options.html?highlight=scenario#scenario) + get_smac_object_callback (Optional[Callable]): Callback function + to create an object of class + [smac.optimizer.smbo.SMBO](https://automl.github.io/SMAC3/master/apidoc/smac.optimizer.smbo.html). + The function must accept the arguments scenario_dict, + instances, num_params, runhistory, seed and ta. This is + an advanced feature. Use only if you are familiar with + [SMAC](https://automl.github.io/SMAC3/master/index.html). + all_supported_metrics (bool), (default=True): if True, all + metrics supporting current task will be calculated + for each pipeline and results will be available via cv_results + precision (int), (default=32): Numeric precision used when loading + ensemble data. Can be either '16', '32' or '64'. + disable_file_output (Union[bool, List]): + load_models (bool), (default=True): Whether to load the + models after fitting AutoPyTorch. 
+ + Returns: + self + + """ + if dataset_name is None: + dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) + + # we have to create a logger for at this point for the validator + self._logger = self._get_logger(dataset_name) + + # Create a validator object to make sure that the data provided by + # the user matches the autopytorch requirements + self.InputValidator = TimeSeriesInputValidator( + is_classification=True, + logger_port=self._logger_port, + ) + + # Fit a input validator to check the provided data + # Also, an encoder is fit to both train and test data, + # to prevent unseen categories during inference + self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + + self.dataset = TimeSeriesDataset( + X=X_train, Y=y_train, + X_test=X_test, Y_test=y_test, + validator=self.InputValidator, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + ) + + if traditional_per_total_budget > 0.: + self._logger.warning("Time series classification for now does not support traditional classifiers. " + "Setting traditional_per_total_budget to 0.") + traditional_per_total_budget = 0. + + return self._search( + dataset=self.dataset, + optimize_metric=optimize_metric, + budget_type=budget_type, + budget=budget, + total_walltime_limit=total_walltime_limit, + func_eval_time_limit=func_eval_time_limit, + traditional_per_total_budget=traditional_per_total_budget, + memory_limit=memory_limit, + smac_scenario_args=smac_scenario_args, + get_smac_object_callback=get_smac_object_callback, + all_supported_metrics=all_supported_metrics, + precision=precision, + disable_file_output=disable_file_output, + load_models=load_models, + ) + + def predict( + self, + X_test: np.ndarray, + batch_size: Optional[int] = None, + n_jobs: int = 1 + ) -> np.ndarray: + if self.InputValidator is None or not self.InputValidator._is_fitted: + raise ValueError("predict() is only supported after calling search. Kindly call first " + "the estimator fit() method.") + + X_test = self.InputValidator.feature_validator.transform(X_test) + predicted_probabilities = super().predict(X_test, batch_size=batch_size, + n_jobs=n_jobs) + + if self.InputValidator.target_validator.is_single_column_target(): + predicted_indexes = np.argmax(predicted_probabilities, axis=1) + else: + predicted_indexes = (predicted_probabilities > 0.5).astype(int) + + # Allow to predict in the original domain -- that is, the user is not interested + # in our encoded values + return self.InputValidator.target_validator.inverse_transform(predicted_indexes) + + def predict_proba(self, + X_test: Union[np.ndarray, pd.DataFrame, List], + batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray: + if self.InputValidator is None or not self.InputValidator._is_fitted: + raise ValueError("predict() is only supported after calling search. 
Kindly call first " + "the estimator fit() method.") + X_test = self.InputValidator.feature_validator.transform(X_test) + return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py index 652a546b9..6680423a3 100644 --- a/autoPyTorch/constants.py +++ b/autoPyTorch/constants.py @@ -10,6 +10,8 @@ TABULAR_TASKS = [TABULAR_CLASSIFICATION, TABULAR_REGRESSION] IMAGE_TASKS = [IMAGE_CLASSIFICATION, IMAGE_REGRESSION] +TIMESERIES_TASKS = [TIMESERIES_CLASSIFICATION, TIMESERIES_REGRESSION] + TASK_TYPES = REGRESSION_TASKS + CLASSIFICATION_TASKS TASK_TYPES_TO_STRING = \ diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 7b0435d19..50d8fc8e8 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -1,15 +1,20 @@ -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np +import pandas as pd + import torchvision.transforms +from autoPyTorch.constants import CLASSIFICATION_OUTPUTS, REGRESSION_OUTPUTS, STRING_TO_OUTPUT_TYPES, \ + TASK_TYPES_TO_STRING, TIMESERIES_CLASSIFICATION, TIMESERIES_REGRESSION +from autoPyTorch.data.base_validator import BaseInputValidator from autoPyTorch.datasets.base_dataset import BaseDataset from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, get_cross_validators, - get_holdout_validators + get_holdout_validators, is_stratified ) TIME_SERIES_FORECASTING_INPUT = Tuple[np.ndarray, np.ndarray] # currently only numpy arrays are supported @@ -95,8 +100,8 @@ def _prepare_time_series_forecasting_tensor(tensor: TIME_SERIES_FORECASTING_INPU population_size, time_series_length, num_features = tensor[0].shape num_targets = len(target_variables) num_datapoints = time_series_length - sequence_length - n_steps + 1 - x_tensor = np.zeros((num_datapoints, population_size, sequence_length, num_features), dtype=np.float) - y_tensor = np.zeros((num_datapoints, population_size, num_targets), dtype=np.float) + x_tensor = np.zeros((num_datapoints, population_size, sequence_length, num_features), dtype=np.float32) + y_tensor = np.zeros((num_datapoints, population_size, num_targets), dtype=np.float32) for p in range(population_size): for i in range(num_datapoints): @@ -109,66 +114,99 @@ def _prepare_time_series_forecasting_tensor(tensor: TIME_SERIES_FORECASTING_INPU return x_tensor, y_tensor -class TimeSeriesClassificationDataset(BaseDataset): +class TimeSeriesDataset(BaseDataset): + """ + Common dataset for time series classification and regression data + Args: + X (np.ndarray): input training data. + Y (Union[np.ndarray, pd.Series]): training data targets. + X_test (Optional[np.ndarray]): input testing data. + Y_test (Optional[Union[np.ndarray, pd.DataFrame]]): testing data targets + resampling_strategy (Union[CrossValTypes, HoldoutValTypes]), + (default=HoldoutValTypes.holdout_validation): + strategy to split the training data. + resampling_strategy_args (Optional[Dict[str, Any]]): arguments + required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + shuffle: Whether to shuffle the data before performing splits + seed (int), (default=1): seed to be used for reproducibility. + train_transforms (Optional[torchvision.transforms.Compose]): + Additional Transforms to be applied to the training data. 
+ val_transforms (Optional[torchvision.transforms.Compose]): + Additional Transforms to be applied to the validation/test data. + + Notes: Support for Numpy Arrays is missing Strings. + + """ + def __init__(self, - train: TIME_SERIES_CLASSIFICATION_INPUT, - val: Optional[TIME_SERIES_CLASSIFICATION_INPUT] = None): - _check_time_series_inputs(train=train, - val=val, - task_type="time_series_classification") - super().__init__(train_tensors=train, val_tensors=val, shuffle=True) - self.cross_validators = get_cross_validators( - CrossValTypes.stratified_k_fold_cross_validation, - CrossValTypes.k_fold_cross_validation, - CrossValTypes.shuffle_split_cross_validation, - CrossValTypes.stratified_shuffle_split_cross_validation - ) - self.holdout_validators = get_holdout_validators( - HoldoutValTypes.holdout_validation, - HoldoutValTypes.stratified_holdout_validation - ) - - -class TimeSeriesRegressionDataset(BaseDataset): - def __init__(self, train: Tuple[np.ndarray, np.ndarray], val: Optional[Tuple[np.ndarray, np.ndarray]] = None): - _check_time_series_inputs(train=train, - val=val, - task_type="time_series_regression") - super().__init__(train_tensors=train, val_tensors=val, shuffle=True) - self.cross_validators = get_cross_validators( - CrossValTypes.k_fold_cross_validation, - CrossValTypes.shuffle_split_cross_validation - ) - self.holdout_validators = get_holdout_validators( - HoldoutValTypes.holdout_validation - ) - - -def _check_time_series_inputs(task_type: str, - train: Union[TIME_SERIES_CLASSIFICATION_INPUT, TIME_SERIES_REGRESSION_INPUT], - val: Optional[ - Union[TIME_SERIES_CLASSIFICATION_INPUT, TIME_SERIES_REGRESSION_INPUT]] = None - ) -> None: - if len(train) != 2: - raise ValueError(f"There must be exactly two training tensors for {task_type}. " - f"The first one containing the data and the second one containing the targets.") - if train[0].ndim != 3: - raise ValueError( - f"The training data for {task_type} has to be a three-dimensional tensor of shape NxSxM.") - if train[1].ndim != 1: - raise ValueError( - f"The training targets for {task_type} have to be of shape N." - ) - if val is not None: - if len(val) != 2: - raise ValueError( - f"There must be exactly two validation tensors for{task_type}. " - f"The first one containing the data and the second one containing the targets.") - if val[0].ndim != 3: - raise ValueError( - f"The validation data for {task_type} has to be a " - f"three-dimensional tensor of shape NxSxM.") - if val[0].ndim != 1: - raise ValueError( - f"The validation targets for {task_type} have to be of shape N." - ) + X: np.ndarray, + Y: Union[np.ndarray, pd.Series], + X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, + Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, + resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + shuffle: Optional[bool] = True, + seed: Optional[int] = 42, + train_transforms: Optional[torchvision.transforms.Compose] = None, + val_transforms: Optional[torchvision.transforms.Compose] = None, + dataset_name: Optional[str] = None, + validator: Optional[BaseInputValidator] = None, + ): + # Take information from the validator, which guarantees clean data for the + # dataset. 
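+        # The validator passed in here is expected to be already fitted (see the API's
+        # search() method); its transform() below yields the clean arrays that are
+        # stored as the train and test tensors.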
+ # TODO: Consider moving the validator to the pipeline itself when we + # move to using the fit_params on scikit learn 0.24 + if validator is None: + raise ValueError("A feature validator is required to build a time series pipeline") + + self.validator = validator + + X, Y = self.validator.transform(X, Y) + if X_test is not None: + X_test, Y_test = self.validator.transform(X_test, Y_test) + + super().__init__(train_tensors=(X, Y), + test_tensors=(X_test, Y_test), + shuffle=shuffle, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + seed=seed, train_transforms=train_transforms, + dataset_name=dataset_name, + val_transforms=val_transforms) + + if self.output_type is not None: + if STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS: + self.task_type = TASK_TYPES_TO_STRING[TIMESERIES_CLASSIFICATION] + elif STRING_TO_OUTPUT_TYPES[self.output_type] in REGRESSION_OUTPUTS: + self.task_type = TASK_TYPES_TO_STRING[TIMESERIES_REGRESSION] + else: + raise ValueError(f"Output type {self.output_type} currently not supported ") + else: + raise ValueError("Task type not currently supported ") + + # filter the default cross and holdout validators if we have a regression task + # since we cannot use stratification there + if self.task_type == TASK_TYPES_TO_STRING[TIMESERIES_REGRESSION]: + self.cross_validators = {cv_type: cv for cv_type, cv in self.cross_validators.items() + if not is_stratified(cv_type)} + self.holdout_validators = {hv_type: hv for hv_type, hv in self.holdout_validators.items() + if not is_stratified(hv_type)} + + self.num_features = self.train_tensors[0].shape[2] + self.numerical_features: List[int] = list(range(self.num_features)) + self.categorical_features: List[int] = [] + + def get_required_dataset_info(self) -> Dict[str, Any]: + """ + Returns a dictionary containing required dataset properties to instantiate a pipeline, + """ + info = super().get_required_dataset_info() + info.update({ + 'task_type': self.task_type, + 'numerical_features': self.numerical_features, + 'categorical_features': self.categorical_features, + + }) + return info diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler_choice.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler_choice.py index 5825db2fa..5c5dce4cd 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler_choice.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler_choice.py @@ -105,10 +105,10 @@ def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None: It recursively makes sure that the children and parent level requirements are honored before fit. 
Args: - dataset_properties: + dataset_properties (Dict[str, Any]): dictionary holding the dataset properties """ super()._check_dataset_properties(dataset_properties) - assert 'numerical_features' in dataset_properties and \ - 'categorical_features' in dataset_properties, \ + assert "numerical_features" in dataset_properties and \ + "categorical_features" in dataset_properties, \ "Dataset properties must contain information about the type of features" diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index 2557e92b8..94cb3471c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -10,6 +10,8 @@ import torch from torch import nn +import torchvision + from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.base_component import ( autoPyTorchComponent, @@ -31,7 +33,7 @@ def __init__(self, FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, dataset_property=False), FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), - FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False)]) + FitRequirement('preprocess_transforms', (Iterable,), user_defined=False, dataset_property=False)]) self.backbone: nn.Module = None self.config = kwargs self.input_shape: Optional[Iterable] = None @@ -53,8 +55,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: input_shape = X_train.shape[1:] else: # get input shape by transforming first two elements of the training set - column_transformer = X['tabular_transformer'].preprocessor - input_shape = column_transformer.transform(X_train[:1]).shape[1:] + transforms = torchvision.transforms.Compose(X['preprocess_transforms']) + input_shape = transforms(X_train[:1, ...]).shape[1:] self.input_shape = input_shape diff --git a/autoPyTorch/utils/pipeline.py b/autoPyTorch/utils/pipeline.py index 3cd0d528f..f2040fc84 100644 --- a/autoPyTorch/utils/pipeline.py +++ b/autoPyTorch/utils/pipeline.py @@ -9,10 +9,12 @@ REGRESSION_TASKS, STRING_TO_TASK_TYPES, TABULAR_TASKS, + TIMESERIES_TASKS, ) from autoPyTorch.pipeline.image_classification import ImageClassificationPipeline from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline +from autoPyTorch.pipeline.time_series_classification import TimeSeriesClassificationPipeline from autoPyTorch.utils.common import FitRequirement from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -79,20 +81,32 @@ def _get_regression_dataset_requirements(info: Dict[str, Any], include: Dict[str raise ValueError("Task_type not supported") -def _get_classification_dataset_requirements(info: Dict[str, Any], include: Dict[str, List[str]], +def _get_classification_dataset_requirements(info: Dict[str, Any], + include: Dict[str, List[str]], exclude: Dict[str, List[str]]) -> List[FitRequirement]: task_type = STRING_TO_TASK_TYPES[info['task_type']] if task_type in TABULAR_TASKS: return TabularClassificationPipeline( dataset_properties=info, - include=include, exclude=exclude).\ - get_dataset_requirements() + include=include, + exclude=exclude + ).get_dataset_requirements() + + elif task_type 
in TIMESERIES_TASKS: + return TimeSeriesClassificationPipeline( + dataset_properties=info, + include=include, + exclude=exclude, + ).get_dataset_requirements() + elif task_type in IMAGE_TASKS: return ImageClassificationPipeline( dataset_properties=info, - include=include, exclude=exclude).\ - get_dataset_requirements() + include=include, + exclude=exclude + ).get_dataset_requirements() + else: raise ValueError("Task_type not supported") @@ -143,11 +157,18 @@ def _get_classification_configuration_space(info: Dict[str, Any], include: Dict[ include=include, exclude=exclude, search_space_updates=search_space_updates) return pipeline.get_hyperparameter_search_space() + + elif STRING_TO_TASK_TYPES[info['task_type']] in TIMESERIES_TASKS: + pipeline = TimeSeriesClassificationPipeline(dataset_properties=info, + include=include, exclude=exclude, + search_space_updates=search_space_updates) + return pipeline.get_hyperparameter_search_space() + elif STRING_TO_TASK_TYPES[info['task_type']] in IMAGE_TASKS: return ImageClassificationPipeline( dataset_properties=info, include=include, exclude=exclude, - search_space_updates=search_space_updates).\ + search_space_updates=search_space_updates). \ get_hyperparameter_search_space() else: raise ValueError("Task_type not supported") diff --git a/examples/example_tabular_regression.py b/examples/example_tabular_regression.py index 43c901827..5b0f6d700 100644 --- a/examples/example_tabular_regression.py +++ b/examples/example_tabular_regression.py @@ -99,8 +99,7 @@ def get_search_space_updates(): y_test=y_test_scaled.copy(), optimize_metric='r2', total_walltime_limit=500, - func_eval_time_limit=50, - traditional_per_total_budget=0 + func_eval_time_limit=50 ) ############################################################################ diff --git a/examples/example_time_series_classification.py b/examples/example_time_series_classification.py new file mode 100644 index 000000000..8710d98de --- /dev/null +++ b/examples/example_time_series_classification.py @@ -0,0 +1,101 @@ +""" +====================== +Time Series Classification +====================== + +The following example shows how to fit a sample classification model +with AutoPyTorch +""" +import os +import tempfile as tmp +import warnings + +os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + +import numpy as np + +import sklearn.model_selection + +from sktime.datasets import load_gunpoint + +from autoPyTorch.api.time_series_classification import TimeSeriesClassificationTask +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + + +def get_search_space_updates(): + """ + Search space updates to the task can be added using HyperparameterSearchSpaceUpdates + Returns: + HyperparameterSearchSpaceUpdates + """ + updates = HyperparameterSearchSpaceUpdates() + updates.append(node_name="data_loader", + hyperparameter="batch_size", + value_range=[16, 512], + default_value=32) + updates.append(node_name="lr_scheduler", + hyperparameter="CosineAnnealingLR:T_max", + value_range=[50, 60], + default_value=55) + updates.append(node_name='network_backbone', + hyperparameter='ResNetBackbone:dropout', + value_range=[0, 0.5], + default_value=0.2) + return updates + + +if __name__ == '__main__': + 
############################################################################ + # Data Loading + # ============ + X, y = load_gunpoint(return_X_y=True) + + # Convert the pandas dataframes returned from load_gunpoint to 3D numpy array since that is + # the format AutoPyTorch expects for now + X = [X.iloc[i][0].values for i in range(len(X))] + y = [int(y.iloc[i]) for i in range(len(y))] + X = np.vstack(X) + + # Expand the last dimension because time series data has to be of shape [B, T, F] + # where B is the batch size, T is the time dimension and F are the number of features per time step + X = X[..., np.newaxis] + + # Subtract one from the labels because they are initially in {1, 2}, but are expected to be in {0, 1} + y = np.array(y) - 1 + + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=1, + ) + + ############################################################################ + # Build and fit a classifier + # ========================== + api = TimeSeriesClassificationTask( + delete_tmp_folder_after_terminate=False, + search_space_updates=get_search_space_updates() + ) + api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + total_walltime_limit=500, + func_eval_time_limit=50 + ) + + ############################################################################ + # Print the final ensemble performance + # ==================================== + print(api.run_history, api.trajectory) + y_pred = api.predict(X_test) + score = api.score(y_pred, y_test) + print(score) From 349685a78731888fc26a6ae462de10507d1727a9 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Mon, 22 Feb 2021 15:07:29 +0100 Subject: [PATCH 004/347] added time series pipeline tests --- test/conftest.py | 88 +++-- .../test_time_series_classification.py | 317 ++++++++++++++++++ 2 files changed, 384 insertions(+), 21 deletions(-) create mode 100644 test/test_pipeline/test_time_series_classification.py diff --git a/test/conftest.py b/test/conftest.py index a5d0fe0af..9e2f3f1c8 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -15,7 +15,9 @@ from sklearn.datasets import fetch_openml, make_classification, make_regression from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.data.time_series_validator import TimeSeriesInputValidator from autoPyTorch.datasets.tabular_dataset import TabularDataset +from autoPyTorch.datasets.time_series_dataset import TimeSeriesDataset from autoPyTorch.utils.backend import create from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.pipeline import get_dataset_requirements @@ -226,13 +228,32 @@ def get_tabular_data(task): return X, y, validator -def get_fit_dictionary(X, y, validator, backend): - datamanager = TabularDataset( - X=X, Y=y, - validator=validator, - X_test=X, Y_test=y, - ) +def get_time_series_data(task): + sin_wave = np.sin(np.arange(30)) + cos_wave = np.cos(np.arange(30)) + sin_waves = [] + cos_waves = [] + # create a dummy dataset with 100 sin and 100 cosine waves + for i in range(100): + # add some random noise so not every sample is equal + sin_waves.append(sin_wave + np.random.randn(30) * 0.1) + cos_waves.append(cos_wave + np.random.randn(30) * 0.1) + sin_waves = np.stack(sin_waves)[..., np.newaxis] + cos_waves = np.stack(cos_waves)[..., np.newaxis] + + if task == "classification_numerical_only": + X = np.concatenate([sin_waves, cos_waves]) + y = 
np.array([0] * len(sin_waves) + [1] * len(cos_waves)) + + validator = TimeSeriesInputValidator(is_classification=True).fit(X.copy(), y.copy()) + + else: + raise ValueError("Unsupported task {}".format(task)) + + return X, y, validator + +def get_fit_dictionary(datamanager, backend): info = datamanager.get_required_dataset_info() dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info)) @@ -260,6 +281,24 @@ def get_fit_dictionary(X, y, validator, backend): return fit_dictionary +def get_tabular_fit_dictionary(X, y, validator, backend): + datamanager = TabularDataset( + X=X, Y=y, + validator=validator, + X_test=X, Y_test=y, + ) + return get_fit_dictionary(datamanager, backend) + + +def get_time_series_fit_dictionary(X, y, validator, backend): + datamanager = TimeSeriesDataset( + X=X, Y=y, + validator=validator, + X_test=X, Y_test=y, + ) + return get_fit_dictionary(datamanager, backend) + + @pytest.fixture def fit_dictionary_tabular_dummy(request, backend): if request.param == "classification": @@ -267,14 +306,29 @@ def fit_dictionary_tabular_dummy(request, backend): elif request.param == "regression": X, y, validator = get_tabular_data("regression_numerical_only") else: - raise ValueError("Unsupported indirect fixture {}".format(request.param)) - return get_fit_dictionary(X, y, validator, backend) + raise ValueError(f"Unsupported indirect fixture {request.param}") + return get_tabular_fit_dictionary(X, y, validator, backend) + + +@pytest.fixture +def fit_dictionary_time_series_dummy(request, backend): + if request.param == "classification": + X, y, validator = get_time_series_data("classification_numerical_only") + else: + raise ValueError(f"Unsupported indirect fixture {request.param}") + return get_time_series_fit_dictionary(X, y, validator, backend) @pytest.fixture def fit_dictionary_tabular(request, backend): X, y, validator = get_tabular_data(request.param) - return get_fit_dictionary(X, y, validator, backend) + return get_tabular_fit_dictionary(X, y, validator, backend) + + +@pytest.fixture +def fit_dictionary_time_series(request, backend): + X, y, validator = get_time_series_data(request.param) + return get_time_series_fit_dictionary(X, y, validator, backend) @pytest.fixture @@ -318,10 +372,6 @@ def dataset_traditional_classifier_num_categorical(): @pytest.fixture def search_space_updates(): updates = HyperparameterSearchSpaceUpdates() - updates.append(node_name="imputer", - hyperparameter="numerical_strategy", - value_range=("mean", "most_frequent"), - default_value="mean") updates.append(node_name="data_loader", hyperparameter="batch_size", value_range=[16, 512], @@ -330,20 +380,16 @@ def search_space_updates(): hyperparameter="CosineAnnealingLR:T_max", value_range=[50, 60], default_value=55) - updates.append(node_name='network_backbone', - hyperparameter='ResNetBackbone:dropout', - value_range=[0, 0.5], - default_value=0.2) + updates.append(node_name="optimizer", + hyperparameter="AdamOptimizer:lr", + value_range=[0.0001, 0.001], + default_value=0.001) return updates @pytest.fixture def error_search_space_updates(): updates = HyperparameterSearchSpaceUpdates() - updates.append(node_name="imputer", - hyperparameter="num_str", - value_range=("mean", "most_frequent"), - default_value="mean") updates.append(node_name="data_loader", hyperparameter="batch_size", value_range=[16, 512], diff --git a/test/test_pipeline/test_time_series_classification.py b/test/test_pipeline/test_time_series_classification.py new file mode 100644 index 000000000..2a8fee10f --- 
/dev/null +++ b/test/test_pipeline/test_time_series_classification.py @@ -0,0 +1,317 @@ +import os +import re + +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, +) + +import numpy as np + +import pytest + +import torch + +from autoPyTorch import metrics +from autoPyTorch.pipeline.components.setup.early_preprocessor.utils import get_preprocess_transforms +from autoPyTorch.pipeline.time_series_classification import TimeSeriesClassificationPipeline +from autoPyTorch.utils.common import FitRequirement +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates, \ + parse_hyperparameter_search_space_updates + + +@pytest.mark.parametrize("fit_dictionary_time_series", ['classification_numerical_only'], indirect=True) +class TestTimeSeriesClassification: + def _assert_pipeline_search_space(self, pipeline, search_space_updates): + config_space = pipeline.get_hyperparameter_search_space() + for update in search_space_updates.updates: + try: + assert update.node_name + ':' + update.hyperparameter in config_space + hyperparameter = config_space.get_hyperparameter(update.node_name + ':' + update.hyperparameter) + except AssertionError: + assert any(update.node_name + ':' + update.hyperparameter in name + for name in config_space.get_hyperparameter_names()), \ + "Can't find hyperparameter: {}".format(update.hyperparameter) + hyperparameter = config_space.get_hyperparameter(update.node_name + ':' + update.hyperparameter + '_1') + assert update.default_value == hyperparameter.default_value + if isinstance(hyperparameter, (UniformIntegerHyperparameter, UniformFloatHyperparameter)): + assert update.value_range[0] == hyperparameter.lower + assert update.value_range[1] == hyperparameter.upper + if hasattr(update, 'log'): + assert update.log == hyperparameter.log + elif isinstance(hyperparameter, CategoricalHyperparameter): + assert update.value_range == hyperparameter.choices + + def test_pipeline_fit(self, fit_dictionary_time_series): + """This test makes sure that the pipeline is able to fit + given random combinations of hyperparameters across the pipeline""" + + pipeline = TimeSeriesClassificationPipeline( + dataset_properties=fit_dictionary_time_series['dataset_properties']) + cs = pipeline.get_hyperparameter_search_space() + config = cs.sample_configuration() + pipeline.set_hyperparameters(config) + pipeline.fit(fit_dictionary_time_series) + + # To make sure we fitted the model, there should be a + # run summary object with accuracy + run_summary = pipeline.named_steps['trainer'].run_summary + assert run_summary is not None + + # Make sure that performance was properly captured + assert run_summary.performance_tracker['train_loss'][1] > 0 + assert run_summary.total_parameter_count > 0 + assert 'accuracy' in run_summary.performance_tracker['train_metrics'][1] + + # Make sure a network was fit + assert isinstance(pipeline.named_steps['network'].get_network(), torch.nn.Module) + + @pytest.mark.parametrize("fit_dictionary_time_series_dummy", ["classification"], indirect=True) + def test_pipeline_score(self, fit_dictionary_time_series_dummy, fit_dictionary_time_series): + """This test makes sure that the pipeline is able to achieve a decent score on dummy data + given the default configuration""" + X = fit_dictionary_time_series_dummy['X_train'].copy() + y = fit_dictionary_time_series_dummy['y_train'].copy() + pipeline = TimeSeriesClassificationPipeline( + 
dataset_properties=fit_dictionary_time_series_dummy['dataset_properties']) + + cs = pipeline.get_hyperparameter_search_space() + config = cs.get_default_configuration() + pipeline.set_hyperparameters(config) + + pipeline.fit(fit_dictionary_time_series_dummy) + + # we expect the output to have the same batch size as the test input, + # and number of outputs per batch sample equal to the number of classes ("num_classes" in dataset_properties) + expected_output_shape = (X.shape[0], + fit_dictionary_time_series_dummy["dataset_properties"]["output_shape"]) + + prediction = pipeline.predict(X) + assert isinstance(prediction, np.ndarray) + assert prediction.shape == expected_output_shape + + # we should be able to get a decent score on this dummy data + accuracy = metrics.accuracy(y, prediction.squeeze()) + assert accuracy >= 0.8 + + def test_pipeline_predict(self, fit_dictionary_time_series): + """This test makes sure that the pipeline is able to predict + given a random configuration""" + X = fit_dictionary_time_series['X_train'].copy() + pipeline = TimeSeriesClassificationPipeline( + dataset_properties=fit_dictionary_time_series['dataset_properties']) + + cs = pipeline.get_hyperparameter_search_space() + config = cs.sample_configuration() + pipeline.set_hyperparameters(config) + + pipeline.fit(fit_dictionary_time_series) + + # we expect the output to have the same batch size as the test input, + # and number of outputs per batch sample equal to the number of outputs + expected_output_shape = (X.shape[0], fit_dictionary_time_series["dataset_properties"]["output_shape"]) + + prediction = pipeline.predict(X) + assert isinstance(prediction, np.ndarray) + assert prediction.shape == expected_output_shape + + def test_pipeline_predict_proba(self, fit_dictionary_time_series): + """This test makes sure that the pipeline is able to fit + given random combinations of hyperparameters across the pipeline + And then predict using predict probability + """ + X = fit_dictionary_time_series['X_train'].copy() + pipeline = TimeSeriesClassificationPipeline( + dataset_properties=fit_dictionary_time_series['dataset_properties']) + + cs = pipeline.get_hyperparameter_search_space() + config = cs.sample_configuration() + pipeline.set_hyperparameters(config) + + pipeline.fit(fit_dictionary_time_series) + + # we expect the output to have the same batch size as the test input, + # and number of outputs per batch sample equal to the number of classes ("num_classes" in dataset_properties) + expected_output_shape = (X.shape[0], fit_dictionary_time_series["dataset_properties"]["output_shape"]) + + prediction = pipeline.predict_proba(X) + assert isinstance(prediction, np.ndarray) + assert prediction.shape == expected_output_shape + + def test_pipeline_transform(self, fit_dictionary_time_series): + """ + In the context of autopytorch, transform expands a fit dictionary with + components that where previously fit. We can use this as a nice way to make sure + that fit properly work. 
+ This code is added in light of components not properly added to the fit dicitonary + """ + + pipeline = TimeSeriesClassificationPipeline( + dataset_properties=fit_dictionary_time_series['dataset_properties']) + cs = pipeline.get_hyperparameter_search_space() + config = cs.sample_configuration() + pipeline.set_hyperparameters(config) + + # We do not want to make the same early preprocessing operation to the fit dictionary + pipeline.fit(fit_dictionary_time_series.copy()) + + transformed_fit_dictionary_time_series = pipeline.transform(fit_dictionary_time_series) + + # First, we do not lose anyone! (We use a fancy subset containment check) + assert fit_dictionary_time_series.items() <= transformed_fit_dictionary_time_series.items() + + # Then the pipeline should have added the following keys + expected_keys = {'scaler', 'time_series_transformer', + 'preprocess_transforms', 'network', 'optimizer', 'lr_scheduler', + 'train_data_loader', 'val_data_loader', 'run_summary'} + assert expected_keys.issubset(set(transformed_fit_dictionary_time_series.keys())) + + # Then we need to have transformations being created. + assert len(get_preprocess_transforms(transformed_fit_dictionary_time_series)) > 0 + + # We expect the transformations to be in the pipeline at anytime for inference + assert 'preprocess_transforms' in transformed_fit_dictionary_time_series.keys() + + @pytest.mark.parametrize("is_small_preprocess", [True, False]) + def test_default_configuration(self, fit_dictionary_time_series, is_small_preprocess): + """Makes sure that when no config is set, we can trust the + default configuration from the space""" + + fit_dictionary_time_series['is_small_preprocess'] = is_small_preprocess + + pipeline = TimeSeriesClassificationPipeline( + dataset_properties=fit_dictionary_time_series['dataset_properties']) + + pipeline.fit(fit_dictionary_time_series) + + def test_remove_key_check_requirements(self, fit_dictionary_time_series): + """Makes sure that when a key is removed from X, correct error is outputted""" + pipeline = TimeSeriesClassificationPipeline( + dataset_properties=fit_dictionary_time_series['dataset_properties']) + for key in ['num_run', 'device', 'split_id', 'use_pynisher', 'torch_num_threads', 'dataset_properties']: + fit_dictionary_time_series_copy = fit_dictionary_time_series.copy() + fit_dictionary_time_series_copy.pop(key) + with pytest.raises(ValueError, match=r"To fit .+?, expected fit dictionary to have"): + pipeline.fit(fit_dictionary_time_series_copy) + + def test_network_optimizer_lr_handshake(self, fit_dictionary_time_series): + """Fitting a network should put the network in the X""" + # Create the pipeline to check. 
A random config should be sufficient + pipeline = TimeSeriesClassificationPipeline( + dataset_properties=fit_dictionary_time_series['dataset_properties']) + cs = pipeline.get_hyperparameter_search_space() + config = cs.sample_configuration() + pipeline.set_hyperparameters(config) + + # Make sure that fitting a network adds a "network" to X + assert 'network' in pipeline.named_steps.keys() + fit_dictionary_time_series['network_backbone'] = torch.nn.Linear(3, 4) + fit_dictionary_time_series['network_head'] = torch.nn.Linear(4, 1) + X = pipeline.named_steps['network'].fit( + fit_dictionary_time_series, + None + ).transform(fit_dictionary_time_series) + assert 'network' in X + + # Then fitting a optimizer should fail if no network: + assert 'optimizer' in pipeline.named_steps.keys() + with pytest.raises( + ValueError, + match=r"To fit .+?, expected fit dictionary to have 'network' but got .*" + ): + pipeline.named_steps['optimizer'].fit({'dataset_properties': {}}, None) + + # No error when network is passed + X = pipeline.named_steps['optimizer'].fit(X, None).transform(X) + assert 'optimizer' in X + + # Then fitting a optimizer should fail if no network: + assert 'lr_scheduler' in pipeline.named_steps.keys() + with pytest.raises( + ValueError, + match=r"To fit .+?, expected fit dictionary to have 'optimizer' but got .*" + ): + pipeline.named_steps['lr_scheduler'].fit({'dataset_properties': {}}, None) + + # No error when network is passed + X = pipeline.named_steps['lr_scheduler'].fit(X, None).transform(X) + assert 'optimizer' in X + + def test_get_fit_requirements(self, fit_dictionary_time_series): + dataset_properties = {'numerical_features': [0], 'categorical_features': [], + 'task_type': 'time_series_classification'} + pipeline = TimeSeriesClassificationPipeline(dataset_properties=dataset_properties) + fit_requirements = pipeline.get_fit_requirements() + + # check if fit requirements is a list of FitRequirement named tuples + assert isinstance(fit_requirements, list) + for requirement in fit_requirements: + assert isinstance(requirement, FitRequirement) + + def test_apply_search_space_updates(self, fit_dictionary_time_series, search_space_updates): + dataset_properties = {'numerical_features': [0], 'categorical_features': [], + 'task_type': 'time_series_classification'} + pipeline = TimeSeriesClassificationPipeline(dataset_properties=dataset_properties, + search_space_updates=search_space_updates) + self._assert_pipeline_search_space(pipeline, search_space_updates) + + def test_read_and_update_search_space(self, fit_dictionary_time_series, search_space_updates): + import tempfile + path = tempfile.gettempdir() + path = os.path.join(path, 'updates.txt') + # Write to disk + search_space_updates.save_as_file(path=path) + assert os.path.exists(path=path) + + # Read from disk + file_search_space_updates = parse_hyperparameter_search_space_updates(updates_file=path) + assert isinstance(file_search_space_updates, HyperparameterSearchSpaceUpdates) + dataset_properties = {'numerical_features': [1], 'categorical_features': [2], + 'task_type': 'time_series_classification'} + pipeline = TimeSeriesClassificationPipeline(dataset_properties=dataset_properties, + search_space_updates=file_search_space_updates) + assert file_search_space_updates == pipeline.search_space_updates + + def test_error_search_space_updates(self, fit_dictionary_time_series, error_search_space_updates): + dataset_properties = {'numerical_features': [1], 'categorical_features': [2], + 'task_type': 'time_series_classification'} + 
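+        # Constructing the pipeline with an unknown hyperparameter name should raise a ValueError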
try: + _ = TimeSeriesClassificationPipeline(dataset_properties=dataset_properties, + search_space_updates=error_search_space_updates) + except Exception as e: + assert isinstance(e, ValueError) + assert re.match(r'Unknown hyperparameter for component .*?\. Expected update ' + r'hyperparameter to be in \[.*?\] got .+', e.args[0]) + + def test_set_range_search_space_updates(self, fit_dictionary_time_series): + dataset_properties = {'numerical_features': [1], 'categorical_features': [2], + 'task_type': 'time_series_classification'} + config_dict = TimeSeriesClassificationPipeline(dataset_properties=dataset_properties). \ + get_hyperparameter_search_space()._hyperparameters + updates = HyperparameterSearchSpaceUpdates() + for i, (name, hyperparameter) in enumerate(config_dict.items()): + if '__choice__' in name: + continue + name = name.split(':') + hyperparameter_name = ':'.join(name[1:]) + if "network" in name[0]: + continue + if isinstance(hyperparameter, CategoricalHyperparameter): + value_range = (hyperparameter.choices[0],) + default_value = hyperparameter.choices[0] + else: + value_range = (0, 1) + default_value = 1 + print(name, hyperparameter_name) + updates.append(node_name=name[0], hyperparameter=hyperparameter_name, + value_range=value_range, default_value=default_value) + pipeline = TimeSeriesClassificationPipeline(dataset_properties=dataset_properties, + search_space_updates=updates) + + try: + self._assert_pipeline_search_space(pipeline, updates) + except AssertionError as e: + # As we are setting num_layers to 1 for fully connected + # head, units_layer does not exist in the configspace + assert 'fully_connected:units_layer' in e.args[0] From d23dcdedf042e21a8348f1ba5acc2e6df270445f Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Mon, 22 Feb 2021 15:19:15 +0100 Subject: [PATCH 005/347] fixed abstract evaluator to work with general datamanagers --- autoPyTorch/datasets/time_series_dataset.py | 6 ++++-- autoPyTorch/evaluation/abstract_evaluator.py | 10 ++-------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 50d8fc8e8..2f9c29a4f 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -6,8 +6,8 @@ import torchvision.transforms -from autoPyTorch.constants import CLASSIFICATION_OUTPUTS, REGRESSION_OUTPUTS, STRING_TO_OUTPUT_TYPES, \ - TASK_TYPES_TO_STRING, TIMESERIES_CLASSIFICATION, TIMESERIES_REGRESSION +from autoPyTorch.constants import CLASSIFICATION_OUTPUTS, CLASSIFICATION_TASKS, REGRESSION_OUTPUTS, \ + STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, TASK_TYPES_TO_STRING, TIMESERIES_CLASSIFICATION, TIMESERIES_REGRESSION from autoPyTorch.data.base_validator import BaseInputValidator from autoPyTorch.datasets.base_dataset import BaseDataset from autoPyTorch.datasets.resampling_strategy import ( @@ -185,6 +185,8 @@ def __init__(self, raise ValueError(f"Output type {self.output_type} currently not supported ") else: raise ValueError("Task type not currently supported ") + if STRING_TO_TASK_TYPES[self.task_type] in CLASSIFICATION_TASKS: + self.num_classes: int = len(np.unique(self.train_tensors[1])) # filter the default cross and holdout validators if we have a regression task # since we cannot use stratification there diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index db5caf72b..df11b1593 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ 
b/autoPyTorch/evaluation/abstract_evaluator.py @@ -30,7 +30,6 @@ TABULAR_TASKS, ) from autoPyTorch.datasets.base_dataset import BaseDataset -from autoPyTorch.datasets.tabular_dataset import TabularDataset from autoPyTorch.evaluation.utils import ( VotingRegressorWrapper, convert_multioutput_multiclass_to_multilabel @@ -251,9 +250,7 @@ def __init__(self, backend: Backend, raise ValueError('disable_file_output should be either a bool or a list') self.pipeline_class: Optional[Union[BaseEstimator, BasePipeline]] = None - info: Dict[str, Any] = {'task_type': self.datamanager.task_type, - 'output_type': self.datamanager.output_type, - 'issparse': self.issparse} + info = self.datamanager.get_required_dataset_info() if self.task_type in REGRESSION_TASKS: if isinstance(self.configuration, int): self.pipeline_class = DummyClassificationPipeline @@ -282,10 +279,7 @@ def __init__(self, backend: Backend, else: raise ValueError('task {} not available'.format(self.task_type)) self.predict_function = self._predict_proba - if self.task_type in TABULAR_TASKS: - assert isinstance(self.datamanager, TabularDataset) - info.update({'numerical_columns': self.datamanager.numerical_columns, - 'categorical_columns': self.datamanager.categorical_columns}) + self.dataset_properties = self.datamanager.get_dataset_properties(get_dataset_requirements(info)) self.additional_metrics: Optional[List[autoPyTorchMetric]] = None From b1edf2d09fa0bfadb1c5393e698d143205ed8831 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Mon, 22 Feb 2021 15:19:30 +0100 Subject: [PATCH 006/347] remove print in time series pipeline test --- test/test_pipeline/test_time_series_classification.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_pipeline/test_time_series_classification.py b/test/test_pipeline/test_time_series_classification.py index 2a8fee10f..6098f2dc7 100644 --- a/test/test_pipeline/test_time_series_classification.py +++ b/test/test_pipeline/test_time_series_classification.py @@ -303,7 +303,6 @@ def test_set_range_search_space_updates(self, fit_dictionary_time_series): else: value_range = (0, 1) default_value = 1 - print(name, hyperparameter_name) updates.append(node_name=name[0], hyperparameter=hyperparameter_name, value_range=value_range, default_value=default_value) pipeline = TimeSeriesClassificationPipeline(dataset_properties=dataset_properties, From a63c0e0efde1c766f3e57fc40a91ad43c5e9a7dc Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Wed, 24 Feb 2021 09:42:14 +0100 Subject: [PATCH 007/347] fix scaler imports, abstract evaluator and time series classification example --- autoPyTorch/evaluation/abstract_evaluator.py | 6 +++++- .../time_series_preprocessing/scaling/MaxAbsScaler.py | 2 +- .../time_series_preprocessing/scaling/NoScaler.py | 2 +- .../scaling/StandardScaler.py | 2 +- examples/example_time_series_classification.py | 11 +++++++---- 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index df11b1593..d5ed07838 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -19,6 +19,7 @@ import autoPyTorch.pipeline.image_classification import autoPyTorch.pipeline.tabular_classification import autoPyTorch.pipeline.tabular_regression +import autoPyTorch.pipeline.time_series_classification import autoPyTorch.pipeline.traditional_tabular_classification from autoPyTorch.constants import ( CLASSIFICATION_TASKS, @@ -27,7 +28,7 @@ REGRESSION_TASKS, 
STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, - TABULAR_TASKS, + TABULAR_TASKS, TIMESERIES_TASKS, ) from autoPyTorch.datasets.base_dataset import BaseDataset from autoPyTorch.evaluation.utils import ( @@ -276,6 +277,9 @@ def __init__(self, backend: Backend, self.pipeline_class = autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline elif self.task_type in IMAGE_TASKS: self.pipeline_class = autoPyTorch.pipeline.image_classification.ImageClassificationPipeline + elif self.task_type in TIMESERIES_TASKS: + self.pipeline_class = \ + autoPyTorch.pipeline.time_series_classification.TimeSeriesClassificationPipeline else: raise ValueError('task {} not available'.format(self.task_type)) self.predict_function = self._predict_proba diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py index 9b384b76a..4818e20b4 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py @@ -2,7 +2,7 @@ import numpy as np -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import TimeSeriesScaler diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py index ebef7a79b..903be8110 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py @@ -2,7 +2,7 @@ import numpy as np -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler class NoScaler(BaseScaler): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py index 7404a6f62..5d73b880f 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py @@ -2,7 +2,7 @@ import numpy as np -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import TimeSeriesScaler diff --git a/examples/example_time_series_classification.py b/examples/example_time_series_classification.py index 8710d98de..78d630c72 100644 --- a/examples/example_time_series_classification.py +++ b/examples/example_time_series_classification.py @@ -10,6 +10,8 @@ import tempfile as tmp import warnings +from autoPyTorch import metrics + os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() os.environ['OMP_NUM_THREADS'] = '1' 
os.environ['OPENBLAS_NUM_THREADS'] = '1' @@ -43,10 +45,10 @@ def get_search_space_updates(): hyperparameter="CosineAnnealingLR:T_max", value_range=[50, 60], default_value=55) - updates.append(node_name='network_backbone', - hyperparameter='ResNetBackbone:dropout', - value_range=[0, 0.5], - default_value=0.2) + updates.append(node_name='optimizer', + hyperparameter='AdamOptimizer:lr', + value_range=[0.0001, 0.001], + default_value=0.0005) return updates @@ -73,6 +75,7 @@ def get_search_space_updates(): X, y, random_state=1, + stratify=y ) ############################################################################ From fc494d4054f16cf35c997c762e6babae477fe402 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Wed, 24 Feb 2021 11:15:52 +0100 Subject: [PATCH 008/347] adding some time series tests --- .../test_datasets/test_time_series_dataset.py | 38 +++++ test/test_pipeline/components/base.py | 34 +++- .../test_time_series_scaler_choice.py | 47 ++++++ .../components/test_time_series_scalers.py | 155 ++++++++++++++++++ .../test_time_series_transformer.py | 30 ++++ 5 files changed, 302 insertions(+), 2 deletions(-) create mode 100644 test/test_datasets/test_time_series_dataset.py create mode 100644 test/test_pipeline/components/test_time_series_scaler_choice.py create mode 100644 test/test_pipeline/components/test_time_series_scalers.py create mode 100644 test/test_pipeline/components/test_time_series_transformer.py diff --git a/test/test_datasets/test_time_series_dataset.py b/test/test_datasets/test_time_series_dataset.py new file mode 100644 index 000000000..c61cc2c76 --- /dev/null +++ b/test/test_datasets/test_time_series_dataset.py @@ -0,0 +1,38 @@ +import pytest + +from autoPyTorch.utils.pipeline import get_dataset_requirements + + +@pytest.mark.parametrize("fit_dictionary_time_series", ['classification_numerical_only'], indirect=True) +def test_get_dataset_properties(backend, fit_dictionary_time_series): + # The fixture creates a datamanager by itself + datamanager = backend.load_datamanager() + + info = {'task_type': datamanager.task_type, + 'output_type': datamanager.output_type, + 'issparse': datamanager.issparse, + 'numerical_features': datamanager.numerical_features, + 'categorical_features': datamanager.categorical_features} + dataset_requirements = get_dataset_requirements(info) + + dataset_properties = datamanager.get_dataset_properties(dataset_requirements) + for expected in [ + 'categorical_features', + 'numerical_features', + 'issparse', + 'is_small_preprocess', + 'task_type', + 'output_type', + 'input_shape', + 'output_shape' + ]: + assert expected in dataset_properties + + assert isinstance(dataset_properties, dict) + for dataset_requirement in dataset_requirements: + assert dataset_requirement.name in dataset_properties.keys() + assert isinstance(dataset_properties[dataset_requirement.name], dataset_requirement.supported_types) + + assert datamanager.train_tensors[0].shape == fit_dictionary_time_series['X_train'].shape + assert datamanager.train_tensors[1].shape == fit_dictionary_time_series['y_train'].shape + assert datamanager.task_type == 'time_series_classification' diff --git a/test/test_pipeline/components/base.py b/test/test_pipeline/components/base.py index 8adbbd48a..697e5dcad 100644 --- a/test/test_pipeline/components/base.py +++ b/test/test_pipeline/components/base.py @@ -14,10 +14,16 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder_choice import \ EncoderChoice from 
autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler_choice import ScalerChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler_choice import \ + ScalerChoice as TabularScalerChoice +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ + TimeSeriesTransformer +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ + ScalerChoice as TimeSeriesScalerChoice from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline +from autoPyTorch.pipeline.time_series_classification import TimeSeriesClassificationPipeline class BaseTraining(unittest.TestCase): @@ -144,7 +150,31 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], steps.extend([ ("imputer", SimpleImputer()), ("encoder", EncoderChoice(default_dataset_properties)), - ("scaler", ScalerChoice(default_dataset_properties)), + ("scaler", TabularScalerChoice(default_dataset_properties)), ("tabular_transformer", TabularColumnTransformer()), ]) return steps + + +class TimeSeriesPipeline(TimeSeriesClassificationPipeline): + def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], + ) -> List[Tuple[str, autoPyTorchChoice]]: + """ + Defines what steps a pipeline should follow. + The step itself has choices given via autoPyTorchChoice. + + Returns: + List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised + by the pipeline. + """ + steps = [] # type: List[Tuple[str, autoPyTorchChoice]] + + default_dataset_properties = {'target_type': 'time_series_classification'} + if dataset_properties is not None: + default_dataset_properties.update(dataset_properties) + + steps.extend([ + ("scaler", TimeSeriesScalerChoice(default_dataset_properties)), + ("time_series_transformer", TimeSeriesTransformer()), + ]) + return steps diff --git a/test/test_pipeline/components/test_time_series_scaler_choice.py b/test/test_pipeline/components/test_time_series_scaler_choice.py new file mode 100644 index 000000000..d59154ed3 --- /dev/null +++ b/test/test_pipeline/components/test_time_series_scaler_choice.py @@ -0,0 +1,47 @@ +import copy +import unittest + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ + ScalerChoice + + +class TestTimeSeriesScalerChoice(unittest.TestCase): + + def test_get_set_config_space(self): + """Make sure that we can setup a valid choice for the time series scaler""" + dataset_properties = {'categorical_features': [], + 'numerical_features': list(range(4)), + 'issparse': False} + scaler_choice = ScalerChoice(dataset_properties) + cs = scaler_choice.get_hyperparameter_search_space() + + # Make sure that all hyperparameters are part of the search space + self.assertListEqual( + sorted(cs.get_hyperparameter('__choice__').choices), + sorted(list(scaler_choice.get_components().keys())) + ) + + # Make sure we can properly set some random configs + # Whereas just one iteration will make sure the algorithm works, + # doing five iterations increase the confidence. 
We will be able to + # catch component specific crashes + for i in range(5): + config = cs.sample_configuration() + config_dict = copy.deepcopy(config.get_dictionary()) + scaler_choice.set_hyperparameters(config) + + self.assertEqual(scaler_choice.choice.__class__, + scaler_choice.get_components()[config_dict['__choice__']]) + + # Then check the choice configuration + selected_choice = config_dict.pop('__choice__', None) + for key, value in config_dict.items(): + # Remove the selected_choice string from the parameter + # so we can query in the object for it + key = key.replace(selected_choice + ':', '') + self.assertIn(key, vars(scaler_choice.choice)) + self.assertEqual(value, scaler_choice.choice.__dict__[key]) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_pipeline/components/test_time_series_scalers.py b/test/test_pipeline/components/test_time_series_scalers.py new file mode 100644 index 000000000..f03c43a86 --- /dev/null +++ b/test/test_pipeline/components/test_time_series_scalers.py @@ -0,0 +1,155 @@ +import unittest + +import numpy as np +from numpy.testing import assert_allclose + +from sklearn.base import BaseEstimator +from sklearn.compose import make_column_transformer + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.MinMaxScaler import MinMaxScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.MaxAbsScaler import MaxAbsScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.StandardScaler import \ + StandardScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.NoScaler import NoScaler + + +class TestMinMaxScaler(unittest.TestCase): + + def test_minmax_scaler(self): + data = np.array([ + [[1], [2], [3]], + [[7], [8], [9]], + [[10], [11], [12]] + ]) + + dataset_properties = {'categorical_features': [], + 'numerical_features': [0]} + + X = { + 'X_train': data, + 'dataset_properties': dataset_properties + } + scaler_component = MinMaxScaler() + + scaler_component = scaler_component.fit(X) + X = scaler_component.transform(X) + scaler = X['scaler']['numerical'] + + # check if the fit dictionary X is modified as expected + self.assertIsInstance(X['scaler'], dict) + self.assertIsInstance(scaler, BaseEstimator) + self.assertIsNone(X['scaler']['categorical']) + + # make column transformer with returned encoder to fit on data + scaler = scaler.fit(X["X_train"]) + transformed = scaler.transform(X["X_train"]) + assert_allclose(transformed, + np.array([ + [[0], [0.5], [1]], + [[0], [0.5], [1]], + [[0], [0.5], [1]], + ])) + + +class TestMaxAbsScaler(unittest.TestCase): + + def test_maxabs_scaler(self): + data = np.array([ + [[-10], [2], [3]], + [[-7], [8], [9]], + [[-8], [11], [12]] + ]) + + dataset_properties = {'categorical_features': [], + 'numerical_features': [0]} + + X = { + 'X_train': data, + 'dataset_properties': dataset_properties + } + scaler_component = MaxAbsScaler() + + scaler_component = scaler_component.fit(X) + X = scaler_component.transform(X) + scaler = X['scaler']['numerical'] + + # check if the fit dictionary X is modified as expected + self.assertIsInstance(X['scaler'], dict) + self.assertIsInstance(scaler, BaseEstimator) + self.assertIsNone(X['scaler']['categorical']) + + # make column transformer with returned encoder to fit on data + scaler = scaler.fit(X["X_train"]) + transformed = scaler.transform(X["X_train"]) + print(transformed) + assert_allclose(transformed, + np.array([ + [[-1], [0.2], 
[0.3]], + [[-7 / 9], [8 / 9], [1]], + [[-8 / 12], [11 / 12], [1]], + ])) + + +class TestStandardScaler(unittest.TestCase): + + def test_standard_scaler(self): + data = np.array([ + [[1], [2], [3], [4], [5]], + [[7], [8], [9], [10], [11]], + [[10], [11], [12], [13], [14]] + ]) + + dataset_properties = {'categorical_features': [], + 'numerical_features': [0]} + + X = { + 'X_train': data, + 'dataset_properties': dataset_properties + } + scaler_component = StandardScaler() + + scaler_component = scaler_component.fit(X) + X = scaler_component.transform(X) + scaler = X['scaler']['numerical'] + + # check if the fit dictionary X is modified as expected + self.assertIsInstance(X['scaler'], dict) + self.assertIsInstance(scaler, BaseEstimator) + self.assertIsNone(X['scaler']['categorical']) + + # make column transformer with returned encoder to fit on data + scaler = scaler.fit(X["X_train"]) + transformed = scaler.transform(X["X_train"]) + assert_allclose(transformed, + np.array([ + [[-1.41421356], [-0.70710678], [0.], [0.70710678], [1.41421356]], + [[-1.41421356], [-0.70710678], [0.], [0.70710678], [1.41421356]], + [[-1.41421356], [-0.70710678], [0.], [0.70710678], [1.41421356]], + ])) + + +class TestNoneScaler(unittest.TestCase): + + def test_none_scaler(self): + data = np.array([ + [[1], [2], [3]], + [[7], [8], [9]], + [[10], [11], [12]] + ]) + + dataset_properties = {'categorical_features': [], + 'numerical_features': [0]} + + X = { + 'X_train': data, + 'dataset_properties': dataset_properties + } + scaler_component = NoScaler() + + scaler_component = scaler_component.fit(X) + X = scaler_component.transform(X) + + # check if the fit dictionary X is modified as expected + self.assertIsInstance(X['scaler'], dict) + self.assertIsNone(X['scaler']['categorical']) + self.assertIsNone(X['scaler']['numerical']) diff --git a/test/test_pipeline/components/test_time_series_transformer.py b/test/test_pipeline/components/test_time_series_transformer.py new file mode 100644 index 000000000..af8ef624e --- /dev/null +++ b/test/test_pipeline/components/test_time_series_transformer.py @@ -0,0 +1,30 @@ +import numpy as np + +import pytest + +from sklearn.pipeline import Pipeline + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ + TimeSeriesTransformer + +from test.test_pipeline.components.base import TimeSeriesPipeline + + +@pytest.mark.parametrize("fit_dictionary_time_series", ['classification_numerical_only'], indirect=True) +class TestTimeSeriesTransformer: + def test_time_series_preprocess(self, fit_dictionary_time_series): + pipeline = TimeSeriesPipeline(dataset_properties=fit_dictionary_time_series['dataset_properties']) + pipeline = pipeline.fit(fit_dictionary_time_series) + X = pipeline.transform(fit_dictionary_time_series) + transformer = X['time_series_transformer'] + + # check if transformer was added to fit dictionary + assert 'time_series_transformer' in X.keys() + # check if transformer is of expected type + # In this case we expect the time series transformer not the actual implementation behind it + # as the later is not callable and runs into error in the compose transform + assert isinstance(transformer, TimeSeriesTransformer) + assert isinstance(transformer.preprocessor, Pipeline) + + data = transformer.preprocessor.fit_transform(X['X_train']) + assert isinstance(data, np.ndarray) From 09e32c3c932979efb0952a417fa649e7b31cb602 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Wed, 24 Feb 2021 15:49:35 +0100 Subject: [PATCH 009/347] 
add sktime to examples dependencies --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c496a48c1..db252e98e 100755 --- a/setup.py +++ b/setup.py @@ -46,13 +46,14 @@ "pytest-cov", "codecov", "pep8", - "mypy", + "mypy" ], "examples": [ "matplotlib", "jupyter", "notebook", "seaborn", + "sktime" ], "docs": ["sphinx", "sphinx-gallery", "sphinx_bootstrap_theme", "numpydoc"], }, From 8454b0f33487b8b95b4bdab6041eae1f35c51fef Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Wed, 24 Feb 2021 16:22:58 +0100 Subject: [PATCH 010/347] fix for time series feature validator + test --- .../data/time_series_feature_validator.py | 10 ++- .../test_time_series_feature_validator.py | 87 +++++++++++++++++++ 2 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 test/test_data/test_time_series_feature_validator.py diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index dee284f1f..7a7422ef6 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -5,6 +5,7 @@ import sklearn.utils from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError from autoPyTorch.utils.logging_ import PicklableClientLogger @@ -13,6 +14,7 @@ class TimeSeriesFeatureValidator(BaseEstimator): def __init__(self, logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None) -> None: self.logger = logger + self._is_fitted = False def fit(self, X_train: np.ndarray, @@ -33,7 +35,7 @@ def fit(self, """ if not isinstance(X_train, np.ndarray): - raise ValueError("Time series train data must be given as a numpy array") + raise ValueError(f"Time series train data must be given as a numpy array, but got {type(X_train)}") if X_train.ndim != 3: raise ValueError(f"Invalid number of dimensions for time series train data, " @@ -52,7 +54,7 @@ def fit(self, if X_test is not None: if not isinstance(X_test, np.ndarray): - raise ValueError("Time series test data must be given as a numpy array") + raise ValueError(f"Time series test data must be given as a numpy array, but got {type(X_test)}") if not X_test.ndim == 3: raise ValueError(f"Invalid number of dimensions for time series test data, " @@ -74,6 +76,8 @@ def fit(self, accept_large_sparse=False ) + self._is_fitted = True + return self def transform(self, X: np.ndarray) -> np.ndarray: @@ -87,6 +91,8 @@ def transform(self, X: np.ndarray) -> np.ndarray: np.ndarray: The transformed array """ + if not self._is_fitted: + raise NotFittedError("Cannot call transform on a validator that is not fitted") return sklearn.utils.check_array( X, diff --git a/test/test_data/test_time_series_feature_validator.py b/test/test_data/test_time_series_feature_validator.py new file mode 100644 index 000000000..bf255e0f1 --- /dev/null +++ b/test/test_data/test_time_series_feature_validator.py @@ -0,0 +1,87 @@ +import numpy as np + +import pandas as pd + +import pytest +import scipy + +from scipy import sparse + +from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator + + +# Fixtures to be used in this class. 
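For orientation, a minimal usage sketch of the validator fixed above (illustrative only; it simply mirrors the checks added in this patch): input must be a 3-dimensional numpy array of shape [B, T, F], and transform() is only allowed after fit().

import numpy as np

from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator

X_train = np.random.randn(8, 20, 3)  # 8 sequences, 20 time steps, 3 features
X_test = np.random.randn(4, 20, 3)   # may differ in batch size, but not in T or F

validator = TimeSeriesFeatureValidator()
validator.fit(X_train, X_test)           # raises ValueError for non-array or non-3D input
checked = validator.transform(X_train)   # raises NotFittedError if called before fit()
assert checked.shape == X_train.shape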
By default all elements have 100 datapoints +@pytest.fixture +def input_data_featuretest(request): + if request.param == 'numpy_numericalonly_nonan': + return np.array([ + [[1.0], [2.0], [3.0]], + [[-3.0], [-2.0], [-1.0]] + ]) + else: + ValueError("Unsupported indirect fixture {}".format(request.param)) + + +# Actual checks for the features +@pytest.mark.parametrize( + 'input_data_featuretest', + ( + 'numpy_numericalonly_nonan', + ), + indirect=True +) +def test_featurevalidator_supported_types(input_data_featuretest): + validator = TimeSeriesFeatureValidator() + validator.fit(input_data_featuretest, input_data_featuretest) + transformed_X = validator.transform(input_data_featuretest) + if sparse.issparse(input_data_featuretest): + assert sparse.issparse(transformed_X) + else: + assert isinstance(transformed_X, np.ndarray) + assert np.shape(input_data_featuretest) == np.shape(transformed_X) + assert np.issubdtype(transformed_X.dtype, np.number) + assert validator._is_fitted + + +def test_featurevalidator_unsupported_numpy(): + validator = TimeSeriesFeatureValidator() + + with pytest.raises(ValueError, match="Input contains NaN, infinity or a value too large *"): + validator.fit(X_train=np.array([[[1], [2], [np.nan]], [[4], [5], [6]]])) + + +def test_features_unsupported_calls_are_raised(): + """ + Makes sure we raise a proper message to the user, + when providing not supported data input or using the validator in a way that is not + expected + """ + validator = TimeSeriesFeatureValidator() + + with pytest.raises(ValueError, match="Time series train data must be given as a numpy array, but got *"): + validator.fit( + pd.DataFrame({'x': [1.0, 2.0, 3.0]}) + ) + + with pytest.raises(ValueError, match="Time series train data must be given as a numpy array, but got *"): + validator.fit( + [1.0, 2.0, 3.0] + ) + + with pytest.raises(ValueError, match="Time series train data must be given as a numpy array, but got *"): + validator.fit({'input1': 1, 'input2': 2}) + + with pytest.raises(ValueError, match="Invalid number of dimensions for time series train data *"): + validator.fit(X_train=np.array([[1, 2, 3], [4, 5, 6]])) + + with pytest.raises(ValueError, match="Invalid number of dimensions for time series test data *"): + validator.fit(X_train=np.array([[[1], [2], [3]], [[4], [5], [6]]]), + X_test=np.array([[1, 2, 3], [4, 5, 6]])) + + with pytest.raises(ValueError, match="Time series train and test data are expected to have the same shape " + "except for the batch dimension, but got *"): + validator.fit(X_train=np.array([[[1], [2], [3]], [[4], [5], [6]]]), + X_test=np.array([[[1], [2], [3], [4]], [[4], [5], [6], [7]]])) + + with pytest.raises(ValueError, match=r"Cannot call transform on a validator that is not fit"): + validator.transform(np.array([[1, 2, 3], [4, 5, 6]])) From 508b897ef0454d56bcd09e73cf450d8ba5fc9e57 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Fri, 26 Feb 2021 11:59:14 +0100 Subject: [PATCH 011/347] fix scaler and sparse init --- .../scaling/NoScaler.py | 19 +++---------------- .../scaling/utils.py | 3 +++ .../setup/network_initializer/SparseInit.py | 7 +++---- 3 files changed, 9 insertions(+), 20 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py index 903be8110..423d8aa58 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py +++ 
b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py @@ -3,6 +3,7 @@ import numpy as np from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import TimeSeriesScaler class NoScaler(BaseScaler): @@ -31,26 +32,12 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: """ self.check_requirements(X, y) - + self.preprocessor["numerical"] = TimeSeriesScaler(mode="none") return self - def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: - """ - The transform function calls the transform function of the - underlying model and returns the transformed array. - - Args: - X (np.ndarray): input features - - Returns: - np.ndarray: Transformed features - """ - X.update({'scaler': self.preprocessor}) - return X - @staticmethod def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: return { 'shortname': 'NoScaler', - 'name': 'No Scaler' + 'name': 'NoScaler' } diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index f8d6f5e5a..2ae09303d 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -49,5 +49,8 @@ def transform(self, X: np.ndarray) -> np.ndarray: return X / max_abs_ + elif self.mode == "none": + return X + else: raise ValueError(f"Unknown mode {self.mode} for time series scaler") diff --git a/autoPyTorch/pipeline/components/setup/network_initializer/SparseInit.py b/autoPyTorch/pipeline/components/setup/network_initializer/SparseInit.py index 1e6dbdbf3..4820c55db 100644 --- a/autoPyTorch/pipeline/components/setup/network_initializer/SparseInit.py +++ b/autoPyTorch/pipeline/components/setup/network_initializer/SparseInit.py @@ -19,12 +19,11 @@ def weights_init(self) -> Callable: self.config is a dictionary created form a given config in the config space. It contains the necessary information to build a network. 
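Aside, an illustrative sketch (not part of the patch): with the "none" mode added to utils.py above, TimeSeriesScaler(mode="none") acts as an identity transform, which is what NoScaler now registers for the numerical features instead of leaving that entry empty.

import numpy as np

from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import TimeSeriesScaler

X = np.array([[[1.0], [2.0], [3.0]],
              [[7.0], [8.0], [9.0]]])

scaler = TimeSeriesScaler(mode="none").fit(X)
assert np.array_equal(scaler.transform(X), X)  # data passes through unchanged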
""" + def initialization(m: torch.nn.Module) -> None: - if isinstance(m, (torch.nn.Conv1d, - torch.nn.Conv2d, - torch.nn.Conv3d, - torch.nn.Linear)): + if isinstance(m, torch.nn.Linear): torch.nn.init.sparse_(m.weight.data, 0.9) if m.bias is not None and self.bias_strategy == 'Zero': torch.nn.init.constant_(m.bias.data, 0.0) + return initialization From 92141397ae625c575b89eafe53ad24da6686d033 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Fri, 26 Feb 2021 13:04:57 +0100 Subject: [PATCH 012/347] added time series data loader test --- .../test_time_series_data_loader.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 test/test_pipeline/components/test_time_series_data_loader.py diff --git a/test/test_pipeline/components/test_time_series_data_loader.py b/test/test_pipeline/components/test_time_series_data_loader.py new file mode 100644 index 000000000..bb09d361d --- /dev/null +++ b/test/test_pipeline/components/test_time_series_data_loader.py @@ -0,0 +1,45 @@ +import unittest +import unittest.mock + +import torchvision + +from autoPyTorch.pipeline.components.training.data_loader.time_series_data_loader import ( + TimeSeriesDataLoader +) + + +class TestTimeSeriesDataLoader(unittest.TestCase): + def test_build_transform_small_preprocess_true(self): + """ + Makes sure a proper composition is created + """ + loader = TimeSeriesDataLoader() + + fit_dictionary = {'dataset_properties': {'is_small_preprocess': True}} + for thing in ['scaler']: + fit_dictionary[thing] = [unittest.mock.Mock()] + + compose = loader.build_transform(fit_dictionary, mode='train') + + self.assertIsInstance(compose, torchvision.transforms.Compose) + + # No preprocessing needed here as it was done before, only from_numpy + self.assertEqual(len(compose.transforms), 1) + + def test_build_transform_small_preprocess_false(self): + """ + Makes sure a proper composition is created + """ + loader = TimeSeriesDataLoader() + + fit_dictionary = {'dataset_properties': {'is_small_preprocess': False}, + 'preprocess_transforms': [unittest.mock.Mock()]} + + compose = loader.build_transform(fit_dictionary, mode='train') + + self.assertIsInstance(compose, torchvision.transforms.Compose) + + print(compose) + + # We expect the preprocess_transforms and from_numpy + self.assertEqual(len(compose.transforms), 2) From 3349e4f095e25f3e507f4746b489d6d4ce148ace Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Fri, 26 Feb 2021 14:06:58 +0100 Subject: [PATCH 013/347] added lstm backbone for time series --- .../setup/network_backbone/LSTMBackbone.py | 125 ++++++++++++++++++ test/test_pipeline/components/test_setup.py | 4 +- 2 files changed, 127 insertions(+), 2 deletions(-) create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py new file mode 100644 index 000000000..706199801 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py @@ -0,0 +1,125 @@ +import math +from collections import OrderedDict +from typing import Any, Dict, Optional, Tuple + +import ConfigSpace as CS +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter +) + +import torch +from torch import nn + +from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone 
import NetworkBackboneComponent + + +class _LSTM(nn.Module): + def __init__(self, + in_features: int, + config: Dict[str, Any]): + super().__init__() + self.config = config + self.lstm = nn.LSTM(input_size=in_features, + hidden_size=config["hidden_size"], + num_layers=config["num_layers"], + dropout=config.get("dropout", 0.0), + bidirectional=config["bidirectional"], + batch_first=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, T, _ = x.shape + + hidden_states, (_, _) = self.lstm(x) + if not self.config["bidirectional"]: + print(hidden_states[:, -1, :].shape) + return hidden_states[:, -1, :] + else: + # concatenate last forward hidden state with first backward hidden state + hidden_states_by_direction = hidden_states.view(B, + T, + 2, + self.config["hidden_size"]) + out = torch.cat([ + hidden_states_by_direction[:, -1, 0, :], + hidden_states_by_direction[:, 0, 1, :] + ], dim=1) + return out + + +class LSTMBackbone(NetworkBackboneComponent): + """ + Standard searchable LSTM backbone for time series data + """ + + def __init__(self, **kwargs: Any): + super().__init__(**kwargs) + + def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: + backbone = _LSTM(in_features=input_shape[-1], + config=self.config) + self.backbone = backbone + return backbone + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + return { + 'shortname': 'LSTMBackbone', + 'name': 'LSTMBackbone', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + @staticmethod + def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, + num_layers: Tuple[Tuple, int] = ((1, 3), 1), + hidden_size: Tuple[Tuple, int] = ((64, 512), 256), + use_dropout: Tuple[Tuple, bool] = ((True, False), False), + dropout: Tuple[Tuple, float] = ((0, 0.5), 0.2), + bidirectional: Tuple[Tuple, bool] = ((True, False), True) + ) -> ConfigurationSpace: + cs = CS.ConfigurationSpace() + + min_num_layers, max_num_layers = num_layers[0] + num_layers = UniformIntegerHyperparameter('num_layers', + lower=min_num_layers, + upper=max_num_layers, + default_value=num_layers[1]) + cs.add_hyperparameter(num_layers) + + min_hidden_size, max_hidden_size = hidden_size[0] + hidden_size = UniformIntegerHyperparameter('hidden_size', + lower=min_hidden_size, + upper=max_hidden_size, + default_value=hidden_size[1]) + cs.add_hyperparameter(hidden_size) + + use_dropout = CategoricalHyperparameter('use_dropout', + choices=use_dropout[0], + default_value=use_dropout[1]) + + min_dropout, max_dropout = dropout[0] + dropout = UniformFloatHyperparameter('dropout', + lower=min_dropout, + upper=max_dropout, + default_value=dropout[1]) + + cs.add_hyperparameters([use_dropout, dropout]) + cs.add_condition(CS.AndConjunction(CS.EqualsCondition(dropout, use_dropout, True), + CS.GreaterThanCondition(dropout, num_layers, 1))) + + bidirectional = CategoricalHyperparameter('bidirectional', + choices=bidirectional[0], + default_value=bidirectional[1]) + cs.add_hyperparameter(bidirectional) + + return cs + + +if __name__ == "__main__": + lstm = _LSTM(20, dict(LSTMBackbone.get_hyperparameter_search_space().get_default_configuration())) + + lstm(torch.randn(32, 128, 20)) diff --git a/test/test_pipeline/components/test_setup.py b/test/test_pipeline/components/test_setup.py index 07e2f2f03..1761cf9e0 100644 --- a/test/test_pipeline/components/test_setup.py +++ b/test/test_pipeline/components/test_setup.py @@ -293,7 +293,7 @@ class 
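For intuition, a short stand-alone sketch (illustrative only) of the backbone defined above: in the unidirectional case the output is the last hidden state with hidden_size features, while the bidirectional branch concatenates the last forward state with the first backward state, giving 2 * hidden_size features per sequence.

import torch

from autoPyTorch.pipeline.components.setup.network_backbone.LSTMBackbone import _LSTM

config = {"hidden_size": 64, "num_layers": 1, "bidirectional": True}
backbone = _LSTM(in_features=3, config=config)

out = backbone(torch.randn(8, 20, 3))  # input of shape [batch, time, features]
assert out.shape == (8, 2 * config["hidden_size"])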
NetworkBackboneTest(unittest.TestCase): def test_all_backbones_available(self): backbone_choice = NetworkBackboneChoice(dataset_properties={}) - self.assertEqual(len(backbone_choice.get_components().keys()), 8) + self.assertEqual(len(backbone_choice.get_components().keys()), 9) def test_dummy_forward_backward_pass(self): network_backbone_choice = NetworkBackboneChoice(dataset_properties={}) @@ -328,7 +328,7 @@ def test_dummy_forward_backward_pass(self): def test_every_backbone_is_valid(self): backbone_choice = NetworkBackboneChoice(dataset_properties={}) - self.assertEqual(len(backbone_choice.get_components().keys()), 8) + self.assertEqual(len(backbone_choice.get_components().keys()), 9) for name, backbone in backbone_choice.get_components().items(): config = backbone.get_hyperparameter_search_space().sample_configuration() From 980c6c44f83764d318a661355a35ebabcdc56c3f Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Fri, 26 Feb 2021 14:24:17 +0100 Subject: [PATCH 014/347] fix flake --- .../components/setup/network_backbone/LSTMBackbone.py | 9 --------- test/test_data/test_time_series_feature_validator.py | 1 - .../components/test_time_series_data_loader.py | 2 -- .../test_pipeline/components/test_time_series_scalers.py | 6 ++---- .../components/test_time_series_transformer.py | 4 ++-- 5 files changed, 4 insertions(+), 18 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py index 706199801..fd23999d5 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py @@ -1,5 +1,3 @@ -import math -from collections import OrderedDict from typing import Any, Dict, Optional, Tuple import ConfigSpace as CS @@ -34,7 +32,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: hidden_states, (_, _) = self.lstm(x) if not self.config["bidirectional"]: - print(hidden_states[:, -1, :].shape) return hidden_states[:, -1, :] else: # concatenate last forward hidden state with first backward hidden state @@ -117,9 +114,3 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, cs.add_hyperparameter(bidirectional) return cs - - -if __name__ == "__main__": - lstm = _LSTM(20, dict(LSTMBackbone.get_hyperparameter_search_space().get_default_configuration())) - - lstm(torch.randn(32, 128, 20)) diff --git a/test/test_data/test_time_series_feature_validator.py b/test/test_data/test_time_series_feature_validator.py index bf255e0f1..5bc638946 100644 --- a/test/test_data/test_time_series_feature_validator.py +++ b/test/test_data/test_time_series_feature_validator.py @@ -3,7 +3,6 @@ import pandas as pd import pytest -import scipy from scipy import sparse diff --git a/test/test_pipeline/components/test_time_series_data_loader.py b/test/test_pipeline/components/test_time_series_data_loader.py index bb09d361d..1b15db916 100644 --- a/test/test_pipeline/components/test_time_series_data_loader.py +++ b/test/test_pipeline/components/test_time_series_data_loader.py @@ -39,7 +39,5 @@ def test_build_transform_small_preprocess_false(self): self.assertIsInstance(compose, torchvision.transforms.Compose) - print(compose) - # We expect the preprocess_transforms and from_numpy self.assertEqual(len(compose.transforms), 2) diff --git a/test/test_pipeline/components/test_time_series_scalers.py b/test/test_pipeline/components/test_time_series_scalers.py index f03c43a86..91ea53f75 100644 --- 
a/test/test_pipeline/components/test_time_series_scalers.py +++ b/test/test_pipeline/components/test_time_series_scalers.py @@ -4,13 +4,12 @@ from numpy.testing import assert_allclose from sklearn.base import BaseEstimator -from sklearn.compose import make_column_transformer -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.MinMaxScaler import MinMaxScaler from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.MaxAbsScaler import MaxAbsScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.MinMaxScaler import MinMaxScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.NoScaler import NoScaler from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.StandardScaler import \ StandardScaler -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.NoScaler import NoScaler class TestMinMaxScaler(unittest.TestCase): @@ -81,7 +80,6 @@ def test_maxabs_scaler(self): # make column transformer with returned encoder to fit on data scaler = scaler.fit(X["X_train"]) transformed = scaler.transform(X["X_train"]) - print(transformed) assert_allclose(transformed, np.array([ [[-1], [0.2], [0.3]], diff --git a/test/test_pipeline/components/test_time_series_transformer.py b/test/test_pipeline/components/test_time_series_transformer.py index af8ef624e..456267e3e 100644 --- a/test/test_pipeline/components/test_time_series_transformer.py +++ b/test/test_pipeline/components/test_time_series_transformer.py @@ -1,3 +1,5 @@ +from test.test_pipeline.components.base import TimeSeriesPipeline + import numpy as np import pytest @@ -7,8 +9,6 @@ from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ TimeSeriesTransformer -from test.test_pipeline.components.base import TimeSeriesPipeline - @pytest.mark.parametrize("fit_dictionary_time_series", ['classification_numerical_only'], indirect=True) class TestTimeSeriesTransformer: From 2fc058c13d9e59a7dfaf576c79047a63abbb1cca Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Fri, 26 Feb 2021 20:19:13 +0100 Subject: [PATCH 015/347] added sequential mnist example --- .../example_time_series_classification.py | 2 - ..._series_classification_sequential_mnist.py | 105 ++++++++++++++++++ 2 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 examples/example_time_series_classification_sequential_mnist.py diff --git a/examples/example_time_series_classification.py b/examples/example_time_series_classification.py index 78d630c72..41d38d4bc 100644 --- a/examples/example_time_series_classification.py +++ b/examples/example_time_series_classification.py @@ -10,8 +10,6 @@ import tempfile as tmp import warnings -from autoPyTorch import metrics - os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() os.environ['OMP_NUM_THREADS'] = '1' os.environ['OPENBLAS_NUM_THREADS'] = '1' diff --git a/examples/example_time_series_classification_sequential_mnist.py b/examples/example_time_series_classification_sequential_mnist.py new file mode 100644 index 000000000..7b913de71 --- /dev/null +++ b/examples/example_time_series_classification_sequential_mnist.py @@ -0,0 +1,105 @@ +""" +====================== +Time Series Classification on Sequential MNIST +====================== + +The following example shows how to fit a sample classification model +with AutoPyTorch +""" +import os +import tempfile as tmp +import warnings + +import torch +from 
torch.utils.data import Subset + +import torchvision + +os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + +import numpy as np + +from autoPyTorch.api.time_series_classification import TimeSeriesClassificationTask +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + + +def get_search_space_updates(): + """ + Search space updates to the task can be added using HyperparameterSearchSpaceUpdates + Returns: + HyperparameterSearchSpaceUpdates + """ + updates = HyperparameterSearchSpaceUpdates() + updates.append(node_name="data_loader", + hyperparameter="batch_size", + value_range=[16, 512], + default_value=32) + updates.append(node_name="lr_scheduler", + hyperparameter="CosineAnnealingLR:T_max", + value_range=[50, 60], + default_value=55) + updates.append(node_name='optimizer', + hyperparameter='AdamOptimizer:lr', + value_range=[0.0001, 0.001], + default_value=0.0005) + return updates + + +if __name__ == '__main__': + ############################################################################ + # Data Loading + # ============ + train_dataset = torchvision.datasets.MNIST(root=".", train=True, download=True) + test_dataset = torchvision.datasets.MNIST(root=".", train=False) + + train_dataset = Subset(train_dataset, indices=torch.randperm(len(train_dataset))[:10000]) + test_dataset = Subset(train_dataset, indices=torch.randperm(len(test_dataset))[:100]) + + X_train = np.empty((len(train_dataset), 28 * 28, 1), dtype=np.float32) + y_train = np.empty(len(train_dataset), dtype=np.int32) + X_test = np.empty((len(test_dataset), 28 * 28, 1), dtype=np.float32) + y_test = np.empty(len(test_dataset), dtype=np.int32) + + for i, (image, label) in enumerate(train_dataset): + X_train[i] = np.asarray(image).reshape(28 * 28, 1) + y_train[i] = label + + for i, (image, label) in enumerate(test_dataset): + X_test[i] = np.asarray(image).reshape(28 * 28, 1) + y_test[i] = label + + ############################################################################ + # Build and fit a classifier + # ========================== + api = TimeSeriesClassificationTask( + n_jobs=4, + delete_tmp_folder_after_terminate=False, + search_space_updates=get_search_space_updates(), + exclude_components={"network_backbone": ["LSTMBackbone"]} + ) + api.set_pipeline_config(device="cuda") + api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + budget_type="epochs", + budget=5, + optimize_metric='accuracy', + total_walltime_limit=3600, + func_eval_time_limit=3600 + ) + + ############################################################################ + # Print the final ensemble performance + # ==================================== + print(api.run_history, api.trajectory) + y_pred = api.predict(X_test) + score = api.score(y_pred, y_test) + print(score) From 1f392bd194228b203152535182ad809179a9c8a5 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Sat, 27 Feb 2021 12:48:34 +0100 Subject: [PATCH 016/347] added time series regression --- autoPyTorch/api/time_series_classification.py | 6 +- autoPyTorch/api/time_series_regression.py | 250 +++++++++++++++ .../pipeline/time_series_regression.py | 177 +++++++++++ autoPyTorch/utils/pipeline.py | 31 +- test/conftest.py | 22 +- .../test_time_series_regression.py | 295 
++++++++++++++++++ 6 files changed, 762 insertions(+), 19 deletions(-) create mode 100644 autoPyTorch/api/time_series_regression.py create mode 100644 autoPyTorch/pipeline/time_series_regression.py create mode 100644 test/test_pipeline/test_time_series_regression.py diff --git a/autoPyTorch/api/time_series_classification.py b/autoPyTorch/api/time_series_classification.py index b9597cf9e..0d6241edf 100644 --- a/autoPyTorch/api/time_series_classification.py +++ b/autoPyTorch/api/time_series_classification.py @@ -93,11 +93,7 @@ def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, An raise ValueError("Dataset is incompatible for the given task,: {}".format( type(dataset) )) - return {'task_type': dataset.task_type, - 'output_type': dataset.output_type, - 'issparse': dataset.issparse, - 'numerical_features': dataset.numerical_features, - 'categorical_features': dataset.categorical_features} + return dataset.get_required_dataset_info() def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TimeSeriesClassificationPipeline: return TimeSeriesClassificationPipeline(dataset_properties=dataset_properties) diff --git a/autoPyTorch/api/time_series_regression.py b/autoPyTorch/api/time_series_regression.py new file mode 100644 index 000000000..8493cc4d7 --- /dev/null +++ b/autoPyTorch/api/time_series_regression.py @@ -0,0 +1,250 @@ +import os +import uuid +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np + +import pandas as pd + +from autoPyTorch.api.base_task import BaseTask +from autoPyTorch.constants import ( + TASK_TYPES_TO_STRING, TIMESERIES_REGRESSION +) +from autoPyTorch.data.time_series_validator import TimeSeriesInputValidator +from autoPyTorch.datasets.base_dataset import BaseDataset +from autoPyTorch.datasets.resampling_strategy import ( + CrossValTypes, + HoldoutValTypes, +) +from autoPyTorch.datasets.time_series_dataset import TimeSeriesDataset +from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline +from autoPyTorch.pipeline.time_series_regression import TimeSeriesRegressionPipeline +from autoPyTorch.utils.backend import Backend +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + + +class TimeSeriesRegressionTask(BaseTask): + """ + Time Series Regression API to the pipelines. + Args: + seed (int): seed to be used for reproducibility. + n_jobs (int), (default=1): number of consecutive processes to spawn. + logging_config (Optional[Dict]): specifies configuration + for logging, if None, it is loaded from the logging.yaml + ensemble_size (int), (default=50): Number of models added to the ensemble built by + Ensemble selection from libraries of models. + Models are drawn with replacement. + ensemble_nbest (int), (default=50): only consider the ensemble_nbest + models to build the ensemble + max_models_on_disc (int), (default=50): maximum number of models saved to disc. + Also, controls the size of the ensemble as any additional models will be deleted. + Must be greater than or equal to 1. + temporary_directory (str): folder to store configuration output and log file + output_directory (str): folder to store predictions for optional test set + delete_tmp_folder_after_terminate (bool): determines whether to delete the temporary directory, + when finished + include_components (Optional[Dict]): If None, all possible components are used. + Otherwise specifies set of components to use. 
+ exclude_components (Optional[Dict]): If None, all possible components are used. + Otherwise specifies set of components not to use. Incompatible with include + components + """ + + def __init__( + self, + seed: int = 1, + n_jobs: int = 1, + logging_config: Optional[Dict] = None, + ensemble_size: int = 50, + ensemble_nbest: int = 50, + max_models_on_disc: int = 50, + temporary_directory: Optional[str] = None, + output_directory: Optional[str] = None, + delete_tmp_folder_after_terminate: bool = True, + delete_output_folder_after_terminate: bool = True, + include_components: Optional[Dict] = None, + exclude_components: Optional[Dict] = None, + resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + backend: Optional[Backend] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ): + super().__init__( + seed=seed, + n_jobs=n_jobs, + logging_config=logging_config, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + max_models_on_disc=max_models_on_disc, + temporary_directory=temporary_directory, + output_directory=output_directory, + delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate, + delete_output_folder_after_terminate=delete_output_folder_after_terminate, + include_components=include_components, + exclude_components=exclude_components, + backend=backend, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + search_space_updates=search_space_updates, + task_type=TASK_TYPES_TO_STRING[TIMESERIES_REGRESSION], + ) + + def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]: + if not isinstance(dataset, TimeSeriesDataset): + raise ValueError("Dataset is incompatible for the given task,: {}".format( + type(dataset) + )) + return dataset.get_required_dataset_info() + + def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularRegressionPipeline: + return TimeSeriesRegressionPipeline(dataset_properties=dataset_properties) + + def search(self, + optimize_metric: str, + X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + dataset_name: Optional[str] = None, + budget_type: Optional[str] = None, + budget: Optional[float] = None, + total_walltime_limit: int = 100, + func_eval_time_limit: int = 60, + traditional_per_total_budget: float = 0., + memory_limit: Optional[int] = 4096, + smac_scenario_args: Optional[Dict[str, Any]] = None, + get_smac_object_callback: Optional[Callable] = None, + all_supported_metrics: bool = True, + precision: int = 32, + disable_file_output: List = [], + load_models: bool = True, + ) -> 'BaseTask': + """ + Search for the best pipeline configuration for the given dataset. + + Fit both optimizes the machine learning models and builds an ensemble out of them. + To disable ensembling, set ensemble_size==0. + using the optimizer. + Args: + X_train, y_train, X_test, y_test: Union[np.ndarray, List, pd.DataFrame] + A pair of features (X_train) and targets (y_train) used to fit a + pipeline. Additionally, a holdout of this pairs (X_test, y_test) can + be provided to track the generalization performance of each stage. + optimize_metric (str): name of the metric that is used to + evaluate a pipeline. 
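As a usage sketch (illustrative only; the data and budgets below are placeholders, not recommendations), the new task is driven like its classification counterpart: search() on [B, T, F] arrays, then predict() on unseen sequences.

import numpy as np

from autoPyTorch.api.time_series_regression import TimeSeriesRegressionTask

X_train = np.random.randn(100, 20, 1).astype(np.float32)  # 100 sequences, 20 steps, 1 feature
y_train = np.random.randn(100).astype(np.float32)         # one scalar target per sequence

api = TimeSeriesRegressionTask()
api.search(X_train=X_train, y_train=y_train,
           optimize_metric='r2',
           total_walltime_limit=300,
           func_eval_time_limit=60)
y_pred = api.predict(np.random.randn(10, 20, 1).astype(np.float32))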
+ budget_type (Optional[str]): + Type of budget to be used when fitting the pipeline. + Either 'epochs' or 'runtime'. If not provided, uses + the default in the pipeline config ('epochs') + budget (Optional[float]): + Budget to fit a single run of the pipeline. If not + provided, uses the default in the pipeline config + total_walltime_limit (int), (default=100): Time limit + in seconds for the search of appropriate models. + By increasing this value, autopytorch has a higher + chance of finding better models. + func_eval_time_limit (int), (default=60): Time limit + for a single call to the machine learning model. + Model fitting will be terminated if the machine + learning algorithm runs over the time limit. Set + this value high enough so that typical machine + learning algorithms can be fit on the training + data. + traditional_per_total_budget (float), (default=0.1): + Percent of total walltime to be allocated for + running traditional classifiers. + memory_limit (Optional[int]), (default=4096): Memory + limit in MB for the machine learning algorithm. autopytorch + will stop fitting the machine learning algorithm if it tries + to allocate more than memory_limit MB. If None is provided, + no memory limit is set. In case of multi-processing, memory_limit + will be per job. This memory limit also applies to the ensemble + creation process. + smac_scenario_args (Optional[Dict]): Additional arguments inserted + into the scenario of SMAC. See the + [SMAC documentation] (https://automl.github.io/SMAC3/master/options.html?highlight=scenario#scenario) + get_smac_object_callback (Optional[Callable]): Callback function + to create an object of class + [smac.optimizer.smbo.SMBO](https://automl.github.io/SMAC3/master/apidoc/smac.optimizer.smbo.html). + The function must accept the arguments scenario_dict, + instances, num_params, runhistory, seed and ta. This is + an advanced feature. Use only if you are familiar with + [SMAC](https://automl.github.io/SMAC3/master/index.html). + all_supported_metrics (bool), (default=True): if True, all + metrics supporting current task will be calculated + for each pipeline and results will be available via cv_results + precision (int), (default=32): Numeric precision used when loading + ensemble data. Can be either '16', '32' or '64'. + disable_file_output (Union[bool, List]): + load_models (bool), (default=True): Whether to load the + models after fitting AutoPyTorch. + + Returns: + self + + """ + if dataset_name is None: + dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) + + # we have to create a logger for at this point for the validator + self._logger = self._get_logger(dataset_name) + + # Create a validator object to make sure that the data provided by + # the user matches the autopytorch requirements + self.InputValidator = TimeSeriesInputValidator( + is_classification=False, + logger_port=self._logger_port, + ) + + # Fit a input validator to check the provided data + # Also, an encoder is fit to both train and test data, + # to prevent unseen categories during inference + self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + + self.dataset = TimeSeriesDataset( + X=X_train, Y=y_train, + X_test=X_test, Y_test=y_test, + validator=self.InputValidator, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + ) + + if traditional_per_total_budget > 0.: + self._logger.warning("Time series regression for now does not support traditional classifiers. 
" + "Setting traditional_per_total_budget to 0.") + traditional_per_total_budget = 0. + + return self._search( + dataset=self.dataset, + optimize_metric=optimize_metric, + budget_type=budget_type, + budget=budget, + total_walltime_limit=total_walltime_limit, + func_eval_time_limit=func_eval_time_limit, + traditional_per_total_budget=traditional_per_total_budget, + memory_limit=memory_limit, + smac_scenario_args=smac_scenario_args, + get_smac_object_callback=get_smac_object_callback, + all_supported_metrics=all_supported_metrics, + precision=precision, + disable_file_output=disable_file_output, + load_models=load_models, + ) + + def predict( + self, + X_test: np.ndarray, + batch_size: Optional[int] = None, + n_jobs: int = 1 + ) -> np.ndarray: + if self.InputValidator is None or not self.InputValidator._is_fitted: + raise ValueError("predict() is only supported after calling search. Kindly call first " + "the estimator fit() method.") + + X_test = self.InputValidator.feature_validator.transform(X_test) + predicted_values = super().predict(X_test, batch_size=batch_size, + n_jobs=n_jobs) + + # Allow to predict in the original domain -- that is, the user is not interested + # in our encoded values + return self.InputValidator.target_validator.inverse_transform(predicted_values) diff --git a/autoPyTorch/pipeline/time_series_regression.py b/autoPyTorch/pipeline/time_series_regression.py new file mode 100644 index 000000000..04b063224 --- /dev/null +++ b/autoPyTorch/pipeline/time_series_regression.py @@ -0,0 +1,177 @@ +import warnings +from typing import Any, Dict, List, Optional, Tuple + +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace + +import numpy as np + +from sklearn.base import RegressorMixin + +from autoPyTorch.constants import STRING_TO_TASK_TYPES +from autoPyTorch.pipeline.base_pipeline import BasePipeline +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ + ScalerChoice +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ + TimeSeriesTransformer +from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing +from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice import SchedulerChoice +from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent +from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone_choice import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice import NetworkHeadChoice +from autoPyTorch.pipeline.components.setup.network_initializer.base_network_init_choice import ( + NetworkInitializerChoice +) +from autoPyTorch.pipeline.components.setup.optimizer.base_optimizer_choice import OptimizerChoice +from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader +from autoPyTorch.pipeline.components.training.data_loader.time_series_data_loader import TimeSeriesDataLoader +from autoPyTorch.pipeline.components.training.trainer.base_trainer_choice import ( + TrainerChoice +) +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + + +class TimeSeriesRegressionPipeline(RegressorMixin, BasePipeline): + """This class is a proof of concept to integrate AutoPyTorch Components + + It implements a pipeline, 
which includes as steps: + + ->One preprocessing step + ->One neural network + + Contrary to the sklearn API it is not possible to enumerate the + possible parameters in the __init__ function because we only know the + available regressors at runtime. For this reason the user must + specifiy the parameters by passing an instance of + ConfigSpace.configuration_space.Configuration. + + + Args: + config (Configuration) + The configuration to evaluate. + random_state (Optional[RandomState): random_state is the random number generator + + Attributes: + Examples + """ + + def __init__(self, + config: Optional[Configuration] = None, + steps: Optional[List[Tuple[str, autoPyTorchChoice]]] = None, + dataset_properties: Optional[Dict[str, Any]] = None, + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + random_state: Optional[np.random.RandomState] = None, + init_params: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ): + super().__init__( + config, steps, dataset_properties, include, exclude, + random_state, init_params, search_space_updates) + + def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: + """Scores the fitted estimator on (X, y) + + Args: + X (np.ndarray): input to the pipeline, from which to guess targets + batch_size (Optional[int]): batch_size controls whether the pipeline + will be called on small chunks of the data. Useful when calling the + predict method on the whole array X results in a MemoryError. + Returns: + np.ndarray: coefficient of determination R^2 of the prediction + """ + from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics, calculate_score + metrics = get_metrics(self.dataset_properties, ['r2']) + y_pred = self.predict(X, batch_size=batch_size) + r2 = calculate_score(y, y_pred, task_type=STRING_TO_TASK_TYPES[self.dataset_properties['task_type']], + metrics=metrics)['r2'] + return r2 + + def _get_hyperparameter_search_space(self, + dataset_properties: Dict[str, Any], + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + ) -> ConfigurationSpace: + """Create the hyperparameter configuration space. + + For the given steps, and the Choices within that steps, + this procedure returns a configuration space object to + explore. + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Characteristics + of the dataset to guide the pipeline choices of components + + Returns: + cs (Configuration): The configuration space describing the TimeSeriesRegressionPipeline. + """ + cs = ConfigurationSpace() + + if dataset_properties is None or not isinstance(dataset_properties, dict): + if not isinstance(dataset_properties, dict): + warnings.warn('The given dataset_properties argument contains an illegal value.' + 'Proceeding with the default value') + dataset_properties = dict() + + if 'target_type' not in dataset_properties: + dataset_properties['target_type'] = 'time_series_regression' + if dataset_properties['target_type'] != 'time_series_regression': + warnings.warn('Time series regression is being used, however the target_type' + 'is not given as "time_series_regression". 
Overriding it.') + dataset_properties['target_type'] = 'time_series_regression' + # get the base search space given this + # dataset properties. Then overwrite with custom + # regression requirements + cs = self._get_base_search_space( + cs=cs, dataset_properties=dataset_properties, + exclude=exclude, include=include, pipeline=self.steps) + + # Here we add custom code, like this with this + # is not a valid configuration + + self.configuration_space = cs + self.dataset_properties = dataset_properties + return cs + + def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> List[Tuple[str, autoPyTorchChoice]]: + """ + Defines what steps a pipeline should follow. + The step itself has choices given via autoPyTorchChoice. + + Returns: + List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised + by the pipeline. + """ + steps = [] # type: List[Tuple[str, autoPyTorchChoice]] + + default_dataset_properties = {'target_type': 'time_series_regression'} + if dataset_properties is not None: + default_dataset_properties.update(dataset_properties) + + steps.extend([ + ("scaler", ScalerChoice(default_dataset_properties)), + ("preprocessing", EarlyPreprocessing()), + ("time_series_transformer", TimeSeriesTransformer()), + ("network_backbone", NetworkBackboneChoice(default_dataset_properties)), + ("network_head", NetworkHeadChoice(default_dataset_properties)), + ("network", NetworkComponent()), + ("network_init", NetworkInitializerChoice(default_dataset_properties)), + ("optimizer", OptimizerChoice(default_dataset_properties)), + ("lr_scheduler", SchedulerChoice(default_dataset_properties)), + ("data_loader", TimeSeriesDataLoader()), + ("trainer", TrainerChoice(default_dataset_properties)), + ]) + return steps + + def _get_estimator_hyperparameter_name(self) -> str: + """ + Returns the name of the current estimator. 
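As a quick illustration (not part of the patch; the dataset_properties values below are placeholders for what the time series dataset normally provides), the pipeline can be built stand-alone to inspect the steps listed above and to sample a configuration from its search space.

from autoPyTorch.pipeline.time_series_regression import TimeSeriesRegressionPipeline

dataset_properties = {'task_type': 'time_series_regression',
                      'output_type': 'continuous',
                      'issparse': False,
                      'is_small_preprocess': True,
                      'numerical_features': [0],
                      'categorical_features': [],
                      'input_shape': (20, 1),
                      'output_shape': 1}

pipeline = TimeSeriesRegressionPipeline(dataset_properties=dataset_properties)
print([name for name, _ in pipeline.steps])  # scaler, preprocessing, ..., data_loader, trainer

config = pipeline.get_hyperparameter_search_space().sample_configuration()
pipeline.set_hyperparameters(config)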
+ + Returns: + str: name of the pipeline type + """ + return "time_series_regressor" diff --git a/autoPyTorch/utils/pipeline.py b/autoPyTorch/utils/pipeline.py index f2040fc84..10a01531c 100644 --- a/autoPyTorch/utils/pipeline.py +++ b/autoPyTorch/utils/pipeline.py @@ -15,6 +15,7 @@ from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline from autoPyTorch.pipeline.time_series_classification import TimeSeriesClassificationPipeline +from autoPyTorch.pipeline.time_series_regression import TimeSeriesRegressionPipeline from autoPyTorch.utils.common import FitRequirement from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -71,12 +72,19 @@ def _get_regression_dataset_requirements(info: Dict[str, Any], include: Dict[str exclude: Dict[str, List[str]]) -> List[FitRequirement]: task_type = STRING_TO_TASK_TYPES[info['task_type']] if task_type in TABULAR_TASKS: - fit_requirements = TabularRegressionPipeline( + return TabularRegressionPipeline( dataset_properties=info, include=include, exclude=exclude ).get_dataset_requirements() - return fit_requirements + + elif task_type in TIMESERIES_TASKS: + return TimeSeriesRegressionPipeline( + dataset_properties=info, + include=include, + exclude=exclude + ).get_dataset_requirements() + else: raise ValueError("Task_type not supported") @@ -137,13 +145,18 @@ def _get_regression_configuration_space(info: Dict[str, Any], include: Dict[str, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ) -> ConfigurationSpace: if STRING_TO_TASK_TYPES[info['task_type']] in TABULAR_TASKS: - configuration_space = TabularRegressionPipeline( - dataset_properties=info, - include=include, - exclude=exclude, - search_space_updates=search_space_updates - ).get_hyperparameter_search_space() - return configuration_space + pipeline = TabularRegressionPipeline(dataset_properties=info, + include=include, + exclude=exclude, + search_space_updates=search_space_updates) + return pipeline.get_hyperparameter_search_space() + + elif STRING_TO_TASK_TYPES[info['task_type']] in TIMESERIES_TASKS: + pipeline = TimeSeriesRegressionPipeline(dataset_properties=info, + include=include, exclude=exclude, + search_space_updates=search_space_updates) + return pipeline.get_hyperparameter_search_space() + else: raise ValueError("Task_type not supported") diff --git a/test/conftest.py b/test/conftest.py index 9e2f3f1c8..80fd90228 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -229,15 +229,16 @@ def get_tabular_data(task): def get_time_series_data(task): - sin_wave = np.sin(np.arange(30)) - cos_wave = np.cos(np.arange(30)) + length = 10 + sin_wave = np.sin(np.arange(length)) + cos_wave = np.cos(np.arange(length)) sin_waves = [] cos_waves = [] # create a dummy dataset with 100 sin and 100 cosine waves - for i in range(100): + for i in range(200): # add some random noise so not every sample is equal - sin_waves.append(sin_wave + np.random.randn(30) * 0.1) - cos_waves.append(cos_wave + np.random.randn(30) * 0.1) + sin_waves.append(sin_wave + np.random.randn(length) * 0.1) + cos_waves.append(cos_wave + np.random.randn(length) * 0.1) sin_waves = np.stack(sin_waves)[..., np.newaxis] cos_waves = np.stack(cos_waves)[..., np.newaxis] @@ -247,6 +248,15 @@ def get_time_series_data(task): validator = TimeSeriesInputValidator(is_classification=True).fit(X.copy(), y.copy()) + elif task == "regression_numerical_only": + X = 
np.concatenate([sin_waves, cos_waves]) + + # use the last value of the time series as dummy regression target + y = X[:, -1, 0] + X = X[:, :-1, :] + + validator = TimeSeriesInputValidator(is_classification=False).fit(X.copy(), y.copy()) + else: raise ValueError("Unsupported task {}".format(task)) @@ -314,6 +324,8 @@ def fit_dictionary_tabular_dummy(request, backend): def fit_dictionary_time_series_dummy(request, backend): if request.param == "classification": X, y, validator = get_time_series_data("classification_numerical_only") + elif request.param == "regression": + X, y, validator = get_time_series_data("regression_numerical_only") else: raise ValueError(f"Unsupported indirect fixture {request.param}") return get_time_series_fit_dictionary(X, y, validator, backend) diff --git a/test/test_pipeline/test_time_series_regression.py b/test/test_pipeline/test_time_series_regression.py new file mode 100644 index 000000000..4fd4a478a --- /dev/null +++ b/test/test_pipeline/test_time_series_regression.py @@ -0,0 +1,295 @@ +import os +import re + +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, +) + +import numpy as np + +import pytest + +import torch + +from autoPyTorch import metrics +from autoPyTorch.pipeline.components.setup.early_preprocessor.utils import get_preprocess_transforms +from autoPyTorch.pipeline.time_series_regression import TimeSeriesRegressionPipeline +from autoPyTorch.utils.common import FitRequirement +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates, \ + parse_hyperparameter_search_space_updates + + +@pytest.mark.parametrize("fit_dictionary_time_series", ['regression_numerical_only'], indirect=True) +class TestTimeSeriesRegression: + def _assert_pipeline_search_space(self, pipeline, search_space_updates): + config_space = pipeline.get_hyperparameter_search_space() + for update in search_space_updates.updates: + try: + assert update.node_name + ':' + update.hyperparameter in config_space + hyperparameter = config_space.get_hyperparameter(update.node_name + ':' + update.hyperparameter) + except AssertionError: + assert any(update.node_name + ':' + update.hyperparameter in name + for name in config_space.get_hyperparameter_names()), \ + "Can't find hyperparameter: {}".format(update.hyperparameter) + hyperparameter = config_space.get_hyperparameter(update.node_name + ':' + update.hyperparameter + '_1') + assert update.default_value == hyperparameter.default_value + if isinstance(hyperparameter, (UniformIntegerHyperparameter, UniformFloatHyperparameter)): + assert update.value_range[0] == hyperparameter.lower + assert update.value_range[1] == hyperparameter.upper + if hasattr(update, 'log'): + assert update.log == hyperparameter.log + elif isinstance(hyperparameter, CategoricalHyperparameter): + assert update.value_range == hyperparameter.choices + + def test_pipeline_fit(self, fit_dictionary_time_series): + """This test makes sure that the pipeline is able to fit + given random combinations of hyperparameters across the pipeline""" + + pipeline = TimeSeriesRegressionPipeline( + dataset_properties=fit_dictionary_time_series['dataset_properties']) + cs = pipeline.get_hyperparameter_search_space() + config = cs.sample_configuration() + pipeline.set_hyperparameters(config) + pipeline.fit(fit_dictionary_time_series) + + # To make sure we fitted the model, there should be a + # run summary object with accuracy + run_summary = 
pipeline.named_steps['trainer'].run_summary + assert run_summary is not None + + # Make sure that performance was properly captured + assert run_summary.performance_tracker['train_loss'][1] > 0 + assert run_summary.total_parameter_count > 0 + assert 'r2' in run_summary.performance_tracker['train_metrics'][1] + + # Make sure a network was fit + assert isinstance(pipeline.named_steps['network'].get_network(), torch.nn.Module) + + @pytest.mark.parametrize("fit_dictionary_time_series_dummy", ["regression"], indirect=True) + def test_pipeline_score(self, fit_dictionary_time_series_dummy, fit_dictionary_time_series): + """This test makes sure that the pipeline is able to achieve a decent score on dummy data + given the default configuration""" + X = fit_dictionary_time_series_dummy['X_train'].copy() + y = fit_dictionary_time_series_dummy['y_train'].copy() + pipeline = TimeSeriesRegressionPipeline( + dataset_properties=fit_dictionary_time_series_dummy['dataset_properties']) + + cs = pipeline.get_hyperparameter_search_space() + config = cs.get_default_configuration() + pipeline.set_hyperparameters(config) + + # regression needs more iterations to converge + fit_dictionary_time_series_dummy["epochs"] = 1000 + pipeline.fit(fit_dictionary_time_series_dummy) + + # we expect the output to have the same batch size as the test input, + # and number of outputs per batch sample equal to the number of targets ("output_shape" in dataset_properties) + expected_output_shape = (X.shape[0], + fit_dictionary_time_series_dummy["dataset_properties"]["output_shape"]) + + prediction = pipeline.predict(X) + assert isinstance(prediction, np.ndarray) + assert prediction.shape == expected_output_shape + + # we should be able to get a decent score on this dummy data + r2_score = metrics.r2(y, prediction.squeeze()) + assert r2_score >= 0.5 + + def test_pipeline_predict(self, fit_dictionary_time_series): + """This test makes sure that the pipeline is able to predict + given a random configuration""" + X = fit_dictionary_time_series['X_train'].copy() + pipeline = TimeSeriesRegressionPipeline( + dataset_properties=fit_dictionary_time_series['dataset_properties']) + + cs = pipeline.get_hyperparameter_search_space() + config = cs.sample_configuration() + pipeline.set_hyperparameters(config) + + pipeline.fit(fit_dictionary_time_series) + + # we expect the output to have the same batch size as the test input, + # and number of outputs per batch sample equal to the number of outputs + expected_output_shape = (X.shape[0], fit_dictionary_time_series["dataset_properties"]["output_shape"]) + + prediction = pipeline.predict(X) + assert isinstance(prediction, np.ndarray) + assert prediction.shape == expected_output_shape + + def test_pipeline_transform(self, fit_dictionary_time_series): + """ + In the context of autopytorch, transform expands a fit dictionary with + components that where previously fit. We can use this as a nice way to make sure + that fit properly work. 
+ This code is added in light of components not properly added to the fit dicitonary + """ + + pipeline = TimeSeriesRegressionPipeline( + dataset_properties=fit_dictionary_time_series['dataset_properties']) + cs = pipeline.get_hyperparameter_search_space() + config = cs.sample_configuration() + pipeline.set_hyperparameters(config) + + # We do not want to make the same early preprocessing operation to the fit dictionary + pipeline.fit(fit_dictionary_time_series.copy()) + + transformed_fit_dictionary_time_series = pipeline.transform(fit_dictionary_time_series) + + # First, we do not lose anyone! (We use a fancy subset containment check) + assert fit_dictionary_time_series.items() <= transformed_fit_dictionary_time_series.items() + + # Then the pipeline should have added the following keys + expected_keys = {'scaler', 'time_series_transformer', + 'preprocess_transforms', 'network', 'optimizer', 'lr_scheduler', + 'train_data_loader', 'val_data_loader', 'run_summary'} + assert expected_keys.issubset(set(transformed_fit_dictionary_time_series.keys())) + + # Then we need to have transformations being created. + assert len(get_preprocess_transforms(transformed_fit_dictionary_time_series)) > 0 + + # We expect the transformations to be in the pipeline at anytime for inference + assert 'preprocess_transforms' in transformed_fit_dictionary_time_series.keys() + + @pytest.mark.parametrize("is_small_preprocess", [True, False]) + def test_default_configuration(self, fit_dictionary_time_series, is_small_preprocess): + """Makes sure that when no config is set, we can trust the + default configuration from the space""" + + fit_dictionary_time_series['is_small_preprocess'] = is_small_preprocess + + pipeline = TimeSeriesRegressionPipeline( + dataset_properties=fit_dictionary_time_series['dataset_properties']) + + pipeline.fit(fit_dictionary_time_series) + + def test_remove_key_check_requirements(self, fit_dictionary_time_series): + """Makes sure that when a key is removed from X, correct error is outputted""" + pipeline = TimeSeriesRegressionPipeline( + dataset_properties=fit_dictionary_time_series['dataset_properties']) + for key in ['num_run', 'device', 'split_id', 'use_pynisher', 'torch_num_threads', 'dataset_properties']: + fit_dictionary_time_series_copy = fit_dictionary_time_series.copy() + fit_dictionary_time_series_copy.pop(key) + with pytest.raises(ValueError, match=r"To fit .+?, expected fit dictionary to have"): + pipeline.fit(fit_dictionary_time_series_copy) + + def test_network_optimizer_lr_handshake(self, fit_dictionary_time_series): + """Fitting a network should put the network in the X""" + # Create the pipeline to check. 
A random config should be sufficient + pipeline = TimeSeriesRegressionPipeline( + dataset_properties=fit_dictionary_time_series['dataset_properties']) + cs = pipeline.get_hyperparameter_search_space() + config = cs.sample_configuration() + pipeline.set_hyperparameters(config) + + # Make sure that fitting a network adds a "network" to X + assert 'network' in pipeline.named_steps.keys() + fit_dictionary_time_series['network_backbone'] = torch.nn.Linear(3, 4) + fit_dictionary_time_series['network_head'] = torch.nn.Linear(4, 1) + X = pipeline.named_steps['network'].fit( + fit_dictionary_time_series, + None + ).transform(fit_dictionary_time_series) + assert 'network' in X + + # Then fitting a optimizer should fail if no network: + assert 'optimizer' in pipeline.named_steps.keys() + with pytest.raises( + ValueError, + match=r"To fit .+?, expected fit dictionary to have 'network' but got .*" + ): + pipeline.named_steps['optimizer'].fit({'dataset_properties': {}}, None) + + # No error when network is passed + X = pipeline.named_steps['optimizer'].fit(X, None).transform(X) + assert 'optimizer' in X + + # Then fitting a optimizer should fail if no network: + assert 'lr_scheduler' in pipeline.named_steps.keys() + with pytest.raises( + ValueError, + match=r"To fit .+?, expected fit dictionary to have 'optimizer' but got .*" + ): + pipeline.named_steps['lr_scheduler'].fit({'dataset_properties': {}}, None) + + # No error when network is passed + X = pipeline.named_steps['lr_scheduler'].fit(X, None).transform(X) + assert 'optimizer' in X + + def test_get_fit_requirements(self, fit_dictionary_time_series): + dataset_properties = {'numerical_features': [0], 'categorical_features': [], + 'task_type': 'time_series_classification'} + pipeline = TimeSeriesRegressionPipeline(dataset_properties=dataset_properties) + fit_requirements = pipeline.get_fit_requirements() + + # check if fit requirements is a list of FitRequirement named tuples + assert isinstance(fit_requirements, list) + for requirement in fit_requirements: + assert isinstance(requirement, FitRequirement) + + def test_apply_search_space_updates(self, fit_dictionary_time_series, search_space_updates): + dataset_properties = {'numerical_features': [0], 'categorical_features': [], + 'task_type': 'time_series_classification'} + pipeline = TimeSeriesRegressionPipeline(dataset_properties=dataset_properties, + search_space_updates=search_space_updates) + self._assert_pipeline_search_space(pipeline, search_space_updates) + + def test_read_and_update_search_space(self, fit_dictionary_time_series, search_space_updates): + import tempfile + path = tempfile.gettempdir() + path = os.path.join(path, 'updates.txt') + # Write to disk + search_space_updates.save_as_file(path=path) + assert os.path.exists(path=path) + + # Read from disk + file_search_space_updates = parse_hyperparameter_search_space_updates(updates_file=path) + assert isinstance(file_search_space_updates, HyperparameterSearchSpaceUpdates) + dataset_properties = {'numerical_features': [1], 'categorical_features': [2], + 'task_type': 'time_series_classification'} + pipeline = TimeSeriesRegressionPipeline(dataset_properties=dataset_properties, + search_space_updates=file_search_space_updates) + assert file_search_space_updates == pipeline.search_space_updates + + def test_error_search_space_updates(self, fit_dictionary_time_series, error_search_space_updates): + dataset_properties = {'numerical_features': [1], 'categorical_features': [2], + 'task_type': 'time_series_classification'} + try: + _ = 
TimeSeriesRegressionPipeline(dataset_properties=dataset_properties, + search_space_updates=error_search_space_updates) + except Exception as e: + assert isinstance(e, ValueError) + assert re.match(r'Unknown hyperparameter for component .*?\. Expected update ' + r'hyperparameter to be in \[.*?\] got .+', e.args[0]) + + def test_set_range_search_space_updates(self, fit_dictionary_time_series): + dataset_properties = {'numerical_features': [1], 'categorical_features': [2], + 'task_type': 'time_series_classification'} + config_dict = TimeSeriesRegressionPipeline(dataset_properties=dataset_properties). \ + get_hyperparameter_search_space()._hyperparameters + updates = HyperparameterSearchSpaceUpdates() + for i, (name, hyperparameter) in enumerate(config_dict.items()): + if '__choice__' in name: + continue + name = name.split(':') + hyperparameter_name = ':'.join(name[1:]) + if "network" in name[0]: + continue + if isinstance(hyperparameter, CategoricalHyperparameter): + value_range = (hyperparameter.choices[0],) + default_value = hyperparameter.choices[0] + else: + value_range = (0, 1) + default_value = 1 + updates.append(node_name=name[0], hyperparameter=hyperparameter_name, + value_range=value_range, default_value=default_value) + pipeline = TimeSeriesRegressionPipeline(dataset_properties=dataset_properties, + search_space_updates=updates) + + try: + self._assert_pipeline_search_space(pipeline, updates) + except AssertionError as e: + # As we are setting num_layers to 1 for fully connected + # head, units_layer does not exist in the configspace + assert 'fully_connected:units_layer' in e.args[0] From ca815258a8e4c9dc14dec29b40235adec41013f3 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Sat, 27 Feb 2021 14:55:28 +0100 Subject: [PATCH 017/347] add time series regression example --- examples/example_time_series_regression.py | 110 +++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 examples/example_time_series_regression.py diff --git a/examples/example_time_series_regression.py b/examples/example_time_series_regression.py new file mode 100644 index 000000000..87ae08c1c --- /dev/null +++ b/examples/example_time_series_regression.py @@ -0,0 +1,110 @@ +""" +====================== +Time Series Regression +====================== + +The following example shows how to fit a sample classification model +with AutoPyTorch +""" +import os +import tempfile as tmp +import warnings + +os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + +import numpy as np + +import sklearn.model_selection + +from sktime.datasets import load_gunpoint + +from autoPyTorch.api.time_series_regression import TimeSeriesRegressionTask +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + + +def get_search_space_updates(): + """ + Search space updates to the task can be added using HyperparameterSearchSpaceUpdates + Returns: + HyperparameterSearchSpaceUpdates + """ + updates = HyperparameterSearchSpaceUpdates() + updates.append(node_name="data_loader", + hyperparameter="batch_size", + value_range=[16, 512], + default_value=32) + updates.append(node_name="lr_scheduler", + hyperparameter="CosineAnnealingLR:T_max", + value_range=[50, 60], + default_value=55) + updates.append(node_name='optimizer', + 
hyperparameter='AdamOptimizer:lr', + value_range=[0.0001, 0.001], + default_value=0.0005) + return updates + + +if __name__ == '__main__': + ############################################################################ + # Data Loading + # ============ + + # Create a dummy dataset consisting of sine and cosine waves + length = 10 + sin_wave = np.sin(np.arange(length)) + cos_wave = np.cos(np.arange(length)) + sin_waves = [] + cos_waves = [] + # create a dummy dataset with 100 sin and 100 cosine waves + for i in range(1000): + # add some random noise so not every sample is equal + sin_waves.append(sin_wave + np.random.randn(length) * 0.01) + cos_waves.append(cos_wave + np.random.randn(length) * 0.01) + sin_waves = np.stack(sin_waves)[..., np.newaxis] + cos_waves = np.stack(cos_waves)[..., np.newaxis] + + X = np.concatenate([sin_waves, cos_waves]) + + # use the last value of the time series as dummy regression target + y = X[:, -1, 0] + X = X[:, :-1, :] + + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=1 + ) + + ############################################################################ + # Build and fit a regressor + # ========================== + api = TimeSeriesRegressionTask( + delete_tmp_folder_after_terminate=False, + search_space_updates=get_search_space_updates(), + include_components={"network_backbone": ["InceptionTimeBackbone"]} + ) + api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='r2', + budget_type="epochs", + budget=1000, + total_walltime_limit=500, + func_eval_time_limit=50 + ) + + ############################################################################ + # Print the final ensemble performance + # ==================================== + print(api.run_history, api.trajectory) + y_pred = api.predict(X_test) + score = api.score(y_pred, y_test) + print(score) From 01b2a91c0cfb2973359ff88a1f222695fd3dda3f Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Sat, 27 Feb 2021 15:37:32 +0100 Subject: [PATCH 018/347] update example --- examples/example_time_series_regression.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/example_time_series_regression.py b/examples/example_time_series_regression.py index 87ae08c1c..e7ff75151 100644 --- a/examples/example_time_series_regression.py +++ b/examples/example_time_series_regression.py @@ -22,8 +22,6 @@ import sklearn.model_selection -from sktime.datasets import load_gunpoint - from autoPyTorch.api.time_series_regression import TimeSeriesRegressionTask from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -95,8 +93,6 @@ def get_search_space_updates(): X_test=X_test.copy(), y_test=y_test.copy(), optimize_metric='r2', - budget_type="epochs", - budget=1000, total_walltime_limit=500, func_eval_time_limit=50 ) From aeccf0839e59c7bc9942e16505935f9a62196fa4 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Sat, 27 Feb 2021 17:43:03 +0100 Subject: [PATCH 019/347] fix flake and mypy --- autoPyTorch/pipeline/time_series_regression.py | 5 ++--- test/test_pipeline/test_time_series_regression.py | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/autoPyTorch/pipeline/time_series_regression.py b/autoPyTorch/pipeline/time_series_regression.py index 04b063224..16ea0eb07 100644 --- a/autoPyTorch/pipeline/time_series_regression.py +++ b/autoPyTorch/pipeline/time_series_regression.py @@ -10,10 +10,10 @@ from autoPyTorch.constants import 
STRING_TO_TASK_TYPES from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ - ScalerChoice from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ TimeSeriesTransformer +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ + ScalerChoice from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent @@ -23,7 +23,6 @@ NetworkInitializerChoice ) from autoPyTorch.pipeline.components.setup.optimizer.base_optimizer_choice import OptimizerChoice -from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader from autoPyTorch.pipeline.components.training.data_loader.time_series_data_loader import TimeSeriesDataLoader from autoPyTorch.pipeline.components.training.trainer.base_trainer_choice import ( TrainerChoice diff --git a/test/test_pipeline/test_time_series_regression.py b/test/test_pipeline/test_time_series_regression.py index 4fd4a478a..e867d8932 100644 --- a/test/test_pipeline/test_time_series_regression.py +++ b/test/test_pipeline/test_time_series_regression.py @@ -232,7 +232,7 @@ def test_apply_search_space_updates(self, fit_dictionary_time_series, search_spa dataset_properties = {'numerical_features': [0], 'categorical_features': [], 'task_type': 'time_series_classification'} pipeline = TimeSeriesRegressionPipeline(dataset_properties=dataset_properties, - search_space_updates=search_space_updates) + search_space_updates=search_space_updates) self._assert_pipeline_search_space(pipeline, search_space_updates) def test_read_and_update_search_space(self, fit_dictionary_time_series, search_space_updates): @@ -249,7 +249,7 @@ def test_read_and_update_search_space(self, fit_dictionary_time_series, search_s dataset_properties = {'numerical_features': [1], 'categorical_features': [2], 'task_type': 'time_series_classification'} pipeline = TimeSeriesRegressionPipeline(dataset_properties=dataset_properties, - search_space_updates=file_search_space_updates) + search_space_updates=file_search_space_updates) assert file_search_space_updates == pipeline.search_space_updates def test_error_search_space_updates(self, fit_dictionary_time_series, error_search_space_updates): @@ -257,7 +257,7 @@ def test_error_search_space_updates(self, fit_dictionary_time_series, error_sear 'task_type': 'time_series_classification'} try: _ = TimeSeriesRegressionPipeline(dataset_properties=dataset_properties, - search_space_updates=error_search_space_updates) + search_space_updates=error_search_space_updates) except Exception as e: assert isinstance(e, ValueError) assert re.match(r'Unknown hyperparameter for component .*?\. 
Expected update ' @@ -285,7 +285,7 @@ def test_set_range_search_space_updates(self, fit_dictionary_time_series): updates.append(node_name=name[0], hyperparameter=hyperparameter_name, value_range=value_range, default_value=default_value) pipeline = TimeSeriesRegressionPipeline(dataset_properties=dataset_properties, - search_space_updates=updates) + search_space_updates=updates) try: self._assert_pipeline_search_space(pipeline, updates) From 3614ef689fec78503279cffcd5fa4f9da20d53a5 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Mon, 1 Mar 2021 10:13:15 +0100 Subject: [PATCH 020/347] fix time series scaler test --- test/test_pipeline/components/test_time_series_scalers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_pipeline/components/test_time_series_scalers.py b/test/test_pipeline/components/test_time_series_scalers.py index 91ea53f75..f051be329 100644 --- a/test/test_pipeline/components/test_time_series_scalers.py +++ b/test/test_pipeline/components/test_time_series_scalers.py @@ -150,4 +150,3 @@ def test_none_scaler(self): # check if the fit dictionary X is modified as expected self.assertIsInstance(X['scaler'], dict) self.assertIsNone(X['scaler']['categorical']) - self.assertIsNone(X['scaler']['numerical']) From db024349c9a9adc9d8fac3b6c9261263dbf8c946 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Mon, 1 Mar 2021 10:37:48 +0100 Subject: [PATCH 021/347] update examples and add pipeline examples for time series --- ...ple_time_series_classification_pipeline.py | 153 ++++++++++++++++++ examples/example_time_series_regression.py | 52 +++--- ...example_time_series_regression_pipeline.py | 149 +++++++++++++++++ 3 files changed, 326 insertions(+), 28 deletions(-) create mode 100644 examples/example_time_series_classification_pipeline.py create mode 100644 examples/example_time_series_regression_pipeline.py diff --git a/examples/example_time_series_classification_pipeline.py b/examples/example_time_series_classification_pipeline.py new file mode 100644 index 000000000..c674e1098 --- /dev/null +++ b/examples/example_time_series_classification_pipeline.py @@ -0,0 +1,153 @@ +""" +====================== +Example for the time series classification pipeline +--------------------------- + +This is a temporal example to make sure that ensemble works. +It also sets how SMAC should create the output information, +so that the ensemble builder works. 
+ +We will remove this file, once SMAC + ensemble builder work +====================== +""" +import typing + +import numpy as np +import sklearn +from sklearn.metrics import accuracy_score + +from autoPyTorch.data.time_series_validator import TimeSeriesInputValidator +from autoPyTorch.datasets.resampling_strategy import CrossValTypes +from autoPyTorch.datasets.time_series_dataset import TimeSeriesDataset +from autoPyTorch.pipeline.time_series_classification import TimeSeriesClassificationPipeline +from autoPyTorch.utils.backend import Backend, create +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates +from autoPyTorch.utils.pipeline import get_dataset_requirements + + +def get_data_to_train(backend: Backend): + """ + This function returns a fit dictionary that within itself, contains all + the information to fit a pipeline + """ + + from sktime.datasets import load_gunpoint + + data, labels = load_gunpoint(return_X_y=True) + + data = [data.iloc[i][0].values for i in range(len(data))] + labels = [int(labels.iloc[i]) for i in range(len(labels))] + + data = np.vstack(data) + X = data[..., np.newaxis] + y = np.array(labels) - 1 # minus one because labels are initially in {1, 2} + + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=1, + stratify=y + ) + + validator = TimeSeriesInputValidator(is_classification=True) + validator.fit(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test) + + # Create a datamanager for this toy problem + datamanager = TimeSeriesDataset( + X=X_train, Y=y_train, + X_test=X_test, Y_test=y_test, + validator=validator, + resampling_strategy=CrossValTypes.stratified_k_fold_cross_validation + ) + backend.save_datamanager(datamanager) + + info = {'task_type': datamanager.task_type, + 'numerical_features': datamanager.numerical_features, + 'categorical_features': datamanager.categorical_features, + 'output_type': datamanager.output_type, + 'issparse': datamanager.issparse} + + dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info)) + + # Fit the pipeline + fit_dictionary = { + 'X_train': X_train, + 'y_train': y_train, + 'train_indices': np.arange(X_train.shape[0]), + 'dataset_properties': dataset_properties, + # Training configuration + 'num_run': 5, + 'working_dir': './tmp/example_ensemble_1', # Hopefully generated by backend + 'device': 'cuda', + 'runtime': 50, + 'torch_num_threads': 1, + 'early_stopping': 20, + 'use_tensorboard_logger': True, + 'use_pynisher': False, + 'memory_limit': 4096, + 'metrics_during_training': True, + 'seed': 0, + 'budget_type': 'epochs', + 'epochs': 100.0, + 'split_id': 0, + 'backend': backend, + 'job_id': 1 + } + + return fit_dictionary, X_train, y_train, X_test, y_test + + +if __name__ == "__main__": + # Build a repository with random fitted models + backend = create(temporary_directory=None, output_directory=None, + delete_tmp_folder_after_terminate=False) + + # Create the directory structure + backend._make_internals_directory() + + updates = HyperparameterSearchSpaceUpdates() + updates.append(node_name="optimizer", + hyperparameter="AdamOptimizer:lr", + value_range=[0.0001, 0.001], + default_value=0.0005) + + # Get data to train + fit_dictionary, X_train, y_train, X_test, y_test = get_data_to_train(backend) + pipeline = TimeSeriesClassificationPipeline( + dataset_properties=fit_dictionary['dataset_properties'], + search_space_updates=updates, + include={ + 'network_backbone': ['InceptionTimeBackbone'] 
+ } + ) + + # Goal: Able to indicate a network type and train it successfully on dummy data + # Step1: Be able to select and MLP with desired hyperparameters + pipeline_cs = pipeline.get_hyperparameter_search_space() + print(pipeline_cs) + config = pipeline_cs.get_default_configuration() + pipeline.set_hyperparameters(config) + print(config) + + ## Step2: train it on dummy data + + ## Fit the pipeline + print("Fitting the pipeline...") + something = pipeline.fit(fit_dictionary) + + ## Showcase some components of the pipeline + # print(pipeline) + + from sktime.classification import compose + + tsf = compose.TimeSeriesForestClassifier() + tsf.fit(np.moveaxis(X_train, 1, 2), y_train) + tsf_predictions = tsf.predict(np.moveaxis(X_test, 1, 2)) + + ## Showcase performance of pipeline + # print(pipeline.named_steps['trainer'].run_summary.performance_tracker) + + predictions = pipeline.predict_proba(X_test) + predictions = np.array(predictions).argmax(axis=1) + print(f"accuracy={accuracy_score(y_test, predictions)}") + print(f"tsf accuracy={accuracy_score(y_test, tsf_predictions)}") diff --git a/examples/example_time_series_regression.py b/examples/example_time_series_regression.py index e7ff75151..075e8e438 100644 --- a/examples/example_time_series_regression.py +++ b/examples/example_time_series_regression.py @@ -20,7 +20,7 @@ import numpy as np -import sklearn.model_selection +from sktime.datasets import load_italy_power_demand from autoPyTorch.api.time_series_regression import TimeSeriesRegressionTask from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -35,7 +35,7 @@ def get_search_space_updates(): updates = HyperparameterSearchSpaceUpdates() updates.append(node_name="data_loader", hyperparameter="batch_size", - value_range=[16, 512], + value_range=[32, 64], default_value=32) updates.append(node_name="lr_scheduler", hyperparameter="CosineAnnealingLR:T_max", @@ -51,33 +51,27 @@ def get_search_space_updates(): if __name__ == '__main__': ############################################################################ # Data Loading + # (Mostly copied from + # https://github.com/sktime/sktime-dl/blob/master/examples/univariate_time_series_regression_and_forecasting.ipynb) # ============ - - # Create a dummy dataset consisting of sine and cosine waves - length = 10 - sin_wave = np.sin(np.arange(length)) - cos_wave = np.cos(np.arange(length)) - sin_waves = [] - cos_waves = [] - # create a dummy dataset with 100 sin and 100 cosine waves - for i in range(1000): - # add some random noise so not every sample is equal - sin_waves.append(sin_wave + np.random.randn(length) * 0.01) - cos_waves.append(cos_wave + np.random.randn(length) * 0.01) - sin_waves = np.stack(sin_waves)[..., np.newaxis] - cos_waves = np.stack(cos_waves)[..., np.newaxis] - - X = np.concatenate([sin_waves, cos_waves]) - - # use the last value of the time series as dummy regression target - y = X[:, -1, 0] - X = X[:, :-1, :] - - X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X, - y, - random_state=1 - ) + X_train_pd, _ = load_italy_power_demand(split='train', return_X_y=True) + X_test_pd, _ = load_italy_power_demand(split='test', return_X_y=True) + + # Create some regression values. + # Make the value y equal to the sum of the X values at time-steps 1 and 10. 
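+    # (concretely: for every sample i, y[i] = series_i[1] + series_i[10], so the target
+    # is a deterministic function of two values of the corresponding input series)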
+ X_train = np.zeros((len(X_train_pd), 24, 1), dtype=float) + y_train = np.zeros(len(X_train_pd), dtype=float) + for i in range(len(X_train_pd)): + y_train[i] = X_train_pd.iloc[i].iloc[0].iloc[1] + y_train[i] = y_train[i] + X_train_pd.iloc[i].iloc[0].iloc[10] + X_train[i] = X_train_pd.iloc[i].iloc[0][:, np.newaxis] + + X_test = np.zeros((len(X_test_pd), 24, 1), dtype=float) + y_test = np.zeros(len(X_test_pd)) + for i in range(len(X_test_pd)): + y_test[i] = X_test_pd.iloc[i].iloc[0].iloc[1] + y_test[i] = y_test[i] + X_test_pd.iloc[i].iloc[0].iloc[10] + X_test[i] = X_test_pd.iloc[i].iloc[0][:, np.newaxis] ############################################################################ # Build and fit a regressor @@ -93,6 +87,8 @@ def get_search_space_updates(): X_test=X_test.copy(), y_test=y_test.copy(), optimize_metric='r2', + budget_type="runtime", + budget=50, total_walltime_limit=500, func_eval_time_limit=50 ) diff --git a/examples/example_time_series_regression_pipeline.py b/examples/example_time_series_regression_pipeline.py new file mode 100644 index 000000000..276260bc0 --- /dev/null +++ b/examples/example_time_series_regression_pipeline.py @@ -0,0 +1,149 @@ +""" +====================== +Ensemble from random search +--------------------------- + +This is a temporal example to make sure that ensemble works. +It also sets how SMAC should create the output information, +so that the ensemble builder works. + +We will remove this file, once SMAC + ensemble builder work +====================== +""" + +import numpy as np +from sklearn.metrics import r2_score +from sklearn.model_selection import train_test_split +from sktime.datasets import load_italy_power_demand + +from autoPyTorch.data.time_series_validator import TimeSeriesInputValidator +from autoPyTorch.datasets.resampling_strategy import CrossValTypes +from autoPyTorch.datasets.time_series_dataset import TimeSeriesDataset +from autoPyTorch.pipeline.time_series_classification import TimeSeriesClassificationPipeline +from autoPyTorch.utils.backend import Backend, create +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates +from autoPyTorch.utils.pipeline import get_dataset_requirements + + +def get_data_to_train(backend: Backend): + """ + This function returns a fit dictionary that within itself, contains all + the information to fit a pipeline + """ + + X_train_pd, y_train = load_italy_power_demand(split='train', return_X_y=True) + X_test_pd, y_test = load_italy_power_demand(split='test', return_X_y=True) + + # Create some regression values. + # Make the value y equal to the sum of the X values at time-steps 1 and 10. 
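+    # (same toy target construction as in examples/example_time_series_regression.py:
+    # y[i] = series_i[1] + series_i[10])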
+ X_train = np.zeros((len(X_train_pd), 24, 1), dtype=float) + y_train = np.zeros(len(y_train), dtype=float) + for i in range(len(X_train_pd)): + y_train[i] = X_train_pd.iloc[i].iloc[0].iloc[1] + y_train[i] = y_train[i] + X_train_pd.iloc[i].iloc[0].iloc[10] + X_train[i] = X_train_pd.iloc[i].iloc[0][:, np.newaxis] + + X_test = np.zeros((len(X_test_pd), 24, 1), dtype=float) + y_test = np.zeros(len(y_test)) + for i in range(len(X_test_pd)): + y_test[i] = X_test_pd.iloc[i].iloc[0].iloc[1] + y_test[i] = y_test[i] + X_test_pd.iloc[i].iloc[0].iloc[10] + X_test[i] = X_test_pd.iloc[i].iloc[0][:, np.newaxis] + + validator = TimeSeriesInputValidator(is_classification=False) + validator.fit(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test) + + # Create a datamanager for this toy problem + datamanager = TimeSeriesDataset( + X=X_train, Y=y_train, + X_test=X_test, Y_test=y_test, + validator=validator + ) + backend.save_datamanager(datamanager) + + info = {'task_type': datamanager.task_type, + 'numerical_features': datamanager.numerical_features, + 'categorical_features': datamanager.categorical_features, + 'output_type': datamanager.output_type, + 'issparse': datamanager.issparse} + + dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info)) + + # Fit the pipeline + fit_dictionary = { + 'X_train': X_train, + 'y_train': y_train, + 'train_indices': np.arange(X_train.shape[0]), + 'dataset_properties': dataset_properties, + # Training configuration + 'num_run': 5, + 'working_dir': './tmp/example_ensemble_1', # Hopefully generated by backend + 'device': 'cpu', + 'runtime': 100, + 'torch_num_threads': 1, + 'early_stopping': 20, + 'use_tensorboard_logger': True, + 'use_pynisher': False, + 'memory_limit': 4096, + 'metrics_during_training': True, + 'seed': 0, + 'budget_type': 'epochs', + 'epochs': 100.0, + 'split_id': 0, + 'backend': backend, + 'job_id': 1 + } + + return fit_dictionary, X_train, y_train, X_test, y_test + + +if __name__ == "__main__": + # Build a repository with random fitted models + backend = create(temporary_directory=None, output_directory=None, + delete_tmp_folder_after_terminate=False) + + # Create the directory structure + backend._make_internals_directory() + + updates = HyperparameterSearchSpaceUpdates() + updates.append(node_name="optimizer", + hyperparameter="AdamOptimizer:lr", + value_range=[0.0001, 0.001], + default_value=0.0005) + + # Get data to train + fit_dictionary, X_train, y_train, X_test, y_test = get_data_to_train(backend) + pipeline = TimeSeriesClassificationPipeline( + dataset_properties=fit_dictionary['dataset_properties'], + search_space_updates=updates, + include={ + 'network_backbone': ['InceptionTimeBackbone'] + } + ) + + # Goal: Able to indicate a network type and train it successfully on dummy data + # Step1: Be able to select and MLP with desired hyperparameters + pipeline_cs = pipeline.get_hyperparameter_search_space() + print(pipeline_cs) + config = pipeline_cs.get_default_configuration() + pipeline.set_hyperparameters(config) + print(config) + + ## Step2: train it on dummy data + + ## Fit the pipeline + print("Fitting the pipeline...") + something = pipeline.fit(fit_dictionary) + + ## Showcase some components of the pipeline + # print(pipeline) + + from sktime.regression import compose + + tsf = compose.TimeSeriesForestRegressor() + tsf.fit(np.moveaxis(X_train, 1, 2), y_train) + tsf_predictions = tsf.predict(np.moveaxis(X_test, 1, 2)) + + predictions = pipeline.predict(X_test) + print(f"r2={r2_score(y_test, 
predictions)}") + print(f"tsf r2={r2_score(y_test, tsf_predictions)}") From a0059dbb7faf949e96980adcd9fdca158309ad3b Mon Sep 17 00:00:00 2001 From: chico Date: Mon, 1 Mar 2021 18:07:27 +0100 Subject: [PATCH 022/347] Match paper libraries-versions --- requirements.txt | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/requirements.txt b/requirements.txt index cf29fe78a..c3bbc1b23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,19 +1,21 @@ setuptools -Cython -netifaces -numpy -pandas -scipy -statsmodels -scikit-learn>=0.20.0 -imblearn -ConfigSpace -pynisher -hpbandster -fasteners -torch -torchvision -tensorboard_logger -openml -lightgbm -catboost +Cython==0.29.21 +netifaces==0.10.9 +numpy==1.19.5 +pandas==1.2.0 +scipy==1.6.0 +statsmodels==0.12.1 +scikit-learn==0.23.0 +imbalanced-learn==0.7.0 +imblearn==0.0 +ConfigSpace==0.4.17 +pynisher==0.6.3 +hpbandster==0.7.4 +fasteners==0.16 +torch==1.7.1 +torchvision==0.8.2 +tensorboard-logger==0.1.0 +openml==0.11.0 +lightgbm==3.1.1 +catboost==0.24.4 +pexpect==4.8.0 From 13a25f9fe09284b6472180f1ba6251bfc899b690 Mon Sep 17 00:00:00 2001 From: Sebastian Walter Date: Tue, 2 Mar 2021 09:14:31 +0100 Subject: [PATCH 023/347] added pipeline representation --- .../pipeline/time_series_classification.py | 34 +++++++++++++++++++ .../pipeline/time_series_regression.py | 34 +++++++++++++++++++ .../example_time_series_classification.py | 1 + ..._series_classification_sequential_mnist.py | 9 +++-- examples/example_time_series_regression.py | 1 + 5 files changed, 74 insertions(+), 5 deletions(-) diff --git a/autoPyTorch/pipeline/time_series_classification.py b/autoPyTorch/pipeline/time_series_classification.py index 9ace78f17..8a3cbbc04 100644 --- a/autoPyTorch/pipeline/time_series_classification.py +++ b/autoPyTorch/pipeline/time_series_classification.py @@ -10,6 +10,7 @@ from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ TimeSeriesTransformer from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ @@ -217,6 +218,39 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], ]) return steps + def get_pipeline_representation(self) -> Dict[str, str]: + """ + Returns a representation of the pipeline, so that it can be + consumed and formatted by the API. 
+ + It should be a representation that follows: + [{'PreProcessing': <>, 'Estimator': <>}] + + Returns: + Dict: contains the pipeline representation in a short format + """ + preprocessing = [] + estimator = [] + skip_steps = ['data_loader', 'trainer', 'lr_scheduler', 'optimizer', 'network_init', + 'preprocessing', 'time_series_transformer'] + for step_name, step_component in self.steps: + if step_name in skip_steps: + continue + properties = {} + if isinstance(step_component, autoPyTorchChoice) and step_component.choice is not None: + properties = step_component.choice.get_properties() + elif isinstance(step_component, autoPyTorchComponent): + properties = step_component.get_properties() + if 'shortname' in properties: + if 'network' in step_name: + estimator.append(properties['shortname']) + else: + preprocessing.append(properties['shortname']) + return { + 'Preprocessing': ','.join(preprocessing), + 'Estimator': ','.join(estimator), + } + def _get_estimator_hyperparameter_name(self) -> str: """ Returns the name of the current estimator. diff --git a/autoPyTorch/pipeline/time_series_regression.py b/autoPyTorch/pipeline/time_series_regression.py index 16ea0eb07..d18207256 100644 --- a/autoPyTorch/pipeline/time_series_regression.py +++ b/autoPyTorch/pipeline/time_series_regression.py @@ -10,6 +10,7 @@ from autoPyTorch.constants import STRING_TO_TASK_TYPES from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ TimeSeriesTransformer from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ @@ -166,6 +167,39 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L ]) return steps + def get_pipeline_representation(self) -> Dict[str, str]: + """ + Returns a representation of the pipeline, so that it can be + consumed and formatted by the API. + + It should be a representation that follows: + [{'PreProcessing': <>, 'Estimator': <>}] + + Returns: + Dict: contains the pipeline representation in a short format + """ + preprocessing = [] + estimator = [] + skip_steps = ['data_loader', 'trainer', 'lr_scheduler', 'optimizer', 'network_init', + 'preprocessing', 'time_series_transformer'] + for step_name, step_component in self.steps: + if step_name in skip_steps: + continue + properties = {} + if isinstance(step_component, autoPyTorchChoice) and step_component.choice is not None: + properties = step_component.choice.get_properties() + elif isinstance(step_component, autoPyTorchComponent): + properties = step_component.get_properties() + if 'shortname' in properties: + if 'network' in step_name: + estimator.append(properties['shortname']) + else: + preprocessing.append(properties['shortname']) + return { + 'Preprocessing': ','.join(preprocessing), + 'Estimator': ','.join(estimator), + } + def _get_estimator_hyperparameter_name(self) -> str: """ Returns the name of the current estimator. 
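As a rough illustration of the representation added above (the shortnames shown here are
only placeholders; the actual strings depend on which components end up being selected),
the dictionary returned by get_pipeline_representation() has the form

    {'Preprocessing': 'StandardScaler', 'Estimator': 'InceptionTimeBackbone,FullyConnectedHead'}

where the second field joins the shortnames of the network-related steps.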
diff --git a/examples/example_time_series_classification.py b/examples/example_time_series_classification.py index 41d38d4bc..1294c7ab8 100644 --- a/examples/example_time_series_classification.py +++ b/examples/example_time_series_classification.py @@ -100,3 +100,4 @@ def get_search_space_updates(): y_pred = api.predict(X_test) score = api.score(y_pred, y_test) print(score) + print(api.show_models()) diff --git a/examples/example_time_series_classification_sequential_mnist.py b/examples/example_time_series_classification_sequential_mnist.py index 7b913de71..f03bb425f 100644 --- a/examples/example_time_series_classification_sequential_mnist.py +++ b/examples/example_time_series_classification_sequential_mnist.py @@ -78,7 +78,7 @@ def get_search_space_updates(): # Build and fit a classifier # ========================== api = TimeSeriesClassificationTask( - n_jobs=4, + n_jobs=6, delete_tmp_folder_after_terminate=False, search_space_updates=get_search_space_updates(), exclude_components={"network_backbone": ["LSTMBackbone"]} @@ -89,11 +89,9 @@ def get_search_space_updates(): y_train=y_train, X_test=X_test.copy(), y_test=y_test.copy(), - budget_type="epochs", - budget=5, optimize_metric='accuracy', - total_walltime_limit=3600, - func_eval_time_limit=3600 + total_walltime_limit=1200, + func_eval_time_limit=1200 ) ############################################################################ @@ -103,3 +101,4 @@ def get_search_space_updates(): y_pred = api.predict(X_test) score = api.score(y_pred, y_test) print(score) + print(api.show_models()) diff --git a/examples/example_time_series_regression.py b/examples/example_time_series_regression.py index 075e8e438..be77209a8 100644 --- a/examples/example_time_series_regression.py +++ b/examples/example_time_series_regression.py @@ -100,3 +100,4 @@ def get_search_space_updates(): y_pred = api.predict(X_test) score = api.score(y_pred, y_test) print(score) + print(api.show_models()) From abfc2f27d7b43984036fc1def1f2278ec21fd0b3 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 12 Mar 2021 20:21:37 +0100 Subject: [PATCH 024/347] add time series forecasting pipeline --- ...ime_series_forecasting_target_validator.py | 0 .../data/time_series_forecasting_validator.py | 54 ++++ .../time_series_forecasting_data_loader.py | 266 ++++++++++++++++++ .../pipeline/time_series_forecasting.py | 228 +++++++++++++++ test.py | 0 5 files changed, 548 insertions(+) create mode 100644 autoPyTorch/data/time_series_forecasting_target_validator.py create mode 100644 autoPyTorch/data/time_series_forecasting_validator.py create mode 100644 autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py create mode 100644 autoPyTorch/pipeline/time_series_forecasting.py create mode 100644 test.py diff --git a/autoPyTorch/data/time_series_forecasting_target_validator.py b/autoPyTorch/data/time_series_forecasting_target_validator.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py new file mode 100644 index 000000000..1f1330a92 --- /dev/null +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -0,0 +1,54 @@ +from autoPyTorch.data.time_series_validator import TimeSeriesInputValidator + +# -*- encoding: utf-8 -*- +import logging +import typing + +from autoPyTorch.data.base_validator import BaseInputValidator +from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator +from 
autoPyTorch.data.time_series_forecasting_target_validator import TimeSeriesForecastingTargetValidator +from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger + + +class TimeSeriesForecastingInputValidator(TimeSeriesInputValidator): + """ + Makes sure the input data complies with Auto-PyTorch requirements. + + This class also perform checks for data integrity and flags the user + via informative errors. + + Attributes: + is_classification (bool): + For classification task, this flag indicates that the target data + should be encoded + feature_validator (FeatureValidator): + A FeatureValidator instance used to validate and encode feature columns to match + sklearn expectations on the data + target_validator (TargetValidator): + A TargetValidator instance used to validate and encode (in case of classification) + the target values + """ + + def __init__( + self, + is_classification: bool = False, + logger_port: typing.Optional[int] = None, + ) -> None: + super().__init__(is_classification=is_classification, logger_port=logger_port) + self.is_classification = is_classification + self.logger_port = logger_port + if self.logger_port is not None: + self.logger: typing.Union[logging.Logger, PicklableClientLogger] = get_named_client_logger( + name='Validation', + port=self.logger_port, + ) + else: + self.logger = logging.getLogger('Validation') + + self.feature_validator = TimeSeriesFeatureValidator(logger=self.logger) + self.target_validator = TimeSeriesForecastingTargetValidator( + is_classification=self.is_classification, + logger=self.logger + ) + + self._is_fitted = False \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py new file mode 100644 index 000000000..d27979212 --- /dev/null +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -0,0 +1,266 @@ +from typing import Any, Dict, Optional, Tuple + +from autoPyTorch.pipeline.components.training.data_loader.time_series_data_loader import TimeSeriesDataLoader + + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + UniformIntegerHyperparameter, Constant +) + +import numpy as np + +import torch + +import torchvision + +import warnings + + +from autoPyTorch.datasets.base_dataset import BaseDataset +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset +from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent +from autoPyTorch.utils.backend import Backend +from autoPyTorch.utils.common import FitRequirement, custom_collate_fn + + +class TimeSeriesForecastingDataLoader(TimeSeriesDataLoader): + """This class is an interface to the PyTorch Dataloader. 
+ + It gives the possibility to read various types of mapped + datasets as described in: + https://pytorch.org/docs/stable/data.html + + """ + + def __init__(self, + batch_size: int = 64, + sequence_length: int = 1, + upper_sequence_length: int = np.iinfo(np.int32).max, + n_prediction_steps: int = 1) -> None: + super().__init__(batch_size=batch_size) + self.sequence_length: int = sequence_length + self.upper_seuqnce_length = upper_sequence_length + self.n_prediction_steps = n_prediction_steps + + def transform(self, X: np.ndarray) -> np.ndarray: + """The transform function calls the transform function of the + underlying model and returns the transformed array. + + Args: + X (np.ndarray): input features + + Returns: + np.ndarray: Transformed features + """ + X.update({'train_data_loader': self.train_data_loader, + 'val_data_loader': self.val_data_loader}) + return X + + def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: + """ + Fits a component by using an input dictionary with pre-requisites + + Args: + X (X: Dict[str, Any]): Dependencies needed by current component to perform fit + y (Any): not used. To comply with sklearn API + + Returns: + A instance of self + """ + + # Make sure there is an optimizer + self.check_requirements(X, y) + + # Incorporate the transform to the dataset + datamanager = X['backend'].load_datamanager() + datamanager = self._update_dataset(datamanager) + self.train_transform = self.build_transform(X, mode='train') + self.val_transform = self.build_transform(X, mode='val') + self.test_transform = self.build_transform(X, mode='test') + datamanager.update_transform( + self.train_transform, + train=True, + ) + datamanager.update_transform( + self.val_transform, + train=False, + ) + import pdb + pdb.set_trace() + if X['dataset_properties']["is_small_preprocess"]: + # This parameter indicates that the data has been pre-processed for speed + # Overwrite the datamanager with the pre-processes data + datamanager.replace_data(X['X_train'], X['X_test'] if 'X_test' in X else None) + train_dataset, val_dataset = datamanager.get_dataset_for_training(split_id=X['split_id']) + + self.train_data_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size=min(self.batch_size, len(train_dataset)), + shuffle=True, + num_workers=X.get('num_workers', 0), + pin_memory=X.get('pin_memory', True), + drop_last=X.get('drop_last', True), + collate_fn=custom_collate_fn, + ) + + self.val_data_loader = torch.utils.data.DataLoader( + val_dataset, + batch_size=min(self.batch_size, len(val_dataset)), + shuffle=False, + num_workers=X.get('num_workers', 0), + pin_memory=X.get('pin_memory', True), + drop_last=X.get('drop_last', False), + collate_fn=custom_collate_fn, + ) + + return self + + def _update_dataset(self, datamanager: TimeSeriesForecastingDataset): + """ + update the dataset to build time sequence + """ + X_train, y_train = datamanager.train_tensors + val_tensors = datamanager.val_tensors + test_tensors = datamanager.test_tensors + + time_series_length, population_size, num_features = X_train.shape + num_datapoints = time_series_length - self.sequence_length - self.n_prediction_steps + 1 + num_targets = y_train.shape[-1] + + y_train = y_train[-num_datapoints:, :] + if test_tensors is not None: + X_test, y_test = test_tensors + + if val_tensors is not None: + X_val, y_val = val_tensors + + X_test = np.concatenate([X_val[-self.sequence_length + 1:], X_test]) + X_val = np.concatenate([X_train[-self.sequence_length+1:], X_val]) + val_tensors = 
self._ser2seq(X_val, y_val, num_datapoints, num_features, num_targets) + datamanager.val_tensors = val_tensors + + X_test = np.concatenate([X_train[-self.sequence_length + 1:], X_test]) + self.X_val_tail = X_test[-self.sequence_length + 1:] + + test_tensors = self._ser2seq(X_test, y_test, num_datapoints, num_features, num_targets) + datamanager.test_tensors = test_tensors + + elif val_tensors is not None: + X_val, y_val = val_tensors + X_val = np.concatenate([X_train[-self.sequence_length+1:], X_val]) + + # used for prediction + self.X_val_tail = X_val[-self.sequence_length+1:] + val_tensors = self._ser2seq(X_val, y_val, num_datapoints, num_features, num_targets) + datamanager.val_tensors = val_tensors + else: + self.X_val_tail = X_train[-self.sequence_length+1:] + train_tensors = self._ser2seq(X_train, y_train, num_datapoints, num_features, num_targets) + datamanager.train_tensors = train_tensors + datamanager.splits = datamanager.get_splits_from_resampling_strategy() + return datamanager + + def _ser2seq(self, X_in, y_in, num_datapoints, num_features, num_targets): + """ + build a sliding window transformer for the given data + Args: + X_in (np.ndarray): input feature array to be transformed with shape + [time_series_length, population_size, num_features] + y_in (np.ndarray): input target array with shape [time_series_length, population_size, num_targets] + num_datapoints: number of actual data points stored in the dataset + num_features: number of features + num_targets: number of targets + Returns: + X_in_trans: transformed input featuer array with shpae + [num_datapoints * population_size, sequence_length, num_features] + y_in_trans: transformed input target array with shape + [num_datapoints * population_size, num_targets] + """ + X_in = np.concatenate([np.roll(X_in, shift=i, axis=0) for i in range(0, -self.sequence_length, -1)], + axis=2, + dtype=np.float32)[:num_datapoints] + X_in = X_in.reshape((-1, self.sequence_length, num_features)) + y_in = y_in.reshape((-1, num_targets)) + return X_in, y_in + + def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: int = np.inf, + ) -> torch.utils.data.DataLoader: + """ + Creates a data loader object from the provided data, + applying the transformations meant to validation objects + """ + if X.ndim == 3: + X = np.swapaxes(X, 0, 1) + X = np.concatenate([self.X_val_tail, X]) + time_series_length, population_size, num_features = X.shape + X = X.reshape((-1, self.sequence_length, num_features)) + + dataset = BaseDataset( + train_tensors=(X, y), + # This dataset is used for loading test data in a batched format + train_transforms=self.test_transform, + val_transforms=self.test_transform, + ) + return torch.utils.data.DataLoader( + dataset, + batch_size=min(batch_size, len(dataset)), + shuffle=False, + collate_fn=custom_collate_fn, + ) + + def get_train_data_loader(self) -> torch.utils.data.DataLoader: + """Returns a data loader object for the train data + + Returns: + torch.utils.data.DataLoader: A train data loader + """ + assert self.train_data_loader is not None, "No train data loader fitted" + return self.train_data_loader + + def get_val_data_loader(self) -> torch.utils.data.DataLoader: + """Returns a data loader object for the validation data + + Returns: + torch.utils.data.DataLoader: A validation data loader + """ + assert self.val_data_loader is not None, "No val data loader fitted" + return self.val_data_loader + + def get_test_data_loader(self) -> torch.utils.data.DataLoader: + """Returns a data loader object 
for the test data + + Returns: + torch.utils.data.DataLoader: A validation data loader + """ + return self.test_data_loader + + @staticmethod + def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, + batch_size: Tuple[Tuple, int] = ((32, 320), 64), + sequence_length: Tuple[Tuple, int] = ((1, 20), 1) + ) -> ConfigurationSpace: + batch_size = UniformIntegerHyperparameter( + "batch_size", batch_size[0][0], batch_size[0][1], default_value=batch_size[1]) + if "upper_sequence_length" not in dataset_properties: + warnings.warn('max_sequence_length is not given in dataset property , might exists the risk of selecting ' + 'length that is greater than the maximal allowed length of the dataset') + upper_sequence_length = min(np.iinfo(np.int32).max, sequence_length[0][1]) + else: + upper_sequence_length = min(dataset_properties["upper_sequence_length"], sequence_length[0][1]) + if sequence_length[0][0] >= upper_sequence_length: + warnings.warn("the lower bound of sequence length is greater than the upper bound") + sequence_length = Constant("sequence_length", upper_sequence_length) + else: + sequence_length = UniformIntegerHyperparameter("sequence_length", + lower=sequence_length[0][0], + upper=upper_sequence_length, + default_value=sequence_length[1]) + cs = ConfigurationSpace() + cs.add_hyperparameters([batch_size, sequence_length]) + return cs + + def __str__(self) -> str: + """ Allow a nice understanding of what components where used """ + string = self.train_data_loader.__class__.__name__ + return string diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py new file mode 100644 index 000000000..71bd031c6 --- /dev/null +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -0,0 +1,228 @@ +import warnings +from typing import Any, Dict, List, Optional, Tuple + +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +from ConfigSpace.hyperparameters import UniformIntegerHyperparameter + +import numpy as np + +from sklearn.base import RegressorMixin +from sklearn.pipeline import Pipeline + +from autoPyTorch.constants import STRING_TO_TASK_TYPES +from autoPyTorch.pipeline.base_pipeline import BasePipeline +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ + TimeSeriesTransformer +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ + ScalerChoice +from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing +from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice import SchedulerChoice +from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent +from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone_choice import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice import NetworkHeadChoice +from autoPyTorch.pipeline.components.setup.network_initializer.base_network_init_choice import ( + NetworkInitializerChoice +) +from autoPyTorch.pipeline.components.setup.optimizer.base_optimizer_choice import OptimizerChoice +from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import ( + TimeSeriesForecastingDataLoader +) + 
+from autoPyTorch.pipeline.components.training.trainer.base_trainer_choice import (
+    TrainerChoice
+)
+from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
+from autoPyTorch.utils.common import subsampler
+
+
+class TimeSeriesForecastingPipeline(RegressorMixin, BasePipeline):
+    """This class is a proof of concept to integrate AutoPyTorch Components
+
+    It implements a pipeline, which includes as steps:
+
+        ->One preprocessing step
+        ->One neural network
+
+    Contrary to the sklearn API it is not possible to enumerate the
+    possible parameters in the __init__ function because we only know the
+    available regressors at runtime. For this reason the user must
+    specify the parameters by passing an instance of
+    ConfigSpace.configuration_space.Configuration.
+
+
+    Args:
+        config (Configuration)
+            The configuration to evaluate.
+        random_state (Optional[np.random.RandomState]): random_state is the random number generator
+
+    Attributes:
+    Examples
+    """
+
+    def __init__(self,
+                 config: Optional[Configuration] = None,
+                 steps: Optional[List[Tuple[str, autoPyTorchChoice]]] = None,
+                 dataset_properties: Optional[Dict[str, Any]] = None,
+                 include: Optional[Dict[str, Any]] = None,
+                 exclude: Optional[Dict[str, Any]] = None,
+                 random_state: Optional[np.random.RandomState] = None,
+                 init_params: Optional[Dict[str, Any]] = None,
+                 search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None,
+                 n_prediction_steps: int = 1,
+                 ):
+        # TODO consider multi steps prediction
+        self.n_prediction_steps = n_prediction_steps
+        if 'upper_sequence_length' not in dataset_properties:
+            warnings.warn('max_sequence_length is not given in dataset property , might exists the risk of selecting '
+                          'length that is greater than the maximal allowed length of the dataset')
+            self.upper_sequence_length = np.iinfo(np.int32).max
+        else:
+            self.upper_sequence_length = dataset_properties['upper_sequence_length']
+
+        super().__init__(
+            config, steps, dataset_properties, include, exclude,
+            random_state, init_params, search_space_updates)
+
+    def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray:
+        """Scores the fitted estimator on (X, y)
+
+        Args:
+            X (np.ndarray): input to the pipeline, from which to guess targets
+            batch_size (Optional[int]): batch_size controls whether the pipeline
+                will be called on small chunks of the data. Useful when calling the
+                predict method on the whole array X results in a MemoryError.
+        Returns:
+            np.ndarray: coefficient of determination R^2 of the prediction
+        """
+        from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics, calculate_score
+        metrics = get_metrics(self.dataset_properties, ['r2'])
+        y_pred = self.predict(X, batch_size=batch_size)
+        r2 = calculate_score(y, y_pred, task_type=STRING_TO_TASK_TYPES[self.dataset_properties['task_type']],
+                             metrics=metrics)['r2']
+        return r2
+
+    def _get_hyperparameter_search_space(self,
+                                         dataset_properties: Dict[str, Any],
+                                         include: Optional[Dict[str, Any]] = None,
+                                         exclude: Optional[Dict[str, Any]] = None,
+                                         ) -> ConfigurationSpace:
+        """Create the hyperparameter configuration space.
+
+        For the given steps, and the Choices within those steps,
+        this procedure returns a configuration space object to
+        explore. 
+ + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Characteristics + of the dataset to guide the pipeline choices of components + + Returns: + cs (Configuration): The configuration space describing the TimeSeriesRegressionPipeline. + """ + cs = ConfigurationSpace() + + if dataset_properties is None or not isinstance(dataset_properties, dict): + if not isinstance(dataset_properties, dict): + warnings.warn('The given dataset_properties argument contains an illegal value.' + 'Proceeding with the default value') + dataset_properties = dict() + + if 'target_type' not in dataset_properties: + dataset_properties['target_type'] = 'time_series_regression' + if dataset_properties['target_type'] != 'time_series_regression': + warnings.warn('Time series regression is being used, however the target_type' + 'is not given as "time_series_regression". Overriding it.') + dataset_properties['target_type'] = 'time_series_regression' + # get the base search space given this + # dataset properties. Then overwrite with custom + # regression requirements + cs = self._get_base_search_space( + cs=cs, dataset_properties=dataset_properties, + exclude=exclude, include=include, pipeline=self.steps) + + # Here we add custom code, like this with this + # is not a valid configuration + + self.configuration_space = cs + self.dataset_properties = dataset_properties + return cs + + def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> List[Tuple[str, autoPyTorchChoice]]: + """ + Defines what steps a pipeline should follow. + The step itself has choices given via autoPyTorchChoice. + + Returns: + List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised + by the pipeline. + """ + steps = [] # type: List[Tuple[str, autoPyTorchChoice]] + + default_dataset_properties = {'target_type': 'time_series_prediction'} + if dataset_properties is not None: + default_dataset_properties.update(dataset_properties) + + steps.extend([ + ("scaler", ScalerChoice(default_dataset_properties)), + ("preprocessing", EarlyPreprocessing()), + ("time_series_transformer", TimeSeriesTransformer()), + ("network_backbone", NetworkBackboneChoice(default_dataset_properties)), + ("network_head", NetworkHeadChoice(default_dataset_properties)), + ("network", NetworkComponent()), + ("network_init", NetworkInitializerChoice(default_dataset_properties)), + ("optimizer", OptimizerChoice(default_dataset_properties)), + ("lr_scheduler", SchedulerChoice(default_dataset_properties)), + ("data_loader", TimeSeriesForecastingDataLoader(upper_sequence_length=self.upper_sequence_length, + n_prediction_steps=self.n_prediction_steps, + )), + ("trainer", TrainerChoice(default_dataset_properties)), + ]) + return steps + + def get_pipeline_representation(self) -> Dict[str, str]: + """ + Returns a representation of the pipeline, so that it can be + consumed and formatted by the API. 
+ + It should be a representation that follows: + [{'PreProcessing': <>, 'Estimator': <>}] + + Returns: + Dict: contains the pipeline representation in a short format + """ + preprocessing = [] + estimator = [] + skip_steps = ['data_loader', 'trainer', 'lr_scheduler', 'optimizer', 'network_init', + 'preprocessing', 'time_series_transformer'] + for step_name, step_component in self.steps: + if step_name in skip_steps: + continue + properties = {} + if isinstance(step_component, autoPyTorchChoice) and step_component.choice is not None: + properties = step_component.choice.get_properties() + elif isinstance(step_component, autoPyTorchComponent): + properties = step_component.get_properties() + if 'shortname' in properties: + if 'network' in step_name: + estimator.append(properties['shortname']) + else: + preprocessing.append(properties['shortname']) + return { + 'Preprocessing': ','.join(preprocessing), + 'Estimator': ','.join(estimator), + } + + def _get_estimator_hyperparameter_name(self) -> str: + """ + Returns the name of the current estimator. + + Returns: + str: name of the pipeline type + """ + return "time_series_predictor" diff --git a/test.py b/test.py new file mode 100644 index 000000000..e69de29bb From 91843047dff47cc88bc23508fd3168f07137421e Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 12 Mar 2021 21:03:22 +0100 Subject: [PATCH 025/347] move n_prediction_steps from pipeline to dataset --- autoPyTorch/constants.py | 14 +- ...ime_series_forecasting_target_validator.py | 131 ++++++++++++++ autoPyTorch/datasets/time_series_dataset.py | 171 +++++++++++------- .../time_series_forecasting_data_loader.py | 7 +- .../pipeline/time_series_forecasting.py | 3 - 5 files changed, 250 insertions(+), 76 deletions(-) diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py index 6680423a3..1c4ee7d1f 100644 --- a/autoPyTorch/constants.py +++ b/autoPyTorch/constants.py @@ -4,15 +4,17 @@ IMAGE_REGRESSION = 4 TIMESERIES_CLASSIFICATION = 5 TIMESERIES_REGRESSION = 6 +TIMESERIES_FORECASTING = 7 -REGRESSION_TASKS = [TABULAR_REGRESSION, IMAGE_REGRESSION, TIMESERIES_REGRESSION] +REGRESSION_TASKS = [TABULAR_REGRESSION, IMAGE_REGRESSION, TIMESERIES_REGRESSION, TIMESERIES_FORECASTING] CLASSIFICATION_TASKS = [TABULAR_CLASSIFICATION, IMAGE_CLASSIFICATION, TIMESERIES_CLASSIFICATION] +FORECASTING_TASKS = [TIMESERIES_FORECASTING] TABULAR_TASKS = [TABULAR_CLASSIFICATION, TABULAR_REGRESSION] IMAGE_TASKS = [IMAGE_CLASSIFICATION, IMAGE_REGRESSION] -TIMESERIES_TASKS = [TIMESERIES_CLASSIFICATION, TIMESERIES_REGRESSION] +TIMESERIES_TASKS = [TIMESERIES_CLASSIFICATION, TIMESERIES_REGRESSION, TIMESERIES_FORECASTING] -TASK_TYPES = REGRESSION_TASKS + CLASSIFICATION_TASKS +TASK_TYPES = REGRESSION_TASKS + CLASSIFICATION_TASKS + TIMESERIES_TASKS TASK_TYPES_TO_STRING = \ {TABULAR_CLASSIFICATION: 'tabular_classification', @@ -20,7 +22,8 @@ TABULAR_REGRESSION: 'tabular_regression', IMAGE_REGRESSION: 'image_regression', TIMESERIES_CLASSIFICATION: 'time_series_classification', - TIMESERIES_REGRESSION: 'time_series_regression'} + TIMESERIES_REGRESSION: 'time_series_regression', + TIMESERIES_FORECASTING: 'time_series_forecasting'} STRING_TO_TASK_TYPES = \ {'tabular_classification': TABULAR_CLASSIFICATION, @@ -28,7 +31,8 @@ 'tabular_regression': TABULAR_REGRESSION, 'image_regression': IMAGE_REGRESSION, 'time_series_classification': TIMESERIES_CLASSIFICATION, - 'time_series_regression': TIMESERIES_REGRESSION} + 'time_series_regression': TIMESERIES_REGRESSION, + 'time_series_forecasting': TIMESERIES_FORECASTING} # Output 
types have been defined as in scikit-learn type_of_target # (https://scikit-learn.org/stable/modules/generated/sklearn.utils.multiclass.type_of_target.html) diff --git a/autoPyTorch/data/time_series_forecasting_target_validator.py b/autoPyTorch/data/time_series_forecasting_target_validator.py index e69de29bb..ca6f84fa6 100644 --- a/autoPyTorch/data/time_series_forecasting_target_validator.py +++ b/autoPyTorch/data/time_series_forecasting_target_validator.py @@ -0,0 +1,131 @@ +from autoPyTorch.data.tabular_target_validator import TabularTargetValidator + +import typing + +import numpy as np + +import pandas as pd +from pandas.api.types import is_numeric_dtype + +import scipy.sparse + +import sklearn.utils +from sklearn import preprocessing +from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError +from sklearn.utils.multiclass import type_of_target + +from autoPyTorch.data.base_target_validator import BaseTargetValidator, SUPPORTED_TARGET_TYPES + + +class TimeSeriesForecastingTargetValidator(TabularTargetValidator): + def transform( + self, + y: typing.Union[SUPPORTED_TARGET_TYPES], + ) -> np.ndarray: + if not self._is_fitted: + raise NotFittedError("Cannot call transform on a validator that is not fitted") + + # Check the data here so we catch problems on new test data + self._check_data(y) + + # sklearn check array will make sure we have the + # correct numerical features for the array + # Also, a numpy array will be created + y = sklearn.utils.check_array( + y, + force_all_finite=True, + ensure_2d=False, + allow_nd=True, + accept_sparse=False, + accept_large_sparse=False + ) + return y + + """ + Validator for time series forecasting, currently only consider regression tasks + TODO: Considering Classification Validator + """ + def _check_data( + self, + y: SUPPORTED_TARGET_TYPES, + ) -> None: + """ + Perform dimensionality and data type checks on the targets + + Arguments: + y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]): + A set of features whose dimensionality and data type is going to be checked + """ + + if not isinstance( + y, (np.ndarray, pd.DataFrame, list, pd.Series)) and not scipy.sparse.issparse(y): + raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames," + " pd.Series, sparse data and Python Lists as targets, yet, " + "the provided input is of type {}".format( + type(y) + )) + + # Sparse data muss be numerical + # Type ignore on attribute because sparse targets have a dtype + if scipy.sparse.issparse(y) and not np.issubdtype(y.dtype.type, # type: ignore[union-attr] + np.number): + raise ValueError("When providing a sparse matrix as targets, the only supported " + "values are numerical. Please consider using a dense" + " instead." + ) + + if self.data_type is None: + self.data_type = type(y) + if self.data_type != type(y): + self.logger.warning("AutoPyTorch previously received targets of type %s " + "yet the current features have type %s. 
Changing the dtype " + "of inputs to an estimator might cause problems" % ( + str(self.data_type), + str(type(y)), + ), + ) + + # No Nan is supported + has_nan_values = False + if hasattr(y, 'iloc'): + has_nan_values = typing.cast(pd.DataFrame, y).isnull().values.any() + if scipy.sparse.issparse(y): + y = typing.cast(scipy.sparse.spmatrix, y) + has_nan_values = not np.array_equal(y.data, y.data) + else: + # List and array like values are considered here + # np.isnan cannot work on strings, so we have to check for every element + # but NaN, are not equal to themselves: + has_nan_values = not np.array_equal(y, y) + if has_nan_values: + raise ValueError("Target values cannot contain missing/NaN values. " + "This is not supported by scikit-learn. " + ) + + # Pandas Series is not supported for multi-label indicator + # This format checks are done by type of target + try: + self.type_of_target = type_of_target(y[0]) + except Exception as e: + raise ValueError("The provided data could not be interpreted by AutoPyTorch. " + "While determining the type of the targets via type_of_target " + "run into exception: {}.".format(e)) + + supported_output_types = ('binary', + 'continuous', + 'continuous-multioutput', + 'multiclass', + 'multilabel-indicator', + # Notice unknown/multiclass-multioutput are not supported + # This can only happen during testing only as estimators + # should filter out unsupported types. + ) + if self.type_of_target not in supported_output_types: + raise ValueError("Provided targets are not supported by AutoPyTorch. " + "Provided type is {} whereas supported types are {}.".format( + self.type_of_target, + supported_output_types + )) + + diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 2f9c29a4f..d50d1f2f8 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -1,40 +1,55 @@ from typing import Any, Dict, List, Optional, Tuple, Union +import warnings import numpy as np import pandas as pd +import sklearn import torchvision.transforms -from autoPyTorch.constants import CLASSIFICATION_OUTPUTS, CLASSIFICATION_TASKS, REGRESSION_OUTPUTS, \ - STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, TASK_TYPES_TO_STRING, TIMESERIES_CLASSIFICATION, TIMESERIES_REGRESSION +from autoPyTorch.constants import ( + CLASSIFICATION_OUTPUTS, + CLASSIFICATION_TASKS, + REGRESSION_OUTPUTS, + STRING_TO_OUTPUT_TYPES, + STRING_TO_TASK_TYPES, + TASK_TYPES_TO_STRING, + TIMESERIES_CLASSIFICATION, + TIMESERIES_REGRESSION, + TIMESERIES_FORECASTING, +) from autoPyTorch.data.base_validator import BaseInputValidator from autoPyTorch.datasets.base_dataset import BaseDataset from autoPyTorch.datasets.resampling_strategy import ( + DEFAULT_RESAMPLING_PARAMETERS, CrossValTypes, HoldoutValTypes, get_cross_validators, - get_holdout_validators, is_stratified + get_holdout_validators, + is_stratified, ) -TIME_SERIES_FORECASTING_INPUT = Tuple[np.ndarray, np.ndarray] # currently only numpy arrays are supported -TIME_SERIES_REGRESSION_INPUT = Tuple[np.ndarray, np.ndarray] -TIME_SERIES_CLASSIFICATION_INPUT = Tuple[np.ndarray, np.ndarray] +from autoPyTorch.utils.common import FitRequirement +#TIME_SERIES_FORECASTING_INPUT = Tuple[np.ndarray, np.ndarray] # currently only numpy arrays are supported +#TIME_SERIES_REGRESSION_INPUT = Tuple[np.ndarray, np.ndarray] +#TIME_SERIES_CLASSIFICATION_INPUT = Tuple[np.ndarray, np.ndarray] class TimeSeriesForecastingDataset(BaseDataset): def __init__(self, - target_variables: Tuple[int], - 
sequence_length: int, - n_steps: int, - train: TIME_SERIES_FORECASTING_INPUT, - val: Optional[TIME_SERIES_FORECASTING_INPUT] = None, + X: np.ndarray, + Y: Union[np.ndarray, pd.Series], + X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, + Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, shuffle: Optional[bool] = False, seed: Optional[int] = 42, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, + validator: Optional[BaseInputValidator] = None, + n_prediction_steps: int = 1, ): """ @@ -44,74 +59,102 @@ def __init__(self, :param train: Tuple with one tensor holding the training data :param val: Tuple with one tensor holding the validation data """ - _check_time_series_forecasting_inputs( - target_variables=target_variables, - sequence_length=sequence_length, - n_steps=n_steps, - train=train, - val=val) - train = _prepare_time_series_forecasting_tensor(tensor=train, - target_variables=target_variables, - sequence_length=sequence_length, - n_steps=n_steps) - if val is not None: - val = _prepare_time_series_forecasting_tensor(tensor=val, - target_variables=target_variables, - sequence_length=sequence_length, - n_steps=n_steps) - super().__init__(train_tensors=train, val_tensors=val, shuffle=shuffle, + + self.validator = validator + if self.validator is not None: + X, Y = self.validator.transform(X, Y) + if X_test is not None: + X_test, Y_test = self.validator.transform(X_test, Y_test) + + _check_time_series_forecasting_inputs(train=X, val=X_test) + # swap the axis of population_size and sequence_length hence the splitter will split the dataset w.r.t. 
sequence + X = np.swapaxes(X, 0, 1) + Y = np.swapaxes(Y, 0, 1) + train_tensors = (X.astype(np.float32), Y.astype(np.float32)[0]) + if X_test is not None and Y_test is not None: + X_test = np.swapaxes(X_test, 0, 1) + Y_test = np.swapaxes(Y_test, 0, 1) + test_tensors = (X_test.astype(np.float32)[0], Y_test.astype(np.float32)) + else: + test_tensors = None + if shuffle: + warnings.WarningMessage("Time Series Forecasting will not shuffle the data") + super().__init__(train_tensors=train_tensors, test_tensors=test_tensors, shuffle=False, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, seed=seed, train_transforms=train_transforms, val_transforms=val_transforms, ) + self.is_small_preprocess = False + + self.task_type = TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING] + + self.train_tensors = (X.astype(np.float32), Y.astype(np.float32)) + self.num_features = self.train_tensors[0].shape[2] + self.numerical_features: List[int] = list(range(self.num_features)) + self.categorical_features: List[int] = [] + self.n_prediction_steps = n_prediction_steps + self.cross_validators = get_cross_validators(CrossValTypes.time_series_cross_validation) self.holdout_validators = get_holdout_validators(HoldoutValTypes.holdout_validation) + self.splits = self.get_splits_from_resampling_strategy() + + # We also need to be able to transform the data, be it for pre-processing + # or for augmentation + self.train_transform = train_transforms + self.val_transform = val_transforms + + time_series_length = self.train_tensors[0].shape[0] + if isinstance(self.resampling_strategy, HoldoutValTypes): + if self.val_tensors is not None: + max_sequence_length = time_series_length - self.n_prediction_steps + else: + val_share = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( + 'val_share', None) + if self.resampling_strategy_args is not None: + val_share = self.resampling_strategy_args.get('val_share', val_share) + upper_sequence_length = int(time_series_length * val_share) - self.n_prediction_steps + + elif isinstance(self.resampling_strategy, CrossValTypes): + num_splits = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( + 'num_splits', None) + if self.resampling_strategy_args is not None: + num_splits = self.resampling_strategy_args.get('num_splits', num_splits) + upper_sequence_length = (time_series_length // num_splits) - self.n_prediction_steps + else: + raise ValueError() + self.upper_sequence_length = upper_sequence_length + + def get_required_dataset_info(self) -> Dict[str, Any]: + """ + Returns a dictionary containing required dataset properties to instantiate a pipeline, + """ + info = super().get_required_dataset_info() + info.update({ + 'task_type': self.task_type, + 'numerical_features': self.numerical_features, + 'categorical_features': self.categorical_features, + 'upper_sequence_length': self.upper_sequence_length, + }) + return info + + def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> Dict[str, Any]: + dataset_properties = super().get_dataset_properties(dataset_requirements=dataset_requirements) + dataset_properties.update({'upper_sequence_length': self.upper_sequence_length}) + return dataset_properties + -def _check_time_series_forecasting_inputs(target_variables: Tuple[int], - sequence_length: int, - n_steps: int, - train: TIME_SERIES_FORECASTING_INPUT, - val: Optional[TIME_SERIES_FORECASTING_INPUT] = None) -> None: - if train[0].ndim != 3: +def _check_time_series_forecasting_inputs(train: np.ndarray, + val: 
Optional[np.ndarray] = None) -> None: + if train.ndim != 3: raise ValueError( "The training data for time series forecasting has to be a three-dimensional tensor of shape PxLxM.") if val is not None: - if val[0].ndim != 3: + if val.ndim != 3: raise ValueError( "The validation data for time series forecasting " "has to be a three-dimensional tensor of shape PxLxM.") - _, time_series_length, num_features = train[0].shape - if sequence_length + n_steps > time_series_length: - raise ValueError(f"Invalid sequence length: Cannot create dataset " - f"using sequence_length={sequence_length} and n_steps={n_steps} " - f"when the time series are of length {time_series_length}") - for t in target_variables: - if t < 0 or t >= num_features: - raise ValueError(f"Target variable {t} is out of bounds. Number of features is {num_features}, " - f"so each target variable has to be between 0 and {num_features - 1}.") - - -def _prepare_time_series_forecasting_tensor(tensor: TIME_SERIES_FORECASTING_INPUT, - target_variables: Tuple[int], - sequence_length: int, - n_steps: int) -> Tuple[np.ndarray, np.ndarray]: - population_size, time_series_length, num_features = tensor[0].shape - num_targets = len(target_variables) - num_datapoints = time_series_length - sequence_length - n_steps + 1 - x_tensor = np.zeros((num_datapoints, population_size, sequence_length, num_features), dtype=np.float32) - y_tensor = np.zeros((num_datapoints, population_size, num_targets), dtype=np.float32) - - for p in range(population_size): - for i in range(num_datapoints): - x_tensor[i, p, :, :] = tensor[0][p, i:i + sequence_length, :] - y_tensor[i, p, :] = tensor[0][p, i + sequence_length + n_steps - 1, target_variables] - - # get rid of population dimension by reshaping - x_tensor = x_tensor.reshape((-1, sequence_length, num_features)) - y_tensor = y_tensor.reshape((-1, num_targets)) - return x_tensor, y_tensor class TimeSeriesDataset(BaseDataset): diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index d27979212..5801e313b 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -41,7 +41,6 @@ def __init__(self, super().__init__(batch_size=batch_size) self.sequence_length: int = sequence_length self.upper_seuqnce_length = upper_sequence_length - self.n_prediction_steps = n_prediction_steps def transform(self, X: np.ndarray) -> np.ndarray: """The transform function calls the transform function of the @@ -86,8 +85,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.val_transform, train=False, ) - import pdb - pdb.set_trace() + if X['dataset_properties']["is_small_preprocess"]: # This parameter indicates that the data has been pre-processed for speed # Overwrite the datamanager with the pre-processes data @@ -123,9 +121,10 @@ def _update_dataset(self, datamanager: TimeSeriesForecastingDataset): X_train, y_train = datamanager.train_tensors val_tensors = datamanager.val_tensors test_tensors = datamanager.test_tensors + n_prediction_steps = datamanager.n_prediction_steps time_series_length, population_size, num_features = X_train.shape - num_datapoints = time_series_length - self.sequence_length - self.n_prediction_steps + 1 + num_datapoints = time_series_length - self.sequence_length - n_prediction_steps + 1 
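+        # Editorial note (comment added for clarity, not part of the original patch): every sliding
+        # window consumes `sequence_length` past steps and reserves `n_prediction_steps` future targets,
+        # so with illustrative values T=100, sequence_length=10, n_prediction_steps=1 each series
+        # yields 100 - 10 - 1 + 1 = 90 windows.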
num_targets = y_train.shape[-1] y_train = y_train[-num_datapoints:, :] diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 71bd031c6..57691e1e0 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -70,10 +70,8 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None, init_params: Optional[Dict[str, Any]] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - n_prediction_steps: int = 1, ): # TODO consider multi steps prediction - self.n_prediction_steps = n_prediction_steps if 'upper_sequence_length' not in dataset_properties: warnings.warn('max_sequence_length is not given in dataset property , might exists the risk of selecting ' 'length that is greater than the maximal allowed length of the dataset') @@ -179,7 +177,6 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L ("optimizer", OptimizerChoice(default_dataset_properties)), ("lr_scheduler", SchedulerChoice(default_dataset_properties)), ("data_loader", TimeSeriesForecastingDataLoader(upper_sequence_length=self.upper_sequence_length, - n_prediction_steps=self.n_prediction_steps, )), ("trainer", TrainerChoice(default_dataset_properties)), ]) From feaa2d1e2804f2b43e3d9e64ab7a9f9486edb655 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 15 Mar 2021 12:58:11 +0100 Subject: [PATCH 026/347] maint --- .../time_series_forecasting_data_loader.py | 26 ++++++++++++++----- .../components/training/metrics/utils.py | 3 ++- autoPyTorch/utils/pipeline.py | 19 +++++++++++++- test.py | 0 4 files changed, 40 insertions(+), 8 deletions(-) delete mode 100644 test.py diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 5801e313b..79f452980 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -41,6 +41,7 @@ def __init__(self, super().__init__(batch_size=batch_size) self.sequence_length: int = sequence_length self.upper_seuqnce_length = upper_sequence_length + self.n_prediction_steps = n_prediction_steps def transform(self, X: np.ndarray) -> np.ndarray: """The transform function calls the transform function of the @@ -124,6 +125,8 @@ def _update_dataset(self, datamanager: TimeSeriesForecastingDataset): n_prediction_steps = datamanager.n_prediction_steps time_series_length, population_size, num_features = X_train.shape + self.population_size = population_size + self.num_features = num_features num_datapoints = time_series_length - self.sequence_length - n_prediction_steps + 1 num_targets = y_train.shape[-1] @@ -140,7 +143,8 @@ def _update_dataset(self, datamanager: TimeSeriesForecastingDataset): datamanager.val_tensors = val_tensors X_test = np.concatenate([X_train[-self.sequence_length + 1:], X_test]) - self.X_val_tail = X_test[-self.sequence_length + 1:] + self.X_val_tail = X_test[-self.sequence_length + 1:] if self.sequence_length > 1 \ + else np.zeros((0, population_size, num_features), dtype=X_test.dtype) test_tensors = self._ser2seq(X_test, y_test, num_datapoints, num_features, num_targets) datamanager.test_tensors = test_tensors @@ -150,11 +154,14 @@ def _update_dataset(self, datamanager: TimeSeriesForecastingDataset): X_val = 
np.concatenate([X_train[-self.sequence_length+1:], X_val]) # used for prediction - self.X_val_tail = X_val[-self.sequence_length+1:] + self.X_val_tail = X_val[-self.sequence_length+1:] if self.sequence_length > 1 \ + else np.zeros((0, population_size, num_features), dtype=X_val.dtype) val_tensors = self._ser2seq(X_val, y_val, num_datapoints, num_features, num_targets) datamanager.val_tensors = val_tensors else: - self.X_val_tail = X_train[-self.sequence_length+1:] + self.X_val_tail = X_train[-self.sequence_length+1:] if self.sequence_length > 1 \ + else np.zeros((0, population_size, num_features), dtype=X_train.dtype) + train_tensors = self._ser2seq(X_train, y_train, num_datapoints, num_features, num_targets) datamanager.train_tensors = train_tensors datamanager.splits = datamanager.get_splits_from_resampling_strategy() @@ -190,10 +197,17 @@ def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: applying the transformations meant to validation objects """ if X.ndim == 3: - X = np.swapaxes(X, 0, 1) + X_shape = X.shape + if X_shape[1] == self.population_size and X_shape[0] == self.num_features: + pass + elif X_shape[1] == self.num_features and X_shape[0] == self.population_size: + X = np.swapaxes(X, 0, 1) + else: + raise ValueError("the shape of test data is incompatible with the training data") X = np.concatenate([self.X_val_tail, X]) - time_series_length, population_size, num_features = X.shape - X = X.reshape((-1, self.sequence_length, num_features)) + X = X.reshape((-1, self.sequence_length, self.num_features)) + else: + raise ValueError("The test data for time series forecasting has to be a three-dimensional tensor of shape PxLxM.") dataset = BaseDataset( train_tensors=(X, y), diff --git a/autoPyTorch/pipeline/components/training/metrics/utils.py b/autoPyTorch/pipeline/components/training/metrics/utils.py index d386ce47e..5729ebbae 100644 --- a/autoPyTorch/pipeline/components/training/metrics/utils.py +++ b/autoPyTorch/pipeline/components/training/metrics/utils.py @@ -6,6 +6,7 @@ from autoPyTorch.constants import ( CLASSIFICATION_TASKS, REGRESSION_TASKS, + FORECASTING_TASKS, STRING_TO_TASK_TYPES, TASK_TYPES, ) @@ -16,7 +17,7 @@ def sanitize_array(array: np.ndarray) -> np.ndarray: """ Replace NaN and Inf (there should not be any!) 
- :param array: + :param array:z :return: """ a = np.ravel(array) diff --git a/autoPyTorch/utils/pipeline.py b/autoPyTorch/utils/pipeline.py index 10a01531c..afb8ed383 100644 --- a/autoPyTorch/utils/pipeline.py +++ b/autoPyTorch/utils/pipeline.py @@ -16,6 +16,7 @@ from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline from autoPyTorch.pipeline.time_series_classification import TimeSeriesClassificationPipeline from autoPyTorch.pipeline.time_series_regression import TimeSeriesRegressionPipeline +from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline from autoPyTorch.utils.common import FitRequirement from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -132,12 +133,18 @@ def get_configuration_space(info: Dict[str, Any], exclude if exclude is not None else {}, search_space_updates=search_space_updates ) - else: + elif task_type in CLASSIFICATION_TASKS: return _get_classification_configuration_space(info, include if include is not None else {}, exclude if exclude is not None else {}, search_space_updates=search_space_updates ) + else: + return _get_forecasting_configuration_space(info, + include if include is not None else {}, + exclude if exclude is not None else {}, + search_space_updates=search_space_updates + ) def _get_regression_configuration_space(info: Dict[str, Any], include: Dict[str, List[str]], @@ -185,3 +192,13 @@ def _get_classification_configuration_space(info: Dict[str, Any], include: Dict[ get_hyperparameter_search_space() else: raise ValueError("Task_type not supported") + + +def _get_forecasting_configuration_space(info: Dict[str, Any], include: Dict[str, List[str]], + exclude: Dict[str, List[str]], + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> ConfigurationSpace: + pipeline = TimeSeriesForecastingPipeline(dataset_properties=info, + include=include, exclude=exclude, + search_space_updates=search_space_updates) + return pipeline.get_hyperparameter_search_space() diff --git a/test.py b/test.py deleted file mode 100644 index e69de29bb..000000000 From a62123a3f996020c1cb5cae505ebc0bff47455d7 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 15 Mar 2021 16:55:47 +0100 Subject: [PATCH 027/347] maint --- ...me_series_forecasting_feature_validator.py | 0 ...ime_series_forecasting_target_validator.py | 60 +++++++++++++++++++ .../data/time_series_forecasting_validator.py | 5 +- autoPyTorch/datasets/time_series_dataset.py | 1 + .../time_series_forecasting_data_loader.py | 18 +++++- 5 files changed, 78 insertions(+), 6 deletions(-) create mode 100644 autoPyTorch/data/time_series_forecasting_feature_validator.py diff --git a/autoPyTorch/data/time_series_forecasting_feature_validator.py b/autoPyTorch/data/time_series_forecasting_feature_validator.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/data/time_series_forecasting_target_validator.py b/autoPyTorch/data/time_series_forecasting_target_validator.py index ca6f84fa6..7010731bf 100644 --- a/autoPyTorch/data/time_series_forecasting_target_validator.py +++ b/autoPyTorch/data/time_series_forecasting_target_validator.py @@ -19,6 +19,66 @@ class TimeSeriesForecastingTargetValidator(TabularTargetValidator): + def fit( + self, + y_train: SUPPORTED_TARGET_TYPES, + y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + ) -> BaseEstimator: + """ + Validates and fit a categorical encoder (if needed) to the targets + The supported data types are List, numpy arrays and pandas 
DataFrames. + + Arguments: + y_train (SUPPORTED_TARGET_TYPES) + A set of targets set aside for training + y_test (typing.Union[SUPPORTED_TARGET_TYPES]) + A hold out set of data used of the targets. It is also used to fit the + categories of the encoder. + """ + # Check that the data is valid + self._check_data(y_train) + + shape = np.shape(y_train) + if y_test is not None: + self._check_data(y_test) + + if len(shape) != len(np.shape(y_test)) or ( + len(shape) > 1 and (shape[0] != np.shape(y_test)[0] or shape[-1] != np.shape(y_test)[-1])): + raise ValueError("The dimensionality of the train and test targets " + "does not match train({}) != test({})".format( + np.shape(y_train), + np.shape(y_test) + )) + if isinstance(y_train, pd.DataFrame): + y_train = typing.cast(pd.DataFrame, y_train) + y_test = typing.cast(pd.DataFrame, y_test) + if y_train.columns.tolist() != y_test.columns.tolist(): + raise ValueError( + "Train and test targets must both have the same columns, yet " + "y={} and y_test={} ".format( + y_train.columns, + y_test.columns + ) + ) + + if list(y_train.dtypes) != list(y_test.dtypes): + raise ValueError("Train and test targets must both have the same dtypes") + + if self.out_dimensionality is None: + self.out_dimensionality = 1 if len(shape) == 1 else shape[1] + else: + _n_outputs = 1 if len(shape) == 1 else shape[1] + if self.out_dimensionality != _n_outputs: + raise ValueError('Number of outputs changed from %d to %d!' % + (self.out_dimensionality, _n_outputs)) + + # Fit on the training data + self._fit(y_train, y_test) + + self._is_fitted = True + + return self + def transform( self, y: typing.Union[SUPPORTED_TARGET_TYPES], diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 1f1330a92..9739b0868 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -4,8 +4,7 @@ import logging import typing -from autoPyTorch.data.base_validator import BaseInputValidator -from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator +from autoPyTorch.data.time_series_forecasting_feature_validator import TimeSeriesForecastingFeatureValidator from autoPyTorch.data.time_series_forecasting_target_validator import TimeSeriesForecastingTargetValidator from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger @@ -45,7 +44,7 @@ def __init__( else: self.logger = logging.getLogger('Validation') - self.feature_validator = TimeSeriesFeatureValidator(logger=self.logger) + self.feature_validator = TimeSeriesForecastingFeatureValidator(logger=self.logger) self.target_validator = TimeSeriesForecastingTargetValidator( is_classification=self.is_classification, logger=self.logger diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index d50d1f2f8..75d114d8b 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -90,6 +90,7 @@ def __init__(self, self.task_type = TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING] self.train_tensors = (X.astype(np.float32), Y.astype(np.float32)) + self.test_tensors = (X_test.astype(np.float32), Y.astype(np.float32)) self.num_features = self.train_tensors[0].shape[2] self.numerical_features: List[int] = list(range(self.num_features)) self.categorical_features: List[int] = [] diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py 
b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 79f452980..f4f7709c8 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -54,7 +54,16 @@ def transform(self, X: np.ndarray) -> np.ndarray: np.ndarray: Transformed features """ X.update({'train_data_loader': self.train_data_loader, - 'val_data_loader': self.val_data_loader}) + 'val_data_loader': self.val_data_loader, + 'X_train': self.datamanager.train_tensors[0], + 'y_train': self.datamanager.train_tensors[1]}) + if self.datamanager.val_tensors is not None and 'X_val' in X: + X.update({'X_val': self.datamanager.val_tensors[0], + 'y_val': self.datamanager.val_tensors[1]}) + if self.datamanager.test_tensors is not None and 'X_test' in X: + X.update({'X_test': self.datamanager.test_tensors[0], + 'y_test': self.datamanager.test_tensors[1]}) + return X def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: @@ -87,12 +96,15 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: train=False, ) + if X['dataset_properties']["is_small_preprocess"]: # This parameter indicates that the data has been pre-processed for speed # Overwrite the datamanager with the pre-processes data datamanager.replace_data(X['X_train'], X['X_test'] if 'X_test' in X else None) train_dataset, val_dataset = datamanager.get_dataset_for_training(split_id=X['split_id']) + self.datamanager = datamanager + self.train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=min(self.batch_size, len(train_dataset)), @@ -198,9 +210,9 @@ def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: """ if X.ndim == 3: X_shape = X.shape - if X_shape[1] == self.population_size and X_shape[0] == self.num_features: + if X_shape[1] == self.population_size and X_shape[-1] == self.num_features: pass - elif X_shape[1] == self.num_features and X_shape[0] == self.population_size: + elif X_shape[-1] == self.num_features and X_shape[0] == self.population_size: X = np.swapaxes(X, 0, 1) else: raise ValueError("the shape of test data is incompatible with the training data") From 8f02b31d6866d6146f1104fa0d6fa8bfebb3b30e Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 19 Mar 2021 22:51:22 +0100 Subject: [PATCH 028/347] update forecasting task api --- autoPyTorch/api/time_series_forecasting.py | 239 ++++++++++++++++++ ...me_series_forecasting_feature_validator.py | 100 ++++++++ autoPyTorch/datasets/time_series_dataset.py | 151 +++++++++-- autoPyTorch/evaluation/abstract_evaluator.py | 7 +- .../time_series_forecasting_data_loader.py | 55 +++- 5 files changed, 517 insertions(+), 35 deletions(-) create mode 100644 autoPyTorch/api/time_series_forecasting.py diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py new file mode 100644 index 000000000..ad7f5bc7c --- /dev/null +++ b/autoPyTorch/api/time_series_forecasting.py @@ -0,0 +1,239 @@ +import os +import uuid +from typing import Any, Callable, Dict, List, Optional, Union, Tuple + +import numpy as np + +import pandas as pd + +from autoPyTorch.api.base_task import BaseTask +from autoPyTorch.constants import ( + TASK_TYPES_TO_STRING, + TIMESERIES_FORECASTING, +) +from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator +from autoPyTorch.datasets.base_dataset import BaseDataset +from 
autoPyTorch.datasets.resampling_strategy import ( + CrossValTypes, + HoldoutValTypes, +) +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset +from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline +from autoPyTorch.utils.backend import Backend +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + + +class TimeSeriesForecastingTask(BaseTask): + """ + Time Series Forcasting API to the pipelines. + Args: + seed (int): seed to be used for reproducibility. + n_jobs (int), (default=1): number of consecutive processes to spawn. + logging_config (Optional[Dict]): specifies configuration + for logging, if None, it is loaded from the logging.yaml + ensemble_size (int), (default=50): Number of models added to the ensemble built by + Ensemble selection from libraries of models. + Models are drawn with replacement. + ensemble_nbest (int), (default=50): only consider the ensemble_nbest + models to build the ensemble + max_models_on_disc (int), (default=50): maximum number of models saved to disc. + Also, controls the size of the ensemble as any additional models will be deleted. + Must be greater than or equal to 1. + temporary_directory (str): folder to store configuration output and log file + output_directory (str): folder to store predictions for optional test set + delete_tmp_folder_after_terminate (bool): determines whether to delete the temporary directory, + when finished + include_components (Optional[Dict]): If None, all possible components are used. + Otherwise specifies set of components to use. + exclude_components (Optional[Dict]): If None, all possible components are used. + Otherwise specifies set of components not to use. Incompatible with include + components + """ + def __init__( + self, + seed: int = 1, + n_jobs: int = 1, + logging_config: Optional[Dict] = None, + ensemble_size: int = 50, + ensemble_nbest: int = 50, + max_models_on_disc: int = 50, + temporary_directory: Optional[str] = None, + output_directory: Optional[str] = None, + delete_tmp_folder_after_terminate: bool = True, + delete_output_folder_after_terminate: bool = True, + include_components: Optional[Dict] = None, + exclude_components: Optional[Dict] = None, + resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + backend: Optional[Backend] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + n_prediction_steps: int = 1, + ): + super().__init__( + seed=seed, + n_jobs=n_jobs, + logging_config=logging_config, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + max_models_on_disc=max_models_on_disc, + temporary_directory=temporary_directory, + output_directory=output_directory, + delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate, + delete_output_folder_after_terminate=delete_output_folder_after_terminate, + include_components=include_components, + exclude_components=exclude_components, + backend=backend, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + search_space_updates=search_space_updates, + task_type=TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING], + ) + self.n_prediction_steps = n_prediction_steps + + def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]: + if not isinstance(dataset, TimeSeriesForecastingDataset): + raise ValueError("Dataset is incompatible for the 
given task,: {}".format( + type(dataset) + )) + return dataset.get_required_dataset_info() + + def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TimeSeriesForecastingPipeline: + return TimeSeriesForecastingPipeline(dataset_properties=dataset_properties) + + def search( + self, + optimize_metric: str, + X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + #target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, + dataset_name: Optional[str] = None, + budget_type: Optional[str] = None, + budget: Optional[float] = None, + total_walltime_limit: int = 100, + func_eval_time_limit: int = 60, + traditional_per_total_budget: float = 0., + memory_limit: Optional[int] = 4096, + smac_scenario_args: Optional[Dict[str, Any]] = None, + get_smac_object_callback: Optional[Callable] = None, + all_supported_metrics: bool = True, + precision: int = 32, + disable_file_output: List = [], + load_models: bool = True, + ) -> 'BaseTask': + """ + Search for the best pipeline configuration for the given dataset. + + Fit both optimizes the machine learning models and builds an ensemble out of them. + To disable ensembling, set ensemble_size==0. + using the optimizer. + Args: + X_train, y_train, X_test, y_test: Union[np.ndarray, List, pd.DataFrame] + A pair of features (X_train) and targets (y_train) used to fit a + pipeline. Additionally, a holdout of this pairs (X_test, y_test) can + be provided to track the generalization performance of each stage. + target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] + indices indicating which variables need to be predicted, if X is given, either 'target_variables' + and 'y' needs to be given + optimize_metric (str): name of the metric that is used to + evaluate a pipeline. + budget_type (Optional[str]): + Type of budget to be used when fitting the pipeline. + Either 'epochs' or 'runtime'. If not provided, uses + the default in the pipeline config ('epochs') + budget (Optional[float]): + Budget to fit a single run of the pipeline. If not + provided, uses the default in the pipeline config + total_walltime_limit (int), (default=100): Time limit + in seconds for the search of appropriate models. + By increasing this value, autopytorch has a higher + chance of finding better models. + func_eval_time_limit (int), (default=60): Time limit + for a single call to the machine learning model. + Model fitting will be terminated if the machine + learning algorithm runs over the time limit. Set + this value high enough so that typical machine + learning algorithms can be fit on the training + data. + traditional_per_total_budget (float), (default=0.1): + Percent of total walltime to be allocated for + running traditional classifiers. + memory_limit (Optional[int]), (default=4096): Memory + limit in MB for the machine learning algorithm. autopytorch + will stop fitting the machine learning algorithm if it tries + to allocate more than memory_limit MB. If None is provided, + no memory limit is set. In case of multi-processing, memory_limit + will be per job. This memory limit also applies to the ensemble + creation process. + smac_scenario_args (Optional[Dict]): Additional arguments inserted + into the scenario of SMAC. 
See the + [SMAC documentation] (https://automl.github.io/SMAC3/master/options.html?highlight=scenario#scenario) + get_smac_object_callback (Optional[Callable]): Callback function + to create an object of class + [smac.optimizer.smbo.SMBO](https://automl.github.io/SMAC3/master/apidoc/smac.optimizer.smbo.html). + The function must accept the arguments scenario_dict, + instances, num_params, runhistory, seed and ta. This is + an advanced feature. Use only if you are familiar with + [SMAC](https://automl.github.io/SMAC3/master/index.html). + all_supported_metrics (bool), (default=True): if True, all + metrics supporting current task will be calculated + for each pipeline and results will be available via cv_results + precision (int), (default=32): Numeric precision used when loading + ensemble data. Can be either '16', '32' or '64'. + disable_file_output (Union[bool, List]): + load_models (bool), (default=True): Whether to load the + models after fitting AutoPyTorch. + + Returns: + self + + """ + if dataset_name is None: + dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) + + # we have to create a logger for at this point for the validator + self._logger = self._get_logger(dataset_name) + + # Create a validator object to make sure that the data provided by + # the user matches the autopytorch requirements + self.InputValidator = TimeSeriesForecastingInputValidator( + is_classification=False, + logger_port=self._logger_port, + ) + + # Fit a input validator to check the provided data + # Also, an encoder is fit to both train and test data, + # to prevent unseen categories during inference + self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + + self.dataset = TimeSeriesForecastingDataset( + X=X_train, Y=y_train, + X_test=X_test, Y_test=y_test, + validator=self.InputValidator, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + n_prediction_steps=self.n_prediction_steps, + ) + + if traditional_per_total_budget > 0.: + self._logger.warning("Time series Forecasting for now does not support traditional classifiers. " + "Setting traditional_per_total_budget to 0.") + traditional_per_total_budget = 0. 
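+
+        # Editorial note (comment added for clarity, not part of the original patch): the call below
+        # delegates to the inherited BaseTask._search routine, handing over the validated
+        # TimeSeriesForecastingDataset together with the SMAC-related settings documented above.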
+ + return self._search( + dataset=self.dataset, + optimize_metric=optimize_metric, + budget_type=budget_type, + budget=budget, + total_walltime_limit=total_walltime_limit, + func_eval_time_limit=func_eval_time_limit, + traditional_per_total_budget=traditional_per_total_budget, + memory_limit=memory_limit, + smac_scenario_args=smac_scenario_args, + get_smac_object_callback=get_smac_object_callback, + all_supported_metrics=all_supported_metrics, + precision=precision, + disable_file_output=disable_file_output, + load_models=load_models, + ) diff --git a/autoPyTorch/data/time_series_forecasting_feature_validator.py b/autoPyTorch/data/time_series_forecasting_feature_validator.py index e69de29bb..9f703254e 100644 --- a/autoPyTorch/data/time_series_forecasting_feature_validator.py +++ b/autoPyTorch/data/time_series_forecasting_feature_validator.py @@ -0,0 +1,100 @@ +from typing import Optional + +import numpy as np + +import sklearn.utils +from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError + +from autoPyTorch.data.tabular_validator import TabularFeatureValidator + + +class TimeSeriesForecastingFeatureValidator(TabularFeatureValidator): + def fit(self, + X_train: np.ndarray, + X_test: Optional[np.ndarray] = None) -> BaseEstimator: + """ + + Arguments: + X_train (np.ndarray): + A set of data that are going to be validated (type and dimensionality + checks) and used for fitting + + X_test (Optional[np.ndarray]): + An optional set of data that is going to be validated + + Returns: + self: + The fitted base estimator + """ + + if not isinstance(X_train, np.ndarray): + raise ValueError(f"Time series train data must be given as a numpy array, but got {type(X_train)}") + + if X_train.ndim != 3: + raise ValueError(f"Invalid number of dimensions for time series train data, " + f"expected 3 but got {X_train.ndim}. " + f"Time series data has to be of shape [B, T, F] where B is the " + f"batch dimension, T is the time dimension and F are the number of features.") + + _ = sklearn.utils.check_array( + X_train, + force_all_finite=True, + ensure_2d=False, + allow_nd=True, + accept_sparse=False, + accept_large_sparse=False + ) + + if X_test is not None: + if not isinstance(X_test, np.ndarray): + raise ValueError(f"Time series test data must be given as a numpy array, but got {type(X_test)}") + + if not X_test.ndim == 3: + raise ValueError(f"Invalid number of dimensions for time series test data, " + f"expected 3 but got {X_train.ndim}. 
" + f"Time series data has to be of shape [B, T, F] where B is the " + f"batch dimension, T is the time dimension and F are the number of features") + + if X_train.shape[0] != X_test.shape[0] or X_train.shape[-1] != X_test.shape[-1]: + raise ValueError(f"Time series train and test data are expected to have the same shape except for " + f"the sequence length, but got {X_train.shape} for train data and " + f"{X_test.shape} for test data") + + _ = sklearn.utils.check_array( + X_test, + force_all_finite=True, + ensure_2d=False, + allow_nd=True, + accept_sparse=False, + accept_large_sparse=False + ) + self._fit(X_train[0]) + + self._is_fitted = True + + return self + + def transform(self, X: np.ndarray) -> np.ndarray: + """ + + Arguments: + X (np.ndarray): + A set of data, that is going to be transformed + + Return: + np.ndarray: + The transformed array + """ + if not self._is_fitted: + raise NotFittedError("Cannot call transform on a validator that is not fitted") + + return sklearn.utils.check_array( + X, + force_all_finite=True, + ensure_2d=False, + allow_nd=True, + accept_sparse=False, + accept_large_sparse=False + ) + diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 75d114d8b..97be20d3a 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union, cast import warnings import numpy as np @@ -36,6 +36,7 @@ #TIME_SERIES_REGRESSION_INPUT = Tuple[np.ndarray, np.ndarray] #TIME_SERIES_CLASSIFICATION_INPUT = Tuple[np.ndarray, np.ndarray] + class TimeSeriesForecastingDataset(BaseDataset): def __init__(self, X: np.ndarray, @@ -59,26 +60,37 @@ def __init__(self, :param train: Tuple with one tensor holding the training data :param val: Tuple with one tensor holding the validation data """ - + self.n_prediction_steps = n_prediction_steps self.validator = validator if self.validator is not None: X, Y = self.validator.transform(X, Y) if X_test is not None: X_test, Y_test = self.validator.transform(X_test, Y_test) + population_size, time_series_length, num_features = X.shape + _, _, num_target = Y.shape + self.population_size = population_size + self.time_series_length = time_series_length + self.num_features = num_features + self.num_target = num_target + + self.categorical_columns = validator.feature_validator.categorical_columns + self.numerical_columns = validator.feature_validator.numerical_columns + + _check_time_series_forecasting_inputs(train=X, val=X_test) # swap the axis of population_size and sequence_length hence the splitter will split the dataset w.r.t. 
sequence - X = np.swapaxes(X, 0, 1) - Y = np.swapaxes(Y, 0, 1) - train_tensors = (X.astype(np.float32), Y.astype(np.float32)[0]) + X = np.swapaxes(X, 0, 1).reshape(-1, 1, num_features) + Y = np.swapaxes(Y, 0, 1).reshape(-1, num_target) if X_test is not None and Y_test is not None: - X_test = np.swapaxes(X_test, 0, 1) - Y_test = np.swapaxes(Y_test, 0, 1) - test_tensors = (X_test.astype(np.float32)[0], Y_test.astype(np.float32)) + X_test = np.swapaxes(X_test, 0, 1).reshape(-1, num_features) + Y_test = np.swapaxes(Y_test, 0, 1).reshape(-1, num_target) + test_tensors = (X_test, Y_test) else: test_tensors = None if shuffle: warnings.WarningMessage("Time Series Forecasting will not shuffle the data") + train_tensors = (X, Y) super().__init__(train_tensors=train_tensors, test_tensors=test_tensors, shuffle=False, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, seed=seed, @@ -89,12 +101,8 @@ def __init__(self, self.task_type = TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING] - self.train_tensors = (X.astype(np.float32), Y.astype(np.float32)) - self.test_tensors = (X_test.astype(np.float32), Y.astype(np.float32)) - self.num_features = self.train_tensors[0].shape[2] self.numerical_features: List[int] = list(range(self.num_features)) self.categorical_features: List[int] = [] - self.n_prediction_steps = n_prediction_steps self.cross_validators = get_cross_validators(CrossValTypes.time_series_cross_validation) self.holdout_validators = get_holdout_validators(HoldoutValTypes.holdout_validation) @@ -106,26 +114,50 @@ def __init__(self, self.train_transform = train_transforms self.val_transform = val_transforms - time_series_length = self.train_tensors[0].shape[0] + def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]]]: + """ + Creates a set of splits based on a resampling strategy provided, apart from the + 'get_splits_from_resampling_strategy' implemented in base_dataset, here we will get self.upper_sequence_length + with the given value + + Returns + (List[Tuple[List[int], List[int]]]): splits in the [train_indices, val_indices] format + """ + splits = [] if isinstance(self.resampling_strategy, HoldoutValTypes): + val_share = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( + 'val_share', None) + if self.resampling_strategy_args is not None: + val_share = self.resampling_strategy_args.get('val_share', val_share) + splits.append( + self.create_holdout_val_split( + holdout_val_type=self.resampling_strategy, + val_share=val_share, + ) + ) + if self.val_tensors is not None: - max_sequence_length = time_series_length - self.n_prediction_steps + upper_sequence_length = self.time_series_length - self.n_prediction_steps else: - val_share = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( - 'val_share', None) - if self.resampling_strategy_args is not None: - val_share = self.resampling_strategy_args.get('val_share', val_share) - upper_sequence_length = int(time_series_length * val_share) - self.n_prediction_steps + upper_sequence_length = int(self.time_series_length * val_share) - self.n_prediction_steps elif isinstance(self.resampling_strategy, CrossValTypes): num_splits = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( 'num_splits', None) if self.resampling_strategy_args is not None: num_splits = self.resampling_strategy_args.get('num_splits', num_splits) - upper_sequence_length = (time_series_length // num_splits) - self.n_prediction_steps + # Create the split if it was not created before + splits.extend( 
+ self.create_cross_val_splits( + cross_val_type=self.resampling_strategy, + num_splits=cast(int, num_splits), + ) + ) + upper_sequence_length = (self.time_series_length // num_splits) - self.n_prediction_steps else: - raise ValueError() + raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") self.upper_sequence_length = upper_sequence_length + return splits def get_required_dataset_info(self) -> Dict[str, Any]: """ @@ -136,6 +168,8 @@ def get_required_dataset_info(self) -> Dict[str, Any]: 'task_type': self.task_type, 'numerical_features': self.numerical_features, 'categorical_features': self.categorical_features, + 'numerical_columns': self.numerical_columns, + 'categorical_columns': self.categorical_columns, 'upper_sequence_length': self.upper_sequence_length, }) return info @@ -145,6 +179,81 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> dataset_properties.update({'upper_sequence_length': self.upper_sequence_length}) return dataset_properties + def create_cross_val_splits( + self, + cross_val_type: CrossValTypes, + num_splits: int + ) -> List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]: + """ + This function creates the cross validation split for the given task. + + It is done once per dataset to have comparable results among pipelines + Args: + cross_val_type (CrossValTypes): + num_splits (int): number of splits to be created + + Returns: + (List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]): + list containing 'num_splits' splits. + """ + # Create just the split once + # This is gonna be called multiple times, because the current dataset + # is being used for multiple pipelines. That is, to be efficient with memory + # we dump the dataset to memory and read it on a need basis. So this function + # should be robust against multiple calls, and it does so by remembering the splits + if not isinstance(cross_val_type, CrossValTypes): + raise NotImplementedError(f'The selected `cross_val_type` "{cross_val_type}" is not implemented.') + kwargs = {} + if is_stratified(cross_val_type): + # we need additional information about the data for stratification + kwargs["stratify"] = self.train_tensors[-1] + splits_raw = self.cross_validators[cross_val_type.name]( + num_splits, self._get_indices(), **kwargs) + splits = [() for i in range(len(splits_raw))] + for i, split in enumerate(splits_raw): + train = split[0] + val = split[1] + val = np.concatenate([train[-(len(train) % self.time_series_length)], val]) + train = train[:- (len(train) % self.time_series_length)] + splits[i] = (train, val) + return splits + + def create_holdout_val_split( + self, + holdout_val_type: HoldoutValTypes, + val_share: float, + ) -> Tuple[np.ndarray, np.ndarray]: + """ + This function creates the holdout split for the given task. + + It is done once per dataset to have comparable results among pipelines + Args: + holdout_val_type (HoldoutValTypes): + val_share (float): share of the validation data + + Returns: + (Tuple[np.ndarray, np.ndarray]): Tuple containing (train_indices, val_indices) + """ + if holdout_val_type is None: + raise ValueError( + '`val_share` specified, but `holdout_val_type` not specified.' 
+ ) + if self.val_tensors is not None: + raise ValueError( + '`val_share` specified, but the Dataset was a given a pre-defined split at initialization already.') + if val_share < 0 or val_share > 1: + raise ValueError(f"`val_share` must be between 0 and 1, got {val_share}.") + if not isinstance(holdout_val_type, HoldoutValTypes): + raise NotImplementedError(f'The specified `holdout_val_type` "{holdout_val_type}" is not supported.') + kwargs = {} + if is_stratified(holdout_val_type): + # we need additional information about the data for stratification + kwargs["stratify"] = self.train_tensors[-1] + # we want to ensure that both training and validation sets have the same + val_share = int(val_share * self.time_series_length) * self.population_size + train, val = self.holdout_validators[holdout_val_type.name](val_share, self._get_indices(), **kwargs) + return train, val + def _check_time_series_forecasting_inputs(train: np.ndarray, val: Optional[np.ndarray] = None) -> None: diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 9591d1ed1..b75426537 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -21,6 +21,7 @@ import autoPyTorch.pipeline.tabular_regression import autoPyTorch.pipeline.time_series_classification import autoPyTorch.pipeline.traditional_tabular_classification +import autoPyTorch.pipeline.time_series_forecasting from autoPyTorch.constants import ( CLASSIFICATION_TASKS, IMAGE_TASKS, @@ -29,6 +30,7 @@ STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, TABULAR_TASKS, TIMESERIES_TASKS, + FORECASTING_TASKS, ) from autoPyTorch.datasets.base_dataset import BaseDataset from autoPyTorch.evaluation.utils import ( @@ -266,7 +268,10 @@ def __init__(self, backend: Backend, raise ValueError("Only tabular classifications tasks " "are currently supported with traditional methods") elif isinstance(self.configuration, Configuration): - self.pipeline_class = autoPyTorch.pipeline.tabular_regression.TabularRegressionPipeline + if self.task_type in TABULAR_TASKS: + self.pipeline_class = autoPyTorch.pipeline.tabular_regression.TabularRegressionPipeline + elif self.task_type in FORECASTING_TASKS: + self.pipeline_class = autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline else: raise ValueError('task {} not available'.format(self.task_type)) self.predict_function = self._predict_regression diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index f4f7709c8..2f8896bb1 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -77,13 +77,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: Returns: A instance of self """ - # Make sure there is an optimizer self.check_requirements(X, y) # Incorporate the transform to the dataset datamanager = X['backend'].load_datamanager() datamanager = self._update_dataset(datamanager) + self.train_transform = self.build_transform(X, mode='train') self.val_transform = self.build_transform(X, mode='val') self.test_transform = self.build_transform(X, mode='test') @@ -131,12 +131,19 @@ def _update_dataset(self, datamanager: TimeSeriesForecastingDataset): """ update the dataset to build time sequence """ + num_features = 
datamanager.num_features + population_size = datamanager.population_size + num_target = datamanager.num_target + X_train, y_train = datamanager.train_tensors val_tensors = datamanager.val_tensors test_tensors = datamanager.test_tensors n_prediction_steps = datamanager.n_prediction_steps - time_series_length, population_size, num_features = X_train.shape + X_train = X_train.reshape([-1, population_size, num_features]) + y_train = y_train.reshape([-1, population_size, num_target]) + + time_series_length = X_train.shape[0] self.population_size = population_size self.num_features = num_features num_datapoints = time_series_length - self.sequence_length - n_prediction_steps + 1 @@ -146,19 +153,28 @@ def _update_dataset(self, datamanager: TimeSeriesForecastingDataset): if test_tensors is not None: X_test, y_test = test_tensors + X_test = X_test.reshape([-1, population_size, num_features]) + y_test = y_test.reshape([-1, population_size, num_target]) + if val_tensors is not None: X_val, y_val = val_tensors + X_val = X_val.reshape([-1, population_size, num_features]) + y_val = y_val.reshape([-1, population_size, num_target]) + + num_datapoints_val = X_val.shape[0] + X_test = np.concatenate([X_val[-self.sequence_length + 1:], X_test]) X_val = np.concatenate([X_train[-self.sequence_length+1:], X_val]) - val_tensors = self._ser2seq(X_val, y_val, num_datapoints, num_features, num_targets) + val_tensors = self._ser2seq(X_val, y_val, num_datapoints_val, num_features, num_targets) datamanager.val_tensors = val_tensors + num_datapoints_test = X_test.shape[0] X_test = np.concatenate([X_train[-self.sequence_length + 1:], X_test]) self.X_val_tail = X_test[-self.sequence_length + 1:] if self.sequence_length > 1 \ else np.zeros((0, population_size, num_features), dtype=X_test.dtype) - test_tensors = self._ser2seq(X_test, y_test, num_datapoints, num_features, num_targets) + test_tensors = self._ser2seq(X_test, y_test, num_datapoints_test, num_features, num_targets) datamanager.test_tensors = test_tensors elif val_tensors is not None: @@ -196,8 +212,8 @@ def _ser2seq(self, X_in, y_in, num_datapoints, num_features, num_targets): [num_datapoints * population_size, num_targets] """ X_in = np.concatenate([np.roll(X_in, shift=i, axis=0) for i in range(0, -self.sequence_length, -1)], - axis=2, - dtype=np.float32)[:num_datapoints] + axis=2, + dtype=np.float32)[:num_datapoints] X_in = X_in.reshape((-1, self.sequence_length, num_features)) y_in = y_in.reshape((-1, num_targets)) return X_in, y_in @@ -210,16 +226,29 @@ def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: """ if X.ndim == 3: X_shape = X.shape - if X_shape[1] == self.population_size and X_shape[-1] == self.num_features: - pass - elif X_shape[-1] == self.num_features and X_shape[0] == self.population_size: + if X_shape[-1] != self.num_features: + raise ValueError("the features of test data is incompatible with the training data") + + if X_shape[1] == self.population_size: + num_points_X_in = X_shape[0] + elif X_shape[0] == self.population_size: + num_points_X_in = X_shape[1] X = np.swapaxes(X, 0, 1) + elif X_shape[1] == 1: + X = X.reshape([-1, self.population_size, self.num_features]) + num_points_X_in = X_shape[0] else: - raise ValueError("the shape of test data is incompatible with the training data") - X = np.concatenate([self.X_val_tail, X]) - X = X.reshape((-1, self.sequence_length, self.num_features)) + raise ValueError("test shape is incompatible with the training shape") else: - raise ValueError("The test data for time series 
forecasting has to be a three-dimensional tensor of shape PxLxM.") + raise ValueError( + "The test data for time series forecasting has to be a three-dimensional tensor of shape PxLxM.") + + X = np.concatenate([self.X_val_tail, X]) + X = np.concatenate([np.roll(X, shift=i, axis=0) for i in range(0, -self.sequence_length, -1)], + axis=2, + dtype=np.float32)[:num_points_X_in] + X = X.reshape((-1, self.sequence_length, self.num_features)) + dataset = BaseDataset( train_tensors=(X, y), From e5b7beb8008d67de9f2ceb8ad9771d4facd1a05e Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 22 Mar 2021 15:08:37 +0100 Subject: [PATCH 029/347] maint --- autoPyTorch/datasets/time_series_dataset.py | 2 +- autoPyTorch/optimizer/smbo.py | 3 +-- .../components/training/metrics/utils.py | 1 - autoPyTorch/utils/pipeline.py | 21 ++++++++++++------- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 97be20d3a..c72596043 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -83,7 +83,7 @@ def __init__(self, X = np.swapaxes(X, 0, 1).reshape(-1, 1, num_features) Y = np.swapaxes(Y, 0, 1).reshape(-1, num_target) if X_test is not None and Y_test is not None: - X_test = np.swapaxes(X_test, 0, 1).reshape(-1, num_features) + X_test = np.swapaxes(X_test, 0, 1).reshape(-1, 1, num_features) Y_test = np.swapaxes(Y_test, 0, 1).reshape(-1, num_target) test_tensors = (X_test, Y_test) else: diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index c2f20f07f..7dc9d989b 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -78,8 +78,7 @@ def get_smac_object( class AutoMLSMBO(object): - - def __init__(self, + def __init__(self, config_space: ConfigSpace.ConfigurationSpace, dataset_name: str, backend: Backend, diff --git a/autoPyTorch/pipeline/components/training/metrics/utils.py b/autoPyTorch/pipeline/components/training/metrics/utils.py index 5729ebbae..3c8208e15 100644 --- a/autoPyTorch/pipeline/components/training/metrics/utils.py +++ b/autoPyTorch/pipeline/components/training/metrics/utils.py @@ -6,7 +6,6 @@ from autoPyTorch.constants import ( CLASSIFICATION_TASKS, REGRESSION_TASKS, - FORECASTING_TASKS, STRING_TO_TASK_TYPES, TASK_TYPES, ) diff --git a/autoPyTorch/utils/pipeline.py b/autoPyTorch/utils/pipeline.py index afb8ed383..3fb166e56 100644 --- a/autoPyTorch/utils/pipeline.py +++ b/autoPyTorch/utils/pipeline.py @@ -10,6 +10,7 @@ STRING_TO_TASK_TYPES, TABULAR_TASKS, TIMESERIES_TASKS, + FORECASTING_TASKS, ) from autoPyTorch.pipeline.image_classification import ImageClassificationPipeline from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline @@ -85,6 +86,12 @@ def _get_regression_dataset_requirements(info: Dict[str, Any], include: Dict[str include=include, exclude=exclude ).get_dataset_requirements() + elif task_type in FORECASTING_TASKS: + return TimeSeriesForecastingPipeline( + dataset_properties=info, + include=include, + exclude=exclude + ).get_dataset_requirements() else: raise ValueError("Task_type not supported") @@ -128,19 +135,19 @@ def get_configuration_space(info: Dict[str, Any], task_type: int = STRING_TO_TASK_TYPES[info['task_type']] if task_type in REGRESSION_TASKS: + if task_type in FORECASTING_TASKS: + return _get_forecasting_configuration_space(info, + include if include is not None else {}, + exclude if exclude is not None else {}, + 
search_space_updates=search_space_updates + ) return _get_regression_configuration_space(info, include if include is not None else {}, exclude if exclude is not None else {}, search_space_updates=search_space_updates ) - elif task_type in CLASSIFICATION_TASKS: - return _get_classification_configuration_space(info, - include if include is not None else {}, - exclude if exclude is not None else {}, - search_space_updates=search_space_updates - ) else: - return _get_forecasting_configuration_space(info, + return _get_classification_configuration_space(info, include if include is not None else {}, exclude if exclude is not None else {}, search_space_updates=search_space_updates From 55086a7c3ca1cdefd53d0243a288c5af5834ec2e Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 24 Mar 2021 13:47:11 +0100 Subject: [PATCH 030/347] maint --- .../time_series_forecasting_data_loader.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 2f8896bb1..7569570ea 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -164,6 +164,7 @@ def _update_dataset(self, datamanager: TimeSeriesForecastingDataset): num_datapoints_val = X_val.shape[0] + #TODO needs to be fixed here!!! X_test = np.concatenate([X_val[-self.sequence_length + 1:], X_test]) X_val = np.concatenate([X_train[-self.sequence_length+1:], X_val]) val_tensors = self._ser2seq(X_val, y_val, num_datapoints_val, num_features, num_targets) @@ -172,23 +173,21 @@ def _update_dataset(self, datamanager: TimeSeriesForecastingDataset): num_datapoints_test = X_test.shape[0] X_test = np.concatenate([X_train[-self.sequence_length + 1:], X_test]) self.X_val_tail = X_test[-self.sequence_length + 1:] if self.sequence_length > 1 \ - else np.zeros((0, population_size, num_features), dtype=X_test.dtype) + else np.zeros((0, population_size, num_features)).astype(dtype=X_test.dtype) test_tensors = self._ser2seq(X_test, y_test, num_datapoints_test, num_features, num_targets) datamanager.test_tensors = test_tensors elif val_tensors is not None: X_val, y_val = val_tensors - X_val = np.concatenate([X_train[-self.sequence_length+1:], X_val]) + X_val = np.concatenate([X_train[-self.sequence_length + self.n_prediction_steps - 1:], X_val]) # used for prediction - self.X_val_tail = X_val[-self.sequence_length+1:] if self.sequence_length > 1 \ - else np.zeros((0, population_size, num_features), dtype=X_val.dtype) + self.X_val_tail = X_val[-self.sequence_length + self.n_prediction_steps - 1:] val_tensors = self._ser2seq(X_val, y_val, num_datapoints, num_features, num_targets) datamanager.val_tensors = val_tensors else: - self.X_val_tail = X_train[-self.sequence_length+1:] if self.sequence_length > 1 \ - else np.zeros((0, population_size, num_features), dtype=X_train.dtype) + self.X_val_tail = X_train[-self.sequence_length + self.n_prediction_steps - 1:] train_tensors = self._ser2seq(X_train, y_train, num_datapoints, num_features, num_targets) datamanager.train_tensors = train_tensors @@ -212,8 +211,7 @@ def _ser2seq(self, X_in, y_in, num_datapoints, num_features, num_targets): [num_datapoints * population_size, num_targets] """ X_in = np.concatenate([np.roll(X_in, shift=i, axis=0) for i in range(0, 
-self.sequence_length, -1)], - axis=2, - dtype=np.float32)[:num_datapoints] + axis=2).astype(np.float32)[:num_datapoints] X_in = X_in.reshape((-1, self.sequence_length, num_features)) y_in = y_in.reshape((-1, num_targets)) return X_in, y_in @@ -245,8 +243,7 @@ def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: X = np.concatenate([self.X_val_tail, X]) X = np.concatenate([np.roll(X, shift=i, axis=0) for i in range(0, -self.sequence_length, -1)], - axis=2, - dtype=np.float32)[:num_points_X_in] + axis=2).astype(np.float32)[:num_points_X_in] X = X.reshape((-1, self.sequence_length, self.num_features)) From 6bf49c6204819c1c39b33843a8f6eed3e44aaf34 Mon Sep 17 00:00:00 2001 From: Marius Lindauer Date: Wed, 28 Apr 2021 12:54:52 +0200 Subject: [PATCH 031/347] Update README.md --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index dc10911bf..c73e53d2b 100644 --- a/README.md +++ b/README.md @@ -147,6 +147,17 @@ along with this program (see LICENSE file). ## Reference +```bibtex + @article{zimmer-tpami21a, + author = {Lucas Zimmer and Marius Lindauer and Frank Hutter}, + title = {Auto-PyTorch Tabular: Multi-Fidelity MetaLearning for Efficient and Robust AutoDL}, + journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + year = {2021}, + note = { IEEE early access; To appear}, + pages = {1-12} +} +``` + ```bibtex @incollection{mendoza-automlbook18a, author = {Hector Mendoza and Aaron Klein and Matthias Feurer and Jost Tobias Springenberg and Matthias Urban and Michael Burkart and Max Dippel and Marius Lindauer and Frank Hutter}, From 70ce56bdd955243fbd6fa9876ba657de427dca48 Mon Sep 17 00:00:00 2001 From: Marius Lindauer Date: Wed, 5 May 2021 16:25:02 +0200 Subject: [PATCH 032/347] Update README.md --- README.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c73e53d2b..3e67677ef 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,15 @@ # Auto-PyTorch -Copyright (C) 2019 [AutoML Group Freiburg](http://www.automl.org/) +Copyright (C) 2021 [AutoML Groups Freiburg and Hannover](http://www.automl.org/) -This a very early pre-alpha version of our upcoming Auto-PyTorch. -So far, Auto-PyTorch supports featurized data (classification, regression) and image data (classification). +While early AutoML frameworks focused on optimizing traditional ML pipelines and their hyperparameters, another trend in AutoML is to focus on neural architecture search. To bring the best of these two worlds together, we developed **Auto-PyTorch**, which jointly and robustly optimizes the network architecture and the training hyperparameters to enable fully automated deep learning (AutoDL). -The newest features in Auto-PyTorch for tabular data are described in the paper ["Auto-PyTorch Tabular: Multi-Fidelity MetaLearning for Efficient and Robust AutoDL"](https://arxiv.org/abs/2006.13799). +Auto-PyTorch is mainly developed to support tabular data (classification, regression), but can also be applied to image data (classification). +The newest features in Auto-PyTorch for tabular data are described in the paper ["Auto-PyTorch Tabular: Multi-Fidelity MetaLearning for Efficient and Robust AutoDL"](https://arxiv.org/abs/2006.13799) (see below of bibtex ref). 
+ +## Alpha Status of Next Release + +The upcoming release of Auto-PyTorch will further improve usability, robustness and efficiency by using SMAC as the underlying optimization package, changing the code structure and other improvements. If you would like to try to give it a try, check out the `refactor` branch. ## Installation @@ -173,9 +177,6 @@ along with this program (see LICENSE file). } ``` -**Note**: Previously, the name of the project was AutoNet. Since this was too generic, we changed the name to AutoPyTorch. AutoNet 2.0 in the reference mention above is indeed AutoPyTorch. - - ## Contact -Auto-PyTorch is developed by the [AutoML Group of the University of Freiburg](http://www.automl.org/). +Auto-PyTorch is developed by the [AutoML Groups of the University of Freiburg and Hannover](http://www.automl.org/). From 865feb37a65cc793a5b1fd7d88bf2ad01fd691ac Mon Sep 17 00:00:00 2001 From: Frank Date: Wed, 5 May 2021 20:29:41 +0200 Subject: [PATCH 033/347] Update README.md --- README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 3e67677ef..6834a43a3 100644 --- a/README.md +++ b/README.md @@ -5,11 +5,11 @@ Copyright (C) 2021 [AutoML Groups Freiburg and Hannover](http://www.automl.org/ While early AutoML frameworks focused on optimizing traditional ML pipelines and their hyperparameters, another trend in AutoML is to focus on neural architecture search. To bring the best of these two worlds together, we developed **Auto-PyTorch**, which jointly and robustly optimizes the network architecture and the training hyperparameters to enable fully automated deep learning (AutoDL). Auto-PyTorch is mainly developed to support tabular data (classification, regression), but can also be applied to image data (classification). -The newest features in Auto-PyTorch for tabular data are described in the paper ["Auto-PyTorch Tabular: Multi-Fidelity MetaLearning for Efficient and Robust AutoDL"](https://arxiv.org/abs/2006.13799) (see below of bibtex ref). +The newest features in Auto-PyTorch for tabular data are described in the paper ["Auto-PyTorch Tabular: Multi-Fidelity MetaLearning for Efficient and Robust AutoDL"](https://arxiv.org/abs/2006.13799) (see below for bibtex ref). ## Alpha Status of Next Release -The upcoming release of Auto-PyTorch will further improve usability, robustness and efficiency by using SMAC as the underlying optimization package, changing the code structure and other improvements. If you would like to try to give it a try, check out the `refactor` branch. +The upcoming release of Auto-PyTorch will further improve usability, robustness and efficiency by using SMAC as the underlying optimization package, changing the code structure and other improvements. If you would like to give it a try, check out the `refactor` branch. ## Installation @@ -157,7 +157,7 @@ along with this program (see LICENSE file). title = {Auto-PyTorch Tabular: Multi-Fidelity MetaLearning for Efficient and Robust AutoDL}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, year = {2021}, - note = { IEEE early access; To appear}, + note = {IEEE early access; also available under https://arxiv.org/abs/2006.13799}, pages = {1-12} } ``` @@ -172,8 +172,7 @@ along with this program (see LICENSE file). 
booktitle = {AutoML: Methods, Sytems, Challenges}, publisher = {Springer}, chapter = {7}, - pages = {141--156}, - note = {To appear.}, + pages = {141--156} } ``` From a3a3257990624a0bf4be80dd7fba31e7cfb5be00 Mon Sep 17 00:00:00 2001 From: Francisco Rivera Valverde <44504424+franchuterivera@users.noreply.github.com> Date: Wed, 12 May 2021 15:20:52 +0200 Subject: [PATCH 034/347] [FIX] master branch README (#209) --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6834a43a3..d28dbb45d 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ The newest features in Auto-PyTorch for tabular data are described in the paper ## Alpha Status of Next Release -The upcoming release of Auto-PyTorch will further improve usability, robustness and efficiency by using SMAC as the underlying optimization package, changing the code structure and other improvements. If you would like to give it a try, check out the `refactor` branch. +The upcoming release of Auto-PyTorch will further improve usability, robustness and efficiency by using SMAC as the underlying optimization package, changing the code structure and other improvements. If you would like to give it a try, check out the `development` branch or it's [documentation](https://automl.github.io/Auto-PyTorch/development/). ## Installation @@ -20,10 +20,10 @@ $ cd install/path $ git clone https://github.com/automl/Auto-PyTorch.git $ cd Auto-PyTorch ``` -If you want to contribute to this repository switch to our current develop branch +If you want to contribute to this repository switch to our current development branch ```sh -$ git checkout develop +$ git checkout development ``` Install pytorch: From 8aa5257109577e5593bfbc45fa9934ad03633d24 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 29 Jun 2021 13:44:44 +0200 Subject: [PATCH 035/347] dataset size as budget --- autoPyTorch/api/base_task.py | 6 ++- autoPyTorch/api/time_series_forecasting.py | 6 ++- autoPyTorch/evaluation/abstract_evaluator.py | 3 ++ autoPyTorch/evaluation/tae.py | 19 ++++--- autoPyTorch/optimizer/smbo.py | 14 +++-- .../time_series_forecasting_data_loader.py | 52 +++++++++++++------ 6 files changed, 73 insertions(+), 27 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 738ab6161..cf5c5b464 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -701,7 +701,6 @@ def _search( self """ - if self.task_type != dataset.task_type: raise ValueError("Incompatible dataset entered for current task," "expected dataset to have task type :{} got " @@ -736,12 +735,16 @@ def _search( budget_config: Dict[str, Union[float, str]] = {} if budget_type is not None and budget is not None: + # budget type is specified by the user budget_config['budget_type'] = budget_type budget_config[budget_type] = budget elif budget_type is not None or budget is not None: raise ValueError( "budget type was not specified in budget_config" ) + else: + # TODO do meta learning here to determine the fiedlity to use? 
+ pass if self.task_type is None: raise ValueError("Cannot interpret task type from the dataset") @@ -755,6 +758,7 @@ def _search( else: self._is_dask_client_internally_created = False + # ============> Run dummy predictions num_run = 1 dummy_task_name = 'runDummy' diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index ad7f5bc7c..6d05b7822 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -89,6 +89,10 @@ def __init__( task_type=TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING], ) self.n_prediction_steps = n_prediction_steps + # here fraction of subset could be number of images, tabular data or resolution of time-series datasets. + #TODO if budget type dataset_size is applied to all datasets, we will put it to configs + self.pipeline_options.update({"min_fraction_subset": 0.1, + "fraction_subset": 1.0}) def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]: if not isinstance(dataset, TimeSeriesForecastingDataset): @@ -140,7 +144,7 @@ def search( evaluate a pipeline. budget_type (Optional[str]): Type of budget to be used when fitting the pipeline. - Either 'epochs' or 'runtime'. If not provided, uses + Either 'epochs' or 'runtime' or 'dataset_size'. If not provided, uses the default in the pipeline config ('epochs') budget (Optional[float]): Budget to fit a single run of the pipeline. If not diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index b75426537..56124c3a0 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -323,6 +323,9 @@ def __init__(self, backend: Backend, # If the budget is epochs, we want to limit that in the fit dictionary if self.budget_type == 'epochs': self.fit_dictionary['epochs'] = budget + if self.budget_type == 'dataset_size': + if self.task_type in TIMESERIES_TASKS: + self.fit_dictionary['sample_interval'] = int(np.ceil(1.0 / budget)) self.num_run = 0 if num_run is None else num_run diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index fdb33d9a0..2e5e90563 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -198,12 +198,19 @@ def run_wrapper( 'If budget_type is None, budget must be.0, but is %f' % run_info.budget ) else: - if run_info.budget == 0: - run_info = run_info._replace(budget=100.0) - elif run_info.budget <= 0 or run_info.budget > 100: - raise ValueError('Illegal value for budget, must be >0 and <=100, but is %f' % - run_info.budget) - if self.budget_type not in ('epochs', 'runtime'): + if self.budget_type in ('epochs', 'runtime'): + if run_info.budget == 0: + run_info = run_info._replace(budget=100.0) + elif run_info.budget <= 0 or run_info.budget > 100: + raise ValueError('Illegal value for budget, must be >0 and <=100, but is %f' % + run_info.budget) + elif self.budget_type == 'dataset_size': + if run_info.budget == 0: + run_info = run_info._replace(budget=1.0) + elif run_info.budget <= 0 or run_info.budget > 1.: + raise ValueError('Illegal value for budget, must be >0 and <=100, but is %f' % + run_info.budget) + else: raise ValueError("Illegal value for budget type, must be one of " "('epochs', 'runtime'), but is : %s" % self.budget_type) diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 7dc9d989b..f4d6a06ec 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -301,9 +301,17 @@ def 
run_smbo(self, func: typing.Optional[typing.Callable] = None self.smac_scenario_args[arg] ) scenario_dict.update(self.smac_scenario_args) - - initial_budget = self.pipeline_config['min_epochs'] - max_budget = self.pipeline_config['epochs'] + budget_type = self.pipeline_config['budget_type'] + if budget_type == 'epochs': + initial_budget = self.pipeline_config['min_epochs'] + max_budget = self.pipeline_config['epochs'] + elif budget_type == 'dataset_size': + initial_budget = self.pipeline_config.get('min_fraction_subset', 0.1) + max_budget = self.pipeline_config.get('fraction_subset', 1.0) + else: + raise ValueError("Illegal value for budget type, must be one of " + "('epochs', 'runtime'), but is : %s" % + budget_type) if self.get_smac_object_callback is not None: smac = self.get_smac_object_callback(scenario_dict=scenario_dict, diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 7569570ea..3f44dc355 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -36,12 +36,27 @@ class TimeSeriesForecastingDataLoader(TimeSeriesDataLoader): def __init__(self, batch_size: int = 64, sequence_length: int = 1, + #sample_interval: int = 1, upper_sequence_length: int = np.iinfo(np.int32).max, n_prediction_steps: int = 1) -> None: + """ + initialize a dataloader + Args: + batch_size: batch size + sequence_length: length of each sequence + sample_interval: sample interval ,its value is the interval of the resolution + upper_sequence_length: upper limit of sequence length, to avoid a sequence length larger than dataset length + or specified by the users + n_prediction_steps: how many stpes to predict in advance + """ super().__init__(batch_size=batch_size) self.sequence_length: int = sequence_length self.upper_seuqnce_length = upper_sequence_length self.n_prediction_steps = n_prediction_steps + self.sample_interval = 1 + # length of the tail, for instance if a sequence_length = 2, sample_interval =2, n_prediction = 2, + # the time sequence should look like: [X, y, X, y, y] [test_data](values in tail is marked with X) + self.tail_length = (self.sequence_length * self.sample_interval) + self.n_prediction_steps - 1 def transform(self, X: np.ndarray) -> np.ndarray: """The transform function calls the transform function of the @@ -77,6 +92,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: Returns: A instance of self """ + fraction_subset = X.get('fraction_subset', 1.0) + self.sample_interval = int(np.ceil(1.0 / fraction_subset)) + print("!"*50) + print(self.sample_interval) + print("#"*50) + self.tail_length = (self.sequence_length * self.sample_interval) + self.n_prediction_steps - 1 + # Make sure there is an optimizer self.check_requirements(X, y) @@ -96,7 +118,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: train=False, ) - if X['dataset_properties']["is_small_preprocess"]: # This parameter indicates that the data has been pre-processed for speed # Overwrite the datamanager with the pre-processes data @@ -146,10 +167,10 @@ def _update_dataset(self, datamanager: TimeSeriesForecastingDataset): time_series_length = X_train.shape[0] self.population_size = population_size self.num_features = num_features - num_datapoints = time_series_length - 
self.sequence_length - n_prediction_steps + 1 + num_datapoints_train = time_series_length - (self.sequence_length - 1) * self.sample_interval - n_prediction_steps + 1 num_targets = y_train.shape[-1] - y_train = y_train[-num_datapoints:, :] + y_train = y_train[-num_datapoints_train:, :] if test_tensors is not None: X_test, y_test = test_tensors @@ -164,15 +185,15 @@ def _update_dataset(self, datamanager: TimeSeriesForecastingDataset): num_datapoints_val = X_val.shape[0] - #TODO needs to be fixed here!!! - X_test = np.concatenate([X_val[-self.sequence_length + 1:], X_test]) - X_val = np.concatenate([X_train[-self.sequence_length+1:], X_val]) + X_val = np.concatenate([X_train[-self.tail_length:], X_val]) + X_test = np.concatenate([X_val[-self.tail_length:], X_test]) val_tensors = self._ser2seq(X_val, y_val, num_datapoints_val, num_features, num_targets) datamanager.val_tensors = val_tensors num_datapoints_test = X_test.shape[0] - X_test = np.concatenate([X_train[-self.sequence_length + 1:], X_test]) - self.X_val_tail = X_test[-self.sequence_length + 1:] if self.sequence_length > 1 \ + + X_test = np.concatenate([X_train[-self.tail_length:], X_test]) + self.X_val_tail = X_test[-self.tail_length:] if self.tail_length > 1 \ else np.zeros((0, population_size, num_features)).astype(dtype=X_test.dtype) test_tensors = self._ser2seq(X_test, y_test, num_datapoints_test, num_features, num_targets) @@ -180,16 +201,16 @@ def _update_dataset(self, datamanager: TimeSeriesForecastingDataset): elif val_tensors is not None: X_val, y_val = val_tensors - X_val = np.concatenate([X_train[-self.sequence_length + self.n_prediction_steps - 1:], X_val]) + X_val = np.concatenate([X_train[-self.tail_length:], X_val]) # used for prediction - self.X_val_tail = X_val[-self.sequence_length + self.n_prediction_steps - 1:] - val_tensors = self._ser2seq(X_val, y_val, num_datapoints, num_features, num_targets) + self.X_val_tail = X_val[-self.tail_length:] + val_tensors = self._ser2seq(X_val, y_val, num_datapoints_train, num_features, num_targets) datamanager.val_tensors = val_tensors else: - self.X_val_tail = X_train[-self.sequence_length + self.n_prediction_steps - 1:] + self.X_val_tail = X_train[-self.tail_length:] - train_tensors = self._ser2seq(X_train, y_train, num_datapoints, num_features, num_targets) + train_tensors = self._ser2seq(X_train, y_train, num_datapoints_train, num_features, num_targets) datamanager.train_tensors = train_tensors datamanager.splits = datamanager.get_splits_from_resampling_strategy() return datamanager @@ -210,7 +231,7 @@ def _ser2seq(self, X_in, y_in, num_datapoints, num_features, num_targets): y_in_trans: transformed input target array with shape [num_datapoints * population_size, num_targets] """ - X_in = np.concatenate([np.roll(X_in, shift=i, axis=0) for i in range(0, -self.sequence_length, -1)], + X_in = np.concatenate([np.roll(X_in, shift=i * self.sample_interval, axis=0) for i in range(0, -self.sequence_length, -1)], axis=2).astype(np.float32)[:num_datapoints] X_in = X_in.reshape((-1, self.sequence_length, num_features)) y_in = y_in.reshape((-1, num_targets)) @@ -226,7 +247,6 @@ def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: X_shape = X.shape if X_shape[-1] != self.num_features: raise ValueError("the features of test data is incompatible with the training data") - if X_shape[1] == self.population_size: num_points_X_in = X_shape[0] elif X_shape[0] == self.population_size: @@ -242,7 +262,7 @@ def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = 
None, batch_size: "The test data for time series forecasting has to be a three-dimensional tensor of shape PxLxM.") X = np.concatenate([self.X_val_tail, X]) - X = np.concatenate([np.roll(X, shift=i, axis=0) for i in range(0, -self.sequence_length, -1)], + X = np.concatenate([np.roll(X, shift=i * self.sample_interval, axis=0) for i in range(0, -self.sequence_length, -1)], axis=2).astype(np.float32)[:num_points_X_in] X = X.reshape((-1, self.sequence_length, self.num_features)) From 3871e7a6807e5b50f9ee2f8c9d83adae12a6c755 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 29 Jun 2021 13:48:11 +0200 Subject: [PATCH 036/347] remove redudant codes --- .../data_loader/time_series_forecasting_data_loader.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 3f44dc355..fa96dbc73 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -92,11 +92,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: Returns: A instance of self """ - fraction_subset = X.get('fraction_subset', 1.0) - self.sample_interval = int(np.ceil(1.0 / fraction_subset)) - print("!"*50) - print(self.sample_interval) - print("#"*50) + sample_interval = X.get('sample_interval', 1) + self.sample_interval = sample_interval self.tail_length = (self.sequence_length * self.sample_interval) + self.n_prediction_steps - 1 # Make sure there is an optimizer From 295c538d715cf1b24898fbbf17d81698a17953c0 Mon Sep 17 00:00:00 2001 From: Francisco Rivera Valverde <44504424+franchuterivera@users.noreply.github.com> Date: Tue, 29 Jun 2021 14:38:21 +0200 Subject: [PATCH 037/347] Enable github actions (#273) --- .github/workflows/long_regression_test.yml | 35 ++++++++++++++++++++++ .github/workflows/scheduled_test.yml | 34 +++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 .github/workflows/long_regression_test.yml create mode 100644 .github/workflows/scheduled_test.yml diff --git a/.github/workflows/long_regression_test.yml b/.github/workflows/long_regression_test.yml new file mode 100644 index 000000000..135c45fb0 --- /dev/null +++ b/.github/workflows/long_regression_test.yml @@ -0,0 +1,35 @@ +name: Tests + +on: + schedule: + # Every Truesday at 7AM UTC + # TODO teporary set to every day just for the PR + #- cron: '0 07 * * 2' + - cron: '0 07 * * *' + + +jobs: + ubuntu: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.8] + fail-fast: false + + steps: + - uses: actions/checkout@v2 + with: + ref: development + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install test dependencies + run: | + git submodule update --init --recursive + python -m pip install --upgrade pip + pip install -e .[test] + - name: Run tests + run: | + python -m pytest --durations=200 cicd/test_preselected_configs.py -vs diff --git a/.github/workflows/scheduled_test.yml b/.github/workflows/scheduled_test.yml new file mode 100644 index 000000000..68f37d72d --- /dev/null +++ b/.github/workflows/scheduled_test.yml @@ -0,0 +1,34 @@ +name: Tests + +on: + schedule: + # Every Monday at 7AM UTC + - cron: '0 07 * * 1' + + +jobs: + ubuntu: + + runs-on: 
ubuntu-latest + strategy: + matrix: + python-version: [3.8] + fail-fast: false + max-parallel: 2 + + steps: + - uses: actions/checkout@v2 + with: + ref: development + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install test dependencies + run: | + git submodule update --init --recursive + python -m pip install --upgrade pip + pip install -e .[test] + - name: Run tests + run: | + python -m pytest --forked --durations=20 --timeout=600 --timeout-method=signal -v test \ No newline at end of file From e2dafb61946040c1990bc4093840d5bac4ccd45a Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 8 Jul 2021 16:03:49 +0200 Subject: [PATCH 038/347] allow TS data with different sequence lengthes --- autoPyTorch/api/time_series_forecasting.py | 7 +- ...me_series_forecasting_feature_validator.py | 115 ++++++-- ...ime_series_forecasting_target_validator.py | 70 ++++- autoPyTorch/data/time_series_validator.py | 8 + autoPyTorch/datasets/resampling_strategy.py | 24 +- autoPyTorch/datasets/time_series_dataset.py | 278 +++++++++++++----- .../TimeSeriesTransformer.py | 5 +- .../network_backbone/base_network_backbone.py | 16 +- .../time_series_forecasting_data_loader.py | 201 +++++++------ 9 files changed, 512 insertions(+), 212 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 6d05b7822..27182e547 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -63,11 +63,10 @@ def __init__( delete_output_folder_after_terminate: bool = True, include_components: Optional[Dict] = None, exclude_components: Optional[Dict] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, backend: Optional[Backend] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - n_prediction_steps: int = 1, ): super().__init__( seed=seed, @@ -88,7 +87,6 @@ def __init__( search_space_updates=search_space_updates, task_type=TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING], ) - self.n_prediction_steps = n_prediction_steps # here fraction of subset could be number of images, tabular data or resolution of time-series datasets. 
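As a point of reference, here is a standalone sketch (not part of this patch) of how a fractional `dataset_size` budget would translate into the down-sampling interval used by the forecasting data loader, following the `int(np.ceil(1.0 / budget))` rule introduced in the evaluator change above; the budget values below are arbitrary examples:

```python
import numpy as np

for budget in (0.1, 0.25, 0.5, 1.0):              # fraction of the series resolution to keep
    sample_interval = int(np.ceil(1.0 / budget))  # same mapping as in abstract_evaluator.py
    print(f"budget={budget} -> sample_interval={sample_interval}")
# budget=0.1 -> sample_interval=10
# budget=0.25 -> sample_interval=4
# budget=0.5 -> sample_interval=2
# budget=1.0 -> sample_interval=1
```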
#TODO if budget type dataset_size is applied to all datasets, we will put it to configs self.pipeline_options.update({"min_fraction_subset": 0.1, @@ -112,6 +110,7 @@ def search( X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, #target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, + n_prediction_steps: int= 1, dataset_name: Optional[str] = None, budget_type: Optional[str] = None, budget: Optional[float] = None, @@ -217,7 +216,7 @@ def search( validator=self.InputValidator, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, - n_prediction_steps=self.n_prediction_steps, + n_prediction_steps=n_prediction_steps, ) if traditional_per_total_budget > 0.: diff --git a/autoPyTorch/data/time_series_forecasting_feature_validator.py b/autoPyTorch/data/time_series_forecasting_feature_validator.py index 9f703254e..f41d4ce9c 100644 --- a/autoPyTorch/data/time_series_forecasting_feature_validator.py +++ b/autoPyTorch/data/time_series_forecasting_feature_validator.py @@ -1,4 +1,10 @@ -from typing import Optional +from typing import Optional, Union, List +import logging +import copy +import sklearn.utils + + +from autoPyTorch.utils.logging_ import PicklableClientLogger import numpy as np @@ -10,10 +16,18 @@ class TimeSeriesForecastingFeatureValidator(TabularFeatureValidator): + def __init__(self, + logger: Optional[Union[PicklableClientLogger, logging.Logger + ]] = None, + ) -> None: + TabularFeatureValidator.__init__(self, logger) + self._extend_feature_dims = False + def fit(self, - X_train: np.ndarray, + X_train: Union[np.ndarray, List[np.ndarray]], X_test: Optional[np.ndarray] = None) -> BaseEstimator: """ + We expect a time series dataset stored in the form :[population, time_series, features] Arguments: X_train (np.ndarray): @@ -27,16 +41,18 @@ def fit(self, self: The fitted base estimator """ - - if not isinstance(X_train, np.ndarray): - raise ValueError(f"Time series train data must be given as a numpy array, but got {type(X_train)}") - - if X_train.ndim != 3: - raise ValueError(f"Invalid number of dimensions for time series train data, " - f"expected 3 but got {X_train.ndim}. " - f"Time series data has to be of shape [B, T, F] where B is the " - f"batch dimension, T is the time dimension and F are the number of features.") - + # TODO only allow np.ndarray(3D) and List of np.ndarray(2D) or array of array (2D) to reduce complexity!!!!! + if isinstance(X_train, np.ndarray): + if X_train.ndim > 3: + raise ValueError(f"Number of dimensions too large for time series train data.") + if X_train.ndim == 1: + self.validate_ts_data(X_train) + elif isinstance(X_train, list): + self.validate_ts_data(X_train) + else: + raise ValueError(f"Time series train data must be given as a numpy array or nested list," + f" but got {type(X_train)}") + """ _ = sklearn.utils.check_array( X_train, force_all_finite=True, @@ -45,22 +61,19 @@ def fit(self, accept_sparse=False, accept_large_sparse=False ) - + """ if X_test is not None: - if not isinstance(X_test, np.ndarray): - raise ValueError(f"Time series test data must be given as a numpy array, but got {type(X_test)}") - - if not X_test.ndim == 3: - raise ValueError(f"Invalid number of dimensions for time series test data, " - f"expected 3 but got {X_train.ndim}. 
" - f"Time series data has to be of shape [B, T, F] where B is the " - f"batch dimension, T is the time dimension and F are the number of features") - - if X_train.shape[0] != X_test.shape[0] or X_train.shape[-1] != X_test.shape[-1]: - raise ValueError(f"Time series train and test data are expected to have the same shape except for " - f"the sequence length, but got {X_train.shape} for train data and " - f"{X_test.shape} for test data") - + if isinstance(X_test, np.ndarray): + if X_test.ndim > 3: + raise ValueError(f"Number of dimensions too large for time series train data.") + if X_test.ndim == 1: + self.validate_ts_data(X_test) + elif isinstance(X_test, list): + self.validate_ts_data(X_test) + else: + raise ValueError(f"Time series train data must be given as a numpy array or nested list," + f" but got {type(X_test)}") + """ _ = sklearn.utils.check_array( X_test, force_all_finite=True, @@ -69,12 +82,52 @@ def fit(self, accept_sparse=False, accept_large_sparse=False ) - self._fit(X_train[0]) + """ + first_sequence = np.array(X_train[0]) + + if self._extend_feature_dims: + first_sequence = np.expand_dims(first_sequence, axis=-1) + self.n_feature_dims = 1 + self._fit(first_sequence) self._is_fitted = True return self + def validate_ts_data(self, X, is_train_set=True): + n_feature_dims = [0] * len(X) + seq_ndims = [0] * len(X) + for idx_seq, x in enumerate(X): + x_array_shape = np.array(x).shape + x_array_n_dims = len(x_array_shape) + seq_ndims[idx_seq] = x_array_n_dims + + if x_array_n_dims == 1: + # As lots of time series prediction tasks only have one sequence feature, we will not raise an error here + #self.logger.warning(f"For each piece of time series data, we will automatically convert 1D vector to" + # f"2D matrix!") + self._extend_feature_dims = True + n_feature_dims[idx_seq] = 1 + elif x_array_n_dims > 2: + raise ValueError(f"Invalid number of dimensions for time series train data") + else: + n_feature_dims[idx_seq] = x_array_shape[-1] + + + if not np.all(np.asarray(seq_ndims) == seq_ndims[0]): + raise ValueError(f"All the sequence needs to have the same shape!") + if not np.all(np.asarray(n_feature_dims) == n_feature_dims[0]): + raise ValueError(f"Number of features does not match for all the sequence") + + if is_train_set: + self.n_feature_dims = n_feature_dims[0] + self.seq_ndims = seq_ndims[0] + else: + if seq_ndims[0] != self.seq_ndims: + raise ValueError("number of sequence dimensions does not match for training and test sets!") + if n_feature_dims[0] != self.n_feature_dims: + raise ValueError("number of feature dimensions does not match for training and test sets!") + def transform(self, X: np.ndarray) -> np.ndarray: """ @@ -89,6 +142,11 @@ def transform(self, X: np.ndarray) -> np.ndarray: if not self._is_fitted: raise NotFittedError("Cannot call transform on a validator that is not fitted") + if self._extend_feature_dims: + for seq_idx in range(len(X)): + X[seq_idx] = np.expand_dims(X[seq_idx], axis=-1) + return X + """ return sklearn.utils.check_array( X, force_all_finite=True, @@ -97,4 +155,5 @@ def transform(self, X: np.ndarray) -> np.ndarray: accept_sparse=False, accept_large_sparse=False ) + """ diff --git a/autoPyTorch/data/time_series_forecasting_target_validator.py b/autoPyTorch/data/time_series_forecasting_target_validator.py index 7010731bf..e642fcfbe 100644 --- a/autoPyTorch/data/time_series_forecasting_target_validator.py +++ b/autoPyTorch/data/time_series_forecasting_target_validator.py @@ -1,7 +1,8 @@ from autoPyTorch.data.tabular_target_validator import 
TabularTargetValidator +import copy import typing - +import logging import numpy as np import pandas as pd @@ -16,9 +17,18 @@ from sklearn.utils.multiclass import type_of_target from autoPyTorch.data.base_target_validator import BaseTargetValidator, SUPPORTED_TARGET_TYPES +from autoPyTorch.utils.logging_ import PicklableClientLogger class TimeSeriesForecastingTargetValidator(TabularTargetValidator): + def __init__(self, + is_classification: bool = False, + logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger + ]] = None, + ) -> None: + TabularTargetValidator.__init__(self, is_classification, logger) + self._extend_feature_dims = False + def fit( self, y_train: SUPPORTED_TARGET_TYPES, @@ -36,19 +46,34 @@ def fit( categories of the encoder. """ # Check that the data is valid - self._check_data(y_train) + y_train_first_seq = np.array(y_train[0]) + if len(y_train_first_seq.shape) == 1: + y_train_first_seq = np.expand_dims(y_train_first_seq, -1) + self._check_data(y_train_first_seq) + + num_seq = len(y_train) + + if len(np.shape(y_train[0])) == 1: + self._extend_feature_dims = True + num_target_train = [1] * num_seq + else: + num_target_train = [0] * num_seq + for seq_idx in range(num_seq): + num_target_train[seq_idx] = np.shape(y_train[seq_idx])[-1] - shape = np.shape(y_train) if y_test is not None: - self._check_data(y_test) - - if len(shape) != len(np.shape(y_test)) or ( - len(shape) > 1 and (shape[0] != np.shape(y_test)[0] or shape[-1] != np.shape(y_test)[-1])): - raise ValueError("The dimensionality of the train and test targets " - "does not match train({}) != test({})".format( - np.shape(y_train), - np.shape(y_test) - )) + self._check_data(y_test[0]) + if len(y_train) != len(y_test): + raise ValueError("Training test must have the same amount of sequences as test set!") + if len(np.shape(y_train[1])) == 1: + num_target_test = [1] * num_seq + else: + num_target_test = [0] * num_seq + for seq_idx in range(num_seq): + test_seq = y_test[seq_idx] + test_seq_shape = np.shape(test_seq) + num_target_test[seq_idx] = test_seq_shape[-1] + if isinstance(y_train, pd.DataFrame): y_train = typing.cast(pd.DataFrame, y_train) y_test = typing.cast(pd.DataFrame, y_test) @@ -64,10 +89,16 @@ def fit( if list(y_train.dtypes) != list(y_test.dtypes): raise ValueError("Train and test targets must both have the same dtypes") + if not np.all(np.asarray(num_target_test) == num_target_test[0]): + raise ValueError("Test sets have inconsistent number of targets") + + if not np.all(np.asarray(num_target_train) == num_target_train[0]): + raise ValueError("Train sets have inconsistent number of targets") + if self.out_dimensionality is None: - self.out_dimensionality = 1 if len(shape) == 1 else shape[1] + self.out_dimensionality = 1 if self._extend_feature_dims else num_target_train[0] else: - _n_outputs = 1 if len(shape) == 1 else shape[1] + _n_outputs = 1 if self._extend_feature_dims else num_target_train[0] if self.out_dimensionality != _n_outputs: raise ValueError('Number of outputs changed from %d to %d!' 
% (self.out_dimensionality, _n_outputs)) @@ -87,8 +118,16 @@ def transform( raise NotFittedError("Cannot call transform on a validator that is not fitted") # Check the data here so we catch problems on new test data - self._check_data(y) + y_first_seq = np.array(y[0]) + if len(y_first_seq.shape) == 1: + y_train_first_seq = np.expand_dims(y_first_seq, -1) + + self._check_data(y_train_first_seq) + if self._extend_feature_dims: + for seq_idx in range(len(y)): + y[seq_idx] = np.expand_dims(y[seq_idx], axis=-1) + """ # sklearn check array will make sure we have the # correct numerical features for the array # Also, a numpy array will be created @@ -100,6 +139,7 @@ def transform( accept_sparse=False, accept_large_sparse=False ) + """ return y """ diff --git a/autoPyTorch/data/time_series_validator.py b/autoPyTorch/data/time_series_validator.py index 62be3318f..373a6a740 100644 --- a/autoPyTorch/data/time_series_validator.py +++ b/autoPyTorch/data/time_series_validator.py @@ -2,6 +2,11 @@ import logging import typing +from sklearn.base import BaseEstimator + +from autoPyTorch.data.base_feature_validator import SUPPORTED_FEAT_TYPES +from autoPyTorch.data.base_target_validator import SUPPORTED_TARGET_TYPES + from autoPyTorch.data.base_validator import BaseInputValidator from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator from autoPyTorch.data.time_series_target_validator import TimeSeriesTargetValidator @@ -49,3 +54,6 @@ def __init__( ) self._is_fitted = False + + + diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 1d0bc3077..d4dce13b5 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -41,6 +41,7 @@ class CrossValTypes(IntEnum): class HoldoutValTypes(IntEnum): holdout_validation = 6 stratified_holdout_validation = 7 + time_series_hold_out_validation = 8 RESAMPLING_STRATEGIES = [CrossValTypes, HoldoutValTypes] @@ -52,6 +53,9 @@ class HoldoutValTypes(IntEnum): HoldoutValTypes.stratified_holdout_validation: { 'val_share': 0.33, }, + HoldoutValTypes.time_series_hold_out_validation: { + 'val_share': 0.33 + }, CrossValTypes.k_fold_cross_validation: { 'num_splits': 3, }, @@ -135,6 +139,22 @@ def k_fold_cross_validation(num_splits: int, indices: np.ndarray, **kwargs: Any) return splits +# TODO DO we move these under autoPyTorch/datasets/time_series_dataset.py? +def time_series_hold_out_validation(val_share: float, indices: np.ndarray, **kwargs: Any) \ + -> Tuple[np.ndarray, np.ndarray]: + """ + Return holdout indices respecting hte temporal ordering of the data + Args: + val_share: + indices: List of all possible indices + **kwargs: + + Returns: + """ + train, val = train_test_split(indices, test_size=val_share, shuffle=False) + return train, val + + def time_series_cross_validation(num_splits: int, indices: np.ndarray, **kwargs: Any) \ -> List[Tuple[np.ndarray, np.ndarray]]: """ @@ -144,10 +164,12 @@ def time_series_cross_validation(num_splits: int, indices: np.ndarray, **kwargs: [0, 1] [2] [0, 1, 2] [3] - :param indices: array of indices to be split + :param indices: array of indices to be split, seq_length :param num_splits: number of cross validation splits :return: list of tuples of training and validation indices """ + # TODO: we use gap=n_prediction_step here, we need to consider if we want to implement n_prediction_step here or + # under DATALOADER!!! 
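For reference, a minimal standalone sketch of the expanding-window splits that `sklearn.model_selection.TimeSeriesSplit` (used by the helper below) produces; the number of time steps and splits is chosen purely for illustration:

```python
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

indices = np.arange(6)            # six consecutive time steps
cv = TimeSeriesSplit(n_splits=3)  # each fold trains only on earlier steps
for train_idx, val_idx in cv.split(indices):
    print(train_idx, val_idx)
# [0 1 2] [3]
# [0 1 2 3] [4]
# [0 1 2 3 4] [5]
```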
cv = TimeSeriesSplit(n_splits=num_splits) splits = list(cv.split(indices)) return splits diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index c72596043..83d939bfe 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -4,7 +4,9 @@ import numpy as np import pandas as pd -import sklearn +from scipy.sparse import issparse + +from torch.utils.data.dataset import Dataset, Subset, ConcatDataset import torchvision.transforms @@ -20,7 +22,7 @@ TIMESERIES_FORECASTING, ) from autoPyTorch.data.base_validator import BaseInputValidator -from autoPyTorch.datasets.base_dataset import BaseDataset +from autoPyTorch.datasets.base_dataset import BaseDataset, type_check, type_of_target, TransformSubset from autoPyTorch.datasets.resampling_strategy import ( DEFAULT_RESAMPLING_PARAMETERS, CrossValTypes, @@ -31,81 +33,195 @@ ) from autoPyTorch.utils.common import FitRequirement +from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator +from autoPyTorch.utils.common import FitRequirement, hash_array_or_matrix +from autoPyTorch.datasets.tabular_dataset import TabularDataset #TIME_SERIES_FORECASTING_INPUT = Tuple[np.ndarray, np.ndarray] # currently only numpy arrays are supported #TIME_SERIES_REGRESSION_INPUT = Tuple[np.ndarray, np.ndarray] #TIME_SERIES_CLASSIFICATION_INPUT = Tuple[np.ndarray, np.ndarray] +""" +class TimeSeriesSequence(BaseDataset): + def __init__(self, + train_tensors: Union[np.ndarray, List[List]], + dataset_name: Optional[str] = None, + val_tensors: Optional[ Union[np.ndarray, List[List]]] = None, + test_tensors: Optional[ Union[np.ndarray, List[List]]] = None, + resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.time_series_hold_out_validation, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + seed: Optional[int] = 42, + train_transforms: Optional[torchvision.transforms.Compose] = None, + val_transforms: Optional[torchvision.transforms.Compose] = None, + ): + if dataset_name is not None: + self.dataset_name = dataset_name + else: + self.dataset_name = hash_array_or_matrix(train_tensors[0]) + if not hasattr(train_tensors[0], 'shape'): + type_check(train_tensors, val_tensors) + self.train_tensors = train_tensors + self.val_tensors = val_tensors + self.test_tensors = test_tensors + self.cross_validators = {} + self.holdout_validators = {} + self.rand = np.random.RandomState(seed=seed) + self.shuffle = False + self.task_type: Optional[str] = None + + self.resampling_strategy = resampling_strategy + self.resampling_strategy_args = resampling_strategy_args + + self.cross_validators = get_cross_validators(CrossValTypes.time_series_cross_validation) + self.holdout_validators = get_holdout_validators(HoldoutValTypes.time_series_hold_out_validation) + + self.splits = self.get_splits_from_resampling_strategy() + + # We also need to be able to transform the data, be it for pre-processing + # or for augmentation + self.train_transform = train_transforms + self.val_transform = val_transforms +""" + class TimeSeriesForecastingDataset(BaseDataset): def __init__(self, - X: np.ndarray, + X: Union[np.ndarray, List[List]], Y: Union[np.ndarray, pd.Series], X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + dataset_name: Optional[str] = None, + resampling_strategy: 
Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, shuffle: Optional[bool] = False, seed: Optional[int] = 42, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, - validator: Optional[BaseInputValidator] = None, + validator: Optional[TimeSeriesForecastingInputValidator] = None, n_prediction_steps: int = 1, ): """ - :param target_variables: The indices of the variables you want to forecast :param sequence_length: The amount of past data you want to use to forecast future value :param n_steps: The number of steps you want to forecast into the future :param train: Tuple with one tensor holding the training data :param val: Tuple with one tensor holding the validation data """ + assert X is not Y, "Training and Test data needs to belong two different object!!!" self.n_prediction_steps = n_prediction_steps self.validator = validator if self.validator is not None: + if not isinstance(validator, TimeSeriesForecastingInputValidator): + raise ValueError(f"This dataset only support TimeSeriesForecastingInputValidator " + f"but receive {type(validator)}") + X, Y = self.validator.transform(X, Y) + self.num_features = self.validator.feature_validator.n_feature_dims + self.num_target = self.validator.target_validator.out_dimensionality + if X_test is not None: X_test, Y_test = self.validator.transform(X_test, Y_test) + else: + self.num_features = np.shape(X[0])[-1] + self.num_target = np.shape(Y[0])[-1] + + self.num_sequences = len(X) + self.sequence_lengths_train = [0] * self.num_sequences + for seq_idx in range(self.num_sequences): + self.sequence_lengths_train[seq_idx] = len(X[seq_idx]) - population_size, time_series_length, num_features = X.shape - _, _, num_target = Y.shape - self.population_size = population_size - self.time_series_length = time_series_length - self.num_features = num_features - self.num_target = num_target + self.sequence_lengths_val = [0] * self.num_sequences + self.sequence_lengths_test = [0] * self.num_sequences self.categorical_columns = validator.feature_validator.categorical_columns self.numerical_columns = validator.feature_validator.numerical_columns + num_train_data = np.sum(self.sequence_lengths_train) + X_train_flatten = np.empty([num_train_data, self.num_features]) + y_train_flatten = np.empty([num_train_data, self.num_features]) + start_idx = 0 + + self.sequences = [] + + if shuffle: + warnings.WarningMessage("Time Series Forecasting will not shuffle the data") + for seq_idx, seq_length in enumerate(self.sequence_lengths_train): + end_idx = start_idx + seq_length + X_train_flatten[start_idx: end_idx] = np.array(X[seq_idx]) + y_train_flatten[start_idx: end_idx] = np.array(Y[seq_idx]) + start_idx = end_idx + + train_tensors = (X_train_flatten, y_train_flatten) - _check_time_series_forecasting_inputs(train=X, val=X_test) - # swap the axis of population_size and sequence_length hence the splitter will split the dataset w.r.t. 
sequence - X = np.swapaxes(X, 0, 1).reshape(-1, 1, num_features) - Y = np.swapaxes(Y, 0, 1).reshape(-1, num_target) if X_test is not None and Y_test is not None: - X_test = np.swapaxes(X_test, 0, 1).reshape(-1, 1, num_features) - Y_test = np.swapaxes(Y_test, 0, 1).reshape(-1, num_target) - test_tensors = (X_test, Y_test) + for seq_idx in range(self.num_sequences): + self.sequence_lengths_test[seq_idx] = len(X_test[seq_idx]) + num_test_data = np.sum(self.sequence_lengths_test) + X_test_flatten = np.empty([num_test_data, self.num_features]) + y_test_flatten = np.empty([num_test_data, self.num_target]) + start_idx = 0 + + for seq_idx, seq_length in enumerate(self.sequence_lengths_test): + end_idx = start_idx + seq_length + X_test_flatten[start_idx: end_idx] = np.array(X[seq_idx]) + y_test_flatten[start_idx: end_idx] = np.array(Y[seq_idx]) + start_idx = end_idx + test_tensors = (X_test_flatten, y_test_flatten) else: test_tensors = None - if shuffle: - warnings.WarningMessage("Time Series Forecasting will not shuffle the data") - train_tensors = (X, Y) - super().__init__(train_tensors=train_tensors, test_tensors=test_tensors, shuffle=False, - resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, - seed=seed, - train_transforms=train_transforms, - val_transforms=val_transforms, - ) + """ + super(TimeSeriesForecastingDataset, self).__init__(train_tensors=train_tensors, + dataset_name=dataset_name, + test_tensors=test_tensors, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + shuffle=False, + seed=seed, + train_transforms=train_transforms, + val_transforms=val_transforms) + """ + if dataset_name is not None: + self.dataset_name = dataset_name + else: + self.dataset_name = hash_array_or_matrix(train_tensors[0]) + + self.train_tensors = train_tensors + self.val_tensors = None + self.test_tensors = test_tensors + self.rand = np.random.RandomState(seed=seed) + self.shuffle = False + self.resampling_strategy = resampling_strategy + self.resampling_strategy_args = resampling_strategy_args + self.task_type: Optional[str] = None + self.issparse: bool = issparse(self.train_tensors[0]) + # TODO find a way to edit input shape! 
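Illustrative sketch (toy data, not part of the patch): the constructor above stores all sequences back to back in one flat tensor and records their lengths so the per-sequence boundaries can be recovered later. The same idea in isolation:

import numpy as np

# three sequences of different lengths, each with 2 features
sequences = [np.random.rand(5, 2), np.random.rand(3, 2), np.random.rand(7, 2)]

sequence_lengths = [len(seq) for seq in sequences]   # [5, 3, 7]
flat = np.concatenate(sequences, axis=0)             # shape (15, 2)

# recover the i-th sequence from the flat tensor via cumulative offsets
offsets = np.cumsum([0] + sequence_lengths)
second_seq = flat[offsets[1]:offsets[2]]             # shape (3, 2)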
+ self.input_shape: Tuple[int] = [np.min(self.sequence_lengths_train), self.num_features] + + if len(self.train_tensors) == 2 and self.train_tensors[1] is not None: + self.output_type: str = type_of_target(self.train_tensors[1]) + + if STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS: + self.output_shape = len(np.unique(self.train_tensors[1])) + else: + # self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1 + self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1 + + # TODO: Look for a criteria to define small enough to preprocess self.is_small_preprocess = False + # We also need to be able to transform the data, be it for pre-processing + # or for augmentation + self.train_transform = train_transforms + self.val_transform = val_transforms + + self.task_type = TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING] self.numerical_features: List[int] = list(range(self.num_features)) self.categorical_features: List[int] = [] self.cross_validators = get_cross_validators(CrossValTypes.time_series_cross_validation) - self.holdout_validators = get_holdout_validators(HoldoutValTypes.holdout_validation) + self.holdout_validators = get_holdout_validators(HoldoutValTypes.time_series_hold_out_validation) self.splits = self.get_splits_from_resampling_strategy() @@ -123,23 +239,19 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] Returns (List[Tuple[List[int], List[int]]]): splits in the [train_indices, val_indices] format """ - splits = [] + splits= [] if isinstance(self.resampling_strategy, HoldoutValTypes): val_share = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( 'val_share', None) if self.resampling_strategy_args is not None: val_share = self.resampling_strategy_args.get('val_share', val_share) - splits.append( - self.create_holdout_val_split( - holdout_val_type=self.resampling_strategy, - val_share=val_share, - ) - ) + splits.append(self.create_holdout_val_split(holdout_val_type=self.resampling_strategy, + val_share=val_share)) if self.val_tensors is not None: - upper_sequence_length = self.time_series_length - self.n_prediction_steps + upper_window_size = np.min(self.sequence_lengths_train) - self.n_prediction_steps else: - upper_sequence_length = int(self.time_series_length * val_share) - self.n_prediction_steps + upper_window_size = int(np.min(self.sequence_lengths_train) * val_share) - self.n_prediction_steps elif isinstance(self.resampling_strategy, CrossValTypes): num_splits = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( @@ -147,16 +259,14 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] if self.resampling_strategy_args is not None: num_splits = self.resampling_strategy_args.get('num_splits', num_splits) # Create the split if it was not created before - splits.extend( - self.create_cross_val_splits( + splits.extend(self.create_cross_val_splits( cross_val_type=self.resampling_strategy, num_splits=cast(int, num_splits), - ) - ) - upper_sequence_length = (self.time_series_length // num_splits) - self.n_prediction_steps + )) + upper_window_size = (np.min(self.sequence_lengths_train) // num_splits) - self.n_prediction_steps else: raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") - self.upper_sequence_length = upper_sequence_length + self.upper_window_size = upper_window_size return splits def get_required_dataset_info(self) -> Dict[str, Any]: @@ -170,15 +280,22 @@ def 
get_required_dataset_info(self) -> Dict[str, Any]: 'categorical_features': self.categorical_features, 'numerical_columns': self.numerical_columns, 'categorical_columns': self.categorical_columns, - 'upper_sequence_length': self.upper_sequence_length, + 'upper_window_size': self.upper_window_size, }) return info def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> Dict[str, Any]: dataset_properties = super().get_dataset_properties(dataset_requirements=dataset_requirements) - dataset_properties.update({'upper_sequence_length': self.upper_sequence_length}) + dataset_properties.update({'upper_window_size': self.upper_window_size}) return dataset_properties + def update_sequence_lengths_train(self, sequence_length): + if len(sequence_length) != self.num_sequences: + raise ValueError("number of sequence must match!") + if np.sum(sequence_length) != self.train_tensors[0].shape[0]: + raise ValueError("sequence length needs to be consistent with train tensors") + self.sequence_lengths_train = sequence_length + def create_cross_val_splits( self, cross_val_type: CrossValTypes, @@ -203,20 +320,26 @@ def create_cross_val_splits( # should be robust against multiple calls, and it does so by remembering the splits if not isinstance(cross_val_type, CrossValTypes): raise NotImplementedError(f'The selected `cross_val_type` "{cross_val_type}" is not implemented.') - kwargs = {} - if is_stratified(cross_val_type): - # we need additional information about the data for stratification - kwargs["stratify"] = self.train_tensors[-1] - splits_raw = self.cross_validators[cross_val_type.name]( - num_splits, self._get_indices(), **kwargs) - splits = [() for i in range(len(splits_raw))] - for i, split in enumerate(splits_raw): - train = split[0] - val = split[1] - val = np.concatenate([train[-(len(train) % self.time_series_length)], val]) - train = train[:- (len(train) % self.time_series_length)] - splits[i] = (train, val) - return splits + kwargs = {"n_prediction_steps": self.n_prediction_steps} + splits = [[() for _ in range(self.num_sequences)] for _ in range(num_splits)] + idx_all = self._get_indices() + idx_start = 0 + for idx_seq, seq_length in enumerate(self.sequence_lengths_train): + idx_end = idx_start + seq_length + split = self.cross_validators[cross_val_type.name](num_splits, idx_all[idx_start: idx_end], **kwargs) + for idx_split in range(num_splits): + splits[idx_split][idx_seq] = split[idx_split] + idx_start = idx_end + # in this case, splits is stored as : + # [ first split, second_split ...] + # first_split = [([0], [1]), ([2], [3])] .... 
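Illustrative sketch (toy indices, not part of the patch): the nested per-sequence splits described in the comment above are then merged fold by fold into global train/validation index arrays, roughly like this:

import numpy as np

# two folds, two sequences; each entry is (train_indices, val_indices) in global numbering
splits = [
    [(np.array([0, 1]), np.array([2])), (np.array([5, 6]), np.array([7]))],        # fold 0
    [(np.array([0, 1, 2]), np.array([3])), (np.array([5, 6, 7]), np.array([8]))],  # fold 1
]

splits_merged = [
    (np.concatenate([seq_split[0] for seq_split in fold]),
     np.concatenate([seq_split[1] for seq_split in fold]))
    for fold in splits
]
# splits_merged[0] -> (array([0, 1, 5, 6]), array([2, 7]))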
+ splits_merged = [] + for i in range(num_splits): + split = splits[i] + train_indices = np.concatenate([sp[0] for sp in split]) + test_indices = np.concatenate([sp[1] for sp in split]) + splits_merged.append((train_indices, test_indices)) + return splits_merged def create_holdout_val_split( self, @@ -245,26 +368,37 @@ def create_holdout_val_split( raise ValueError(f"`val_share` must be between 0 and 1, got {val_share}.") if not isinstance(holdout_val_type, HoldoutValTypes): raise NotImplementedError(f'The specified `holdout_val_type` "{holdout_val_type}" is not supported.') - kwargs = {} - if is_stratified(holdout_val_type): - # we need additional information about the data for stratification - kwargs["stratify"] = self.train_tensors[-1] - # we want to ensure that both training and validation sets have the same - val_share = int(val_share * self.time_series_length) * self.population_size - train, val = self.holdout_validators[holdout_val_type.name](val_share, self._get_indices(), **kwargs) - return train, val + kwargs = {"n_prediction_steps": self.n_prediction_steps} + + splits = [[() for _ in range(self.num_sequences)] for _ in range(2)] + idx_all = self._get_indices() + idx_start = 0 + for idx_seq, seq_length in enumerate(self.sequence_lengths_train): + idx_end = idx_start + seq_length + split = self.holdout_validators[holdout_val_type.name](val_share, idx_all[idx_start: idx_end], **kwargs) + for idx_split in range(2): + splits[idx_split][idx_seq] = split[idx_split] + idx_start = idx_end + + train_indices = np.concatenate([sp for sp in splits[0]]) + test_indices = np.concatenate([sp for sp in splits[1]]) + + return train_indices, test_indices + + def _check_time_series_forecasting_inputs(train: np.ndarray, val: Optional[np.ndarray] = None) -> None: - if train.ndim != 3: + if train.ndim != 3 or any(isinstance(i, (list, np.ndarray)) for i in train): raise ValueError( - "The training data for time series forecasting has to be a three-dimensional tensor of shape PxLxM.") + "The training data for time series forecasting has to be a three-dimensional tensor of shape PxLxM. 
or a" + "nested list") if val is not None: - if val.ndim != 3: + if val.ndim != 3 or any(isinstance(i, (list, np.ndarray)) for i in val): raise ValueError( "The validation data for time series forecasting " - "has to be a three-dimensional tensor of shape PxLxM.") + "has to be a three-dimensional tensor of shape PxLxM or a nested list.") class TimeSeriesDataset(BaseDataset): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py index 28f9b5aa0..e0e9bedd5 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py @@ -45,11 +45,14 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TimeSeriesTransformer": # Where to get the data -- Prioritize X_train if any else # get from backend + # TODO consider how to handle the inconsistency between Transformer and Datasets + X_train = X['backend'].load_datamanager().train_tensors[0] + """ if 'X_train' in X: X_train = subsampler(X['X_train'], X['train_indices']) else: X_train = X['backend'].load_datamanager().train_tensors[0] - + """ self.preprocessor.fit(X_train) return self diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index 94cb3471c..387ee66c4 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -17,6 +17,7 @@ autoPyTorchComponent, ) from autoPyTorch.utils.common import FitRequirement +from autoPyTorch.constants import TIMESERIES_FORECASTING, TASK_TYPES_TO_STRING class NetworkBackboneComponent(autoPyTorchComponent): @@ -50,13 +51,16 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: """ self.check_requirements(X, y) X_train = X['X_train'] - - if X["dataset_properties"]["is_small_preprocess"]: - input_shape = X_train.shape[1:] + if X["dataset_properties"]["task_type"] == TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING]: + input_shape = X["dataset_properties"]['input_shape'] else: - # get input shape by transforming first two elements of the training set - transforms = torchvision.transforms.Compose(X['preprocess_transforms']) - input_shape = transforms(X_train[:1, ...]).shape[1:] + + if X["dataset_properties"]["is_small_preprocess"]: + input_shape = X_train.shape[1:] + else: + # get input shape by transforming first two elements of the training set + transforms = torchvision.transforms.Compose(X['preprocess_transforms']) + input_shape = transforms(X_train[:1, ...]).shape[1:] self.input_shape = input_shape diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index fa96dbc73..cd0b5d9e9 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -35,7 +35,7 @@ class TimeSeriesForecastingDataLoader(TimeSeriesDataLoader): def __init__(self, batch_size: int = 64, - sequence_length: int = 1, + window_size: int = 1, #sample_interval: int = 1, upper_sequence_length: int = np.iinfo(np.int32).max, 
n_prediction_steps: int = 1) -> None: @@ -50,13 +50,13 @@ def __init__(self, n_prediction_steps: how many stpes to predict in advance """ super().__init__(batch_size=batch_size) - self.sequence_length: int = sequence_length + self.window_size: int = window_size self.upper_seuqnce_length = upper_sequence_length self.n_prediction_steps = n_prediction_steps self.sample_interval = 1 # length of the tail, for instance if a sequence_length = 2, sample_interval =2, n_prediction = 2, # the time sequence should look like: [X, y, X, y, y] [test_data](values in tail is marked with X) - self.tail_length = (self.sequence_length * self.sample_interval) + self.n_prediction_steps - 1 + self.tail_length = (self.window_size * self.sample_interval) + self.n_prediction_steps - 1 def transform(self, X: np.ndarray) -> np.ndarray: """The transform function calls the transform function of the @@ -94,7 +94,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: """ sample_interval = X.get('sample_interval', 1) self.sample_interval = sample_interval - self.tail_length = (self.sequence_length * self.sample_interval) + self.n_prediction_steps - 1 + self.tail_length = (self.window_size * self.sample_interval) + self.n_prediction_steps - 1 # Make sure there is an optimizer self.check_requirements(X, y) @@ -150,65 +150,109 @@ def _update_dataset(self, datamanager: TimeSeriesForecastingDataset): update the dataset to build time sequence """ num_features = datamanager.num_features - population_size = datamanager.population_size - num_target = datamanager.num_target + num_sequences = datamanager.num_sequences + num_targets = datamanager.num_target X_train, y_train = datamanager.train_tensors val_tensors = datamanager.val_tensors test_tensors = datamanager.test_tensors n_prediction_steps = datamanager.n_prediction_steps + sequence_lengths_train = datamanager.sequence_lengths_train + sequence_lengths_val = datamanager.sequence_lengths_val + sequence_lengths_test = datamanager.sequence_lengths_test - X_train = X_train.reshape([-1, population_size, num_features]) - y_train = y_train.reshape([-1, population_size, num_target]) - - time_series_length = X_train.shape[0] - self.population_size = population_size + self.num_sequences = num_sequences self.num_features = num_features - num_datapoints_train = time_series_length - (self.sequence_length - 1) * self.sample_interval - n_prediction_steps + 1 - num_targets = y_train.shape[-1] - - y_train = y_train[-num_datapoints_train:, :] - if test_tensors is not None: - X_test, y_test = test_tensors - - X_test = X_test.reshape([-1, population_size, num_features]) - y_test = y_test.reshape([-1, population_size, num_target]) - - if val_tensors is not None: - X_val, y_val = val_tensors - - X_val = X_val.reshape([-1, population_size, num_features]) - y_val = y_val.reshape([-1, population_size, num_target]) - - num_datapoints_val = X_val.shape[0] - - X_val = np.concatenate([X_train[-self.tail_length:], X_val]) - X_test = np.concatenate([X_val[-self.tail_length:], X_test]) - val_tensors = self._ser2seq(X_val, y_val, num_datapoints_val, num_features, num_targets) - datamanager.val_tensors = val_tensors + #X_train = X_train.reshape([-1, population_size, num_features]) + #y_train = y_train.reshape([-1, population_size, num_target]) + + X_train_pieces = [[] for _ in range(num_sequences)] + y_train_pieces = [[] for _ in range(num_sequences)] + + X_val_pieces = [[] for _ in range(num_sequences)] + y_val_pieces = [[] for _ in range(num_sequences)] + + X_test_pieces = [[] 
for _ in range(num_sequences)] + y_test_pieces = [[] for _ in range(num_sequences)] + + X_val_tail = [[] for _ in range(num_sequences)] + num_data_points_seqs = [0] * num_sequences + start_idx_train = 0 + start_idx_test = 0 + start_idx_val = 0 + for i, seq_length_train in enumerate(sequence_lengths_train): + end_idx_train = start_idx_train + seq_length_train + num_datapoints_train = seq_length_train - (self.window_size - 1) * self.sample_interval - n_prediction_steps + 1 + + X_train_tmp = X_train[start_idx_train: end_idx_train] + y_train_tmp = y_train[start_idx_train: end_idx_train] + + y_train_tmp = y_train_tmp[-num_datapoints_train:] + if test_tensors is not None: + end_idx_test = start_idx_test + sequence_lengths_test[i] + X_test_tmp = test_tensors[0][start_idx_test: end_idx_test] + y_test_tmp = test_tensors[1][start_idx_test: end_idx_test] + + if val_tensors is not None: + end_idx_val = start_idx_val + sequence_lengths_val[i] + X_val_tmp = val_tensors[0][start_idx_val: end_idx_val] + y_val_tmp = val_tensors[1][start_idx_val: end_idx_val] + num_datapoints_val = sequence_lengths_val[i] + + X_val_tmp = np.concatenate([X_train_tmp[-self.tail_length:], X_val_tmp]) + X_test_tmp = np.concatenate([X_val_tmp[-self.tail_length:], X_test_tmp]) + X_val_tmp, y_val_tmp = self._ser2seq(X_val_tmp, y_val_tmp, num_datapoints_val, num_features, num_targets) + + X_val_pieces[i] = X_val_tmp + y_val_pieces[i] = y_val_tmp + + num_datapoints_test = sequence_lengths_test[i] + + X_test_tmp = np.concatenate([X_train_tmp[-self.tail_length:], X_test_tmp]) + X_val_tail[i] = X_test_tmp[-self.tail_length:] if self.tail_length > 1 \ + else np.zeros((0, num_features)).astype(dtype=X_test_tmp.dtype) + + X_test_tmp, y_test_tmp = self._ser2seq(X_test_tmp, y_test_tmp, num_datapoints_test, num_features, num_targets) + X_test_pieces[i] = X_test_tmp + y_test_pieces[i] = y_test_tmp + datamanager.test_tensors = test_tensors + + elif val_tensors is not None: + end_idx_val = start_idx_val + sequence_lengths_val[i] + X_val_tmp = val_tensors[0][start_idx_val: end_idx_val] + y_val_tmp = val_tensors[1][start_idx_val: end_idx_val] + num_datapoints_val = sequence_lengths_val[i] + X_val_tmp = np.concatenate([X_train_tmp[-self.tail_length:], X_val_tmp]) + + # used for prediction + X_val_tail[i] = X_val_tmp[-self.tail_length:] if self.tail_length > 1 \ + else np.zeros((0, num_features)).astype(dtype=X_val_tmp.dtype) + X_val_tmp, y_val_tmp = self._ser2seq(X_val_tmp, y_val_tmp, num_datapoints_val, num_features, num_targets) + + X_val_pieces[i] = X_val_tmp + y_val_pieces[i] = y_val_tmp + else: + X_val_tail[i] = X_train_tmp[-self.tail_length:] if self.tail_length > 1 \ + else np.zeros((0, num_features)).astype(dtype=X_train_tmp.dtype) - num_datapoints_test = X_test.shape[0] + X_train_tmp, y_train_tmp = self._ser2seq(X_train_tmp, y_train_tmp, + num_datapoints_train, num_features, num_targets) + X_train_pieces[i] = X_train_tmp + y_train_pieces[i] = y_train_tmp - X_test = np.concatenate([X_train[-self.tail_length:], X_test]) - self.X_val_tail = X_test[-self.tail_length:] if self.tail_length > 1 \ - else np.zeros((0, population_size, num_features)).astype(dtype=X_test.dtype) + num_data_points_seqs[i] = num_datapoints_train - test_tensors = self._ser2seq(X_test, y_test, num_datapoints_test, num_features, num_targets) + train_tensors = (np.concatenate(X_train_pieces), np.concatenate(y_train_pieces)) + if test_tensors is not None: + test_tensors = (np.concatenate(X_test_pieces), np.concatenate(y_test_pieces)) datamanager.test_tensors = test_tensors 
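Illustrative sketch (toy data, not part of the patch): the `_ser2seq` helper used here turns a flat [T, F] series into overlapping windows of shape [num_windows, window_size, F], with one target row per window taken from the tail of the series. The same windowing written with plain slicing instead of np.roll:

import numpy as np

T, F, window_size, n_prediction_steps = 8, 2, 3, 1
X = np.arange(T * F, dtype=np.float32).reshape(T, F)
y = np.arange(T, dtype=np.float32).reshape(T, 1)

# matches num_datapoints_train above when sample_interval == 1
num_windows = T - (window_size - 1) - n_prediction_steps + 1

X_windows = np.stack([X[t:t + window_size] for t in range(num_windows)])  # shape (6, 3, 2)
y_targets = y[-num_windows:]                                              # shape (6, 1)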
- - elif val_tensors is not None: - X_val, y_val = val_tensors - X_val = np.concatenate([X_train[-self.tail_length:], X_val]) - - # used for prediction - self.X_val_tail = X_val[-self.tail_length:] - val_tensors = self._ser2seq(X_val, y_val, num_datapoints_train, num_features, num_targets) + if val_tensors is not None: + val_tensors = (np.concatenate(X_val_pieces), np.concatenate(y_val_pieces)) datamanager.val_tensors = val_tensors - else: - self.X_val_tail = X_train[-self.tail_length:] + self.X_val_tail = X_val_tail - train_tensors = self._ser2seq(X_train, y_train, num_datapoints_train, num_features, num_targets) datamanager.train_tensors = train_tensors + datamanager.update_sequence_lengths_train(num_data_points_seqs) datamanager.splits = datamanager.get_splits_from_resampling_strategy() return datamanager @@ -228,9 +272,9 @@ def _ser2seq(self, X_in, y_in, num_datapoints, num_features, num_targets): y_in_trans: transformed input target array with shape [num_datapoints * population_size, num_targets] """ - X_in = np.concatenate([np.roll(X_in, shift=i * self.sample_interval, axis=0) for i in range(0, -self.sequence_length, -1)], - axis=2).astype(np.float32)[:num_datapoints] - X_in = X_in.reshape((-1, self.sequence_length, num_features)) + X_in = np.concatenate([np.roll(X_in, shift=i * self.sample_interval, axis=0) for i in range(0, -self.window_size, -1)], + axis=1).astype(np.float32)[:num_datapoints] + X_in = X_in.reshape((-1, self.window_size, num_features)) y_in = y_in.reshape((-1, num_targets)) return X_in, y_in @@ -240,28 +284,15 @@ def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: Creates a data loader object from the provided data, applying the transformations meant to validation objects """ - if X.ndim == 3: - X_shape = X.shape - if X_shape[-1] != self.num_features: - raise ValueError("the features of test data is incompatible with the training data") - if X_shape[1] == self.population_size: - num_points_X_in = X_shape[0] - elif X_shape[0] == self.population_size: - num_points_X_in = X_shape[1] - X = np.swapaxes(X, 0, 1) - elif X_shape[1] == 1: - X = X.reshape([-1, self.population_size, self.num_features]) - num_points_X_in = X_shape[0] - else: - raise ValueError("test shape is incompatible with the training shape") - else: - raise ValueError( - "The test data for time series forecasting has to be a three-dimensional tensor of shape PxLxM.") + if len(X) != len(self.X_val_tail): + raise ValueError(f"Training data has {len(self.X_val_tail)} sequences but test data has {len(X)} sequences") + for i in range(len(X)): + num_points_X_in = np.shape(X[i])[0] + X_tmp = np.concatenate([self.X_val_tail[i], X[i]]) + X[i] = np.concatenate([np.roll(X_tmp, shift=i * self.sample_interval, axis=0) for i in range(0, -self.window_size, -1)], + axis=1).astype(np.float32)[:num_points_X_in] - X = np.concatenate([self.X_val_tail, X]) - X = np.concatenate([np.roll(X, shift=i * self.sample_interval, axis=0) for i in range(0, -self.sequence_length, -1)], - axis=2).astype(np.float32)[:num_points_X_in] - X = X.reshape((-1, self.sequence_length, self.num_features)) + X = np.concatenate(X) dataset = BaseDataset( @@ -306,26 +337,26 @@ def get_test_data_loader(self) -> torch.utils.data.DataLoader: @staticmethod def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, batch_size: Tuple[Tuple, int] = ((32, 320), 64), - sequence_length: Tuple[Tuple, int] = ((1, 20), 1) + window_size: Tuple[Tuple, int] = ((1, 20), 1) ) -> ConfigurationSpace: batch_size = 
UniformIntegerHyperparameter( "batch_size", batch_size[0][0], batch_size[0][1], default_value=batch_size[1]) - if "upper_sequence_length" not in dataset_properties: + if "upper_window_size" not in dataset_properties: warnings.warn('max_sequence_length is not given in dataset property , might exists the risk of selecting ' 'length that is greater than the maximal allowed length of the dataset') - upper_sequence_length = min(np.iinfo(np.int32).max, sequence_length[0][1]) + upper_window_size = min(np.iinfo(np.int32).max, window_size[0][1]) else: - upper_sequence_length = min(dataset_properties["upper_sequence_length"], sequence_length[0][1]) - if sequence_length[0][0] >= upper_sequence_length: - warnings.warn("the lower bound of sequence length is greater than the upper bound") - sequence_length = Constant("sequence_length", upper_sequence_length) + upper_window_size = min(dataset_properties["upper_window_size"], window_size[0][1]) + if window_size[0][0] >= upper_window_size: + warnings.warn("the lower bound of window size is greater than the upper bound") + window_size = Constant("window_size", upper_window_size) else: - sequence_length = UniformIntegerHyperparameter("sequence_length", - lower=sequence_length[0][0], - upper=upper_sequence_length, - default_value=sequence_length[1]) + window_size = UniformIntegerHyperparameter("window_size", + lower=window_size[0][0], + upper=upper_window_size, + default_value=window_size[1]) cs = ConfigurationSpace() - cs.add_hyperparameters([batch_size, sequence_length]) + cs.add_hyperparameters([batch_size, window_size]) return cs def __str__(self) -> str: From e4bd9b91176b91b487ef00dffb9819ea0a7a1a9b Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 8 Jul 2021 22:57:17 +0200 Subject: [PATCH 039/347] maint test data --- .../data/time_series_forecasting_target_validator.py | 6 +++++- autoPyTorch/datasets/time_series_dataset.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/data/time_series_forecasting_target_validator.py b/autoPyTorch/data/time_series_forecasting_target_validator.py index e642fcfbe..d5d86d4c6 100644 --- a/autoPyTorch/data/time_series_forecasting_target_validator.py +++ b/autoPyTorch/data/time_series_forecasting_target_validator.py @@ -62,7 +62,11 @@ def fit( num_target_train[seq_idx] = np.shape(y_train[seq_idx])[-1] if y_test is not None: - self._check_data(y_test[0]) + y_test_first_seq = np.array(y_train[0]) + if len(y_test_first_seq.shape) == 1: + y_test_first_seq = np.expand_dims(y_test_first_seq, -1) + + self._check_data(y_test_first_seq) if len(y_train) != len(y_test): raise ValueError("Training test must have the same amount of sequences as test set!") if len(np.shape(y_train[1])) == 1: diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 83d939bfe..ab5b94987 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -163,8 +163,8 @@ def __init__(self, for seq_idx, seq_length in enumerate(self.sequence_lengths_test): end_idx = start_idx + seq_length - X_test_flatten[start_idx: end_idx] = np.array(X[seq_idx]) - y_test_flatten[start_idx: end_idx] = np.array(Y[seq_idx]) + X_test_flatten[start_idx: end_idx] = np.array(X_test[seq_idx]) + y_test_flatten[start_idx: end_idx] = np.array(Y_test[seq_idx]) start_idx = end_idx test_tensors = (X_test_flatten, y_test_flatten) else: From 57408a800c6c8db144c7a1bdc17b3297d1cdd9e3 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 9 Jul 2021 00:16:05 +0200 
Subject: [PATCH 040/347] allow backend to overwrite dataset --- .../time_series_forecasting_data_loader.py | 15 ++++++++++++--- autoPyTorch/utils/backend.py | 4 ++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index cd0b5d9e9..442af54c1 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -119,6 +119,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: # This parameter indicates that the data has been pre-processed for speed # Overwrite the datamanager with the pre-processes data datamanager.replace_data(X['X_train'], X['X_test'] if 'X_test' in X else None) + X['backend'].save_datamanager(datamanager, overwrite=True) train_dataset, val_dataset = datamanager.get_dataset_for_training(split_id=X['split_id']) self.datamanager = datamanager @@ -337,7 +338,7 @@ def get_test_data_loader(self) -> torch.utils.data.DataLoader: @staticmethod def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, batch_size: Tuple[Tuple, int] = ((32, 320), 64), - window_size: Tuple[Tuple, int] = ((1, 20), 1) + window_size: Tuple[Tuple, int] = ((20, 50), 25) ) -> ConfigurationSpace: batch_size = UniformIntegerHyperparameter( "batch_size", batch_size[0][0], batch_size[0][1], default_value=batch_size[1]) @@ -349,11 +350,19 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, upper_window_size = min(dataset_properties["upper_window_size"], window_size[0][1]) if window_size[0][0] >= upper_window_size: warnings.warn("the lower bound of window size is greater than the upper bound") - window_size = Constant("window_size", upper_window_size) - else: + window_size = UniformIntegerHyperparameter("window_size", + lower=1, + upper=upper_window_size, + default_value=1) + elif window_size[0][0] <= upper_window_size < window_size[0][1]: window_size = UniformIntegerHyperparameter("window_size", lower=window_size[0][0], upper=upper_window_size, + default_value=1) + else: + window_size = UniformIntegerHyperparameter("window_size", + lower=window_size[0][0], + upper=window_size[0][1], default_value=window_size[1]) cs = ConfigurationSpace() cs.add_hyperparameters([batch_size, window_size]) diff --git a/autoPyTorch/utils/backend.py b/autoPyTorch/utils/backend.py index dd24c2340..78f2a42cf 100644 --- a/autoPyTorch/utils/backend.py +++ b/autoPyTorch/utils/backend.py @@ -303,12 +303,12 @@ def load_targets_ensemble(self) -> np.ndarray: def _get_datamanager_pickle_filename(self) -> str: return os.path.join(self.internals_directory, 'datamanager.pkl') - def save_datamanager(self, datamanager: BaseDataset) -> str: + def save_datamanager(self, datamanager: BaseDataset, overwrite=False) -> str: self._make_internals_directory() filepath = self._get_datamanager_pickle_filename() with lockfile.LockFile(filepath): - if not os.path.exists(filepath): + if not os.path.exists(filepath) or overwrite: with tempfile.NamedTemporaryFile('wb', dir=os.path.dirname( filepath), delete=False) as fh: pickle.dump(datamanager, fh, -1) From 846a48bf544f685a2f74b7fa49c408ee837a99a0 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 9 Jul 2021 10:43:44 +0200 Subject: [PATCH 041/347] maint --- .../time_series_forecasting_data_loader.py | 
14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 442af54c1..8dce8952d 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -349,11 +349,15 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, else: upper_window_size = min(dataset_properties["upper_window_size"], window_size[0][1]) if window_size[0][0] >= upper_window_size: - warnings.warn("the lower bound of window size is greater than the upper bound") - window_size = UniformIntegerHyperparameter("window_size", - lower=1, - upper=upper_window_size, - default_value=1) + if upper_window_size == 1: + warnings.warn("window size is fixed as 1") + window_size = Constant("window_size", value=1) + else: + warnings.warn("the lower bound of window size is greater than the upper bound") + window_size = UniformIntegerHyperparameter("window_size", + lower=1, + upper=upper_window_size, + default_value=1) elif window_size[0][0] <= upper_window_size < window_size[0][1]: window_size = UniformIntegerHyperparameter("window_size", lower=window_size[0][0], From d23f1dc7e090e71ef8848465bc683d6cd56cd068 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 26 Jul 2021 14:45:06 +0200 Subject: [PATCH 042/347] lazy-imported sequence data --- ...me_series_forecasting_feature_validator.py | 159 +++---- ...ime_series_forecasting_target_validator.py | 213 ++------- .../data/time_series_forecasting_validator.py | 1 + autoPyTorch/datasets/resampling_strategy.py | 9 +- autoPyTorch/datasets/time_series_dataset.py | 441 +++++++++++++----- .../time_series_forecasting_data_loader.py | 292 +++++------- 6 files changed, 530 insertions(+), 585 deletions(-) diff --git a/autoPyTorch/data/time_series_forecasting_feature_validator.py b/autoPyTorch/data/time_series_forecasting_feature_validator.py index f41d4ce9c..bbec1c878 100644 --- a/autoPyTorch/data/time_series_forecasting_feature_validator.py +++ b/autoPyTorch/data/time_series_forecasting_feature_validator.py @@ -3,7 +3,6 @@ import copy import sklearn.utils - from autoPyTorch.utils.logging_ import PicklableClientLogger import numpy as np @@ -13,26 +12,28 @@ from sklearn.exceptions import NotFittedError from autoPyTorch.data.tabular_validator import TabularFeatureValidator +from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES class TimeSeriesForecastingFeatureValidator(TabularFeatureValidator): def __init__(self, - logger: Optional[Union[PicklableClientLogger, logging.Logger - ]] = None, + logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None, ) -> None: - TabularFeatureValidator.__init__(self, logger) - self._extend_feature_dims = False + super(TimeSeriesForecastingFeatureValidator, self).__init__(logger) + self.feature_validators = None # type: Optional[List[TabularFeatureValidator]] def fit(self, - X_train: Union[np.ndarray, List[np.ndarray]], - X_test: Optional[np.ndarray] = None) -> BaseEstimator: + X_train: Union[np.ndarray, List[SUPPORTED_FEAT_TYPES]], + X_test: Optional[Union[np.ndarray, List[SUPPORTED_FEAT_TYPES]]] = None) -> BaseEstimator: """ - We expect a time series dataset stored in the form :[population, time_series, features] + We expect a time series dataset 
stored in the form :[time_series_sequences] + TODO can we directly read X_train and X_test from panda DataFrame Arguments: X_train (np.ndarray): A set of data that are going to be validated (type and dimensionality - checks) and used for fitting + checks) and used for fitting, it is composed of multiple time series sequences which might have + different length X_test (Optional[np.ndarray]): An optional set of data that is going to be validated @@ -41,93 +42,55 @@ def fit(self, self: The fitted base estimator """ - # TODO only allow np.ndarray(3D) and List of np.ndarray(2D) or array of array (2D) to reduce complexity!!!!! - if isinstance(X_train, np.ndarray): - if X_train.ndim > 3: - raise ValueError(f"Number of dimensions too large for time series train data.") - if X_train.ndim == 1: - self.validate_ts_data(X_train) - elif isinstance(X_train, list): - self.validate_ts_data(X_train) - else: - raise ValueError(f"Time series train data must be given as a numpy array or nested list," - f" but got {type(X_train)}") - """ - _ = sklearn.utils.check_array( - X_train, - force_all_finite=True, - ensure_2d=False, - allow_nd=True, - accept_sparse=False, - accept_large_sparse=False - ) - """ + categorical_columns = [[] for _ in range(len(X_train))] + numerical_columns = [[] for _ in range(len(X_train))] + categories = [[] for _ in range(len(X_train))] + num_features = [0] * len(X_train) + + if X_test is not None: + if len(X_train) != len(X_test): + raise ValueError(f"Training data needs to have the same number sequences as the test data") + + self.feature_validators = [TabularFeatureValidator(self.logger) for _ in range(len(X_train))] if X_test is not None: - if isinstance(X_test, np.ndarray): - if X_test.ndim > 3: - raise ValueError(f"Number of dimensions too large for time series train data.") - if X_test.ndim == 1: - self.validate_ts_data(X_test) - elif isinstance(X_test, list): - self.validate_ts_data(X_test) - else: - raise ValueError(f"Time series train data must be given as a numpy array or nested list," - f" but got {type(X_test)}") - """ - _ = sklearn.utils.check_array( - X_test, - force_all_finite=True, - ensure_2d=False, - allow_nd=True, - accept_sparse=False, - accept_large_sparse=False - ) - """ - first_sequence = np.array(X_train[0]) - - if self._extend_feature_dims: - first_sequence = np.expand_dims(first_sequence, axis=-1) - self.n_feature_dims = 1 - self._fit(first_sequence) + for seq_idx, (X_train_seq, X_test_seq) in enumerate(zip(X_train, X_test)): + self.feature_validators[seq_idx].fit(X_train_seq, X_test_seq) + categorical_columns[seq_idx] = self.feature_validators[seq_idx].categorical_columns + numerical_columns[seq_idx] = self.feature_validators[seq_idx].numerical_columns + categories[seq_idx] = self.feature_validators[seq_idx].categories + num_features[seq_idx] = self.feature_validators[seq_idx].num_features + else: + for seq_idx, X_train_seq in enumerate(X_train): + self.feature_validators[seq_idx].fit(X_train_seq) + + categorical_columns[seq_idx] = self.feature_validators[seq_idx].categorical_columns + numerical_columns[seq_idx] = self.feature_validators[seq_idx].numerical_columns + categories[seq_idx] = self.feature_validators[seq_idx].categories + num_features[seq_idx] = self.feature_validators[seq_idx].num_features + + if not np.all(np.asarray(categorical_columns) == categorical_columns[0]): + raise ValueError(f"All the sequence needs to have the same categorical columns!") + if not np.all(np.asarray(categories) == categories[0]): + raise ValueError(f"All the sequence 
needs to have the same categories!") + if not np.all(np.asarray(numerical_columns) == numerical_columns[0]): + raise ValueError(f"All the sequence needs to have the same Numerical columns!") + if not np.all(np.asarray(num_features) == num_features[0]): + raise ValueError(f"All the sequence needs to have the same number of features!") + + self.categories = categories[0] + self.num_features = num_features[0] + self.categorical_columns = categorical_columns[0] + self.numerical_columns = numerical_columns[0] + + self.feat_type = self.feature_validators[0].feat_type + self.data_type = self.feature_validators[0].data_type + self.dtypes = self.feature_validators[0].dtypes + self.column_order = self.feature_validators[0].column_order self._is_fitted = True return self - def validate_ts_data(self, X, is_train_set=True): - n_feature_dims = [0] * len(X) - seq_ndims = [0] * len(X) - for idx_seq, x in enumerate(X): - x_array_shape = np.array(x).shape - x_array_n_dims = len(x_array_shape) - seq_ndims[idx_seq] = x_array_n_dims - - if x_array_n_dims == 1: - # As lots of time series prediction tasks only have one sequence feature, we will not raise an error here - #self.logger.warning(f"For each piece of time series data, we will automatically convert 1D vector to" - # f"2D matrix!") - self._extend_feature_dims = True - n_feature_dims[idx_seq] = 1 - elif x_array_n_dims > 2: - raise ValueError(f"Invalid number of dimensions for time series train data") - else: - n_feature_dims[idx_seq] = x_array_shape[-1] - - - if not np.all(np.asarray(seq_ndims) == seq_ndims[0]): - raise ValueError(f"All the sequence needs to have the same shape!") - if not np.all(np.asarray(n_feature_dims) == n_feature_dims[0]): - raise ValueError(f"Number of features does not match for all the sequence") - - if is_train_set: - self.n_feature_dims = n_feature_dims[0] - self.seq_ndims = seq_ndims[0] - else: - if seq_ndims[0] != self.seq_ndims: - raise ValueError("number of sequence dimensions does not match for training and test sets!") - if n_feature_dims[0] != self.n_feature_dims: - raise ValueError("number of feature dimensions does not match for training and test sets!") - def transform(self, X: np.ndarray) -> np.ndarray: """ @@ -142,18 +105,6 @@ def transform(self, X: np.ndarray) -> np.ndarray: if not self._is_fitted: raise NotFittedError("Cannot call transform on a validator that is not fitted") - if self._extend_feature_dims: - for seq_idx in range(len(X)): - X[seq_idx] = np.expand_dims(X[seq_idx], axis=-1) + for seq_idx in range(len(X)): + X[seq_idx] = self.feature_validators[seq_idx].transform(X[seq_idx]) return X - """ - return sklearn.utils.check_array( - X, - force_all_finite=True, - ensure_2d=False, - allow_nd=True, - accept_sparse=False, - accept_large_sparse=False - ) - """ - diff --git a/autoPyTorch/data/time_series_forecasting_target_validator.py b/autoPyTorch/data/time_series_forecasting_target_validator.py index d5d86d4c6..9a2d25b7e 100644 --- a/autoPyTorch/data/time_series_forecasting_target_validator.py +++ b/autoPyTorch/data/time_series_forecasting_target_validator.py @@ -1,17 +1,14 @@ from autoPyTorch.data.tabular_target_validator import TabularTargetValidator -import copy import typing import logging import numpy as np import pandas as pd -from pandas.api.types import is_numeric_dtype import scipy.sparse -import sklearn.utils -from sklearn import preprocessing +import sklearn from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError from sklearn.utils.multiclass import type_of_target @@ 
-24,15 +21,15 @@ class TimeSeriesForecastingTargetValidator(TabularTargetValidator): def __init__(self, is_classification: bool = False, logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger - ]] = None, + ]] = None, ) -> None: TabularTargetValidator.__init__(self, is_classification, logger) - self._extend_feature_dims = False + self.target_validators = None def fit( - self, - y_train: SUPPORTED_TARGET_TYPES, - y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + self, + y_train: SUPPORTED_TARGET_TYPES, + y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the targets @@ -45,70 +42,43 @@ def fit( A hold out set of data used of the targets. It is also used to fit the categories of the encoder. """ - # Check that the data is valid - y_train_first_seq = np.array(y_train[0]) - if len(y_train_first_seq.shape) == 1: - y_train_first_seq = np.expand_dims(y_train_first_seq, -1) - self._check_data(y_train_first_seq) + if y_test is not None: + if len(y_train) != len(y_test): + raise ValueError(f"Training target needs to have the same number sequences as the test target") - num_seq = len(y_train) + self.target_validators = [TabularTargetValidator(self.is_classification, self.logger) for _ in + range(len(y_train))] - if len(np.shape(y_train[0])) == 1: - self._extend_feature_dims = True - num_target_train = [1] * num_seq - else: - num_target_train = [0] * num_seq - for seq_idx in range(num_seq): - num_target_train[seq_idx] = np.shape(y_train[seq_idx])[-1] + out_dimensionality = [[] for _ in range(len(y_train))] + type_of_target = [""] * len(y_train) if y_test is not None: - y_test_first_seq = np.array(y_train[0]) - if len(y_test_first_seq.shape) == 1: - y_test_first_seq = np.expand_dims(y_test_first_seq, -1) + for seq_idx, (y_train_seq, y_test_seq) in enumerate(zip(y_train, y_test)): + self.target_validators[seq_idx].fit(y_train_seq, y_test_seq) + + out_dimensionality[seq_idx] = self.target_validators[seq_idx].out_dimensionality + type_of_target[seq_idx] = self.target_validators[seq_idx].type_of_target - self._check_data(y_test_first_seq) - if len(y_train) != len(y_test): - raise ValueError("Training test must have the same amount of sequences as test set!") - if len(np.shape(y_train[1])) == 1: - num_target_test = [1] * num_seq - else: - num_target_test = [0] * num_seq - for seq_idx in range(num_seq): - test_seq = y_test[seq_idx] - test_seq_shape = np.shape(test_seq) - num_target_test[seq_idx] = test_seq_shape[-1] - - if isinstance(y_train, pd.DataFrame): - y_train = typing.cast(pd.DataFrame, y_train) - y_test = typing.cast(pd.DataFrame, y_test) - if y_train.columns.tolist() != y_test.columns.tolist(): - raise ValueError( - "Train and test targets must both have the same columns, yet " - "y={} and y_test={} ".format( - y_train.columns, - y_test.columns - ) - ) - - if list(y_train.dtypes) != list(y_test.dtypes): - raise ValueError("Train and test targets must both have the same dtypes") - - if not np.all(np.asarray(num_target_test) == num_target_test[0]): - raise ValueError("Test sets have inconsistent number of targets") - - if not np.all(np.asarray(num_target_train) == num_target_train[0]): - raise ValueError("Train sets have inconsistent number of targets") - - if self.out_dimensionality is None: - self.out_dimensionality = 1 if self._extend_feature_dims else num_target_train[0] else: - _n_outputs = 1 if self._extend_feature_dims else num_target_train[0] - if self.out_dimensionality != 
_n_outputs: - raise ValueError('Number of outputs changed from %d to %d!' % - (self.out_dimensionality, _n_outputs)) + for seq_idx, y_train_seq in enumerate(y_train): + self.target_validators[seq_idx].fit(y_train_seq) - # Fit on the training data - self._fit(y_train, y_test) + out_dimensionality[seq_idx] = self.target_validators[seq_idx].out_dimensionality + type_of_target[seq_idx] = self.target_validators[seq_idx].type_of_target + + if not np.all(np.asarray(out_dimensionality) == out_dimensionality[0]): + raise ValueError(f"All the sequence needs to have the same out_dimensionality!") + # TODO consider how to handle "continuous" and "multiple_classes" data type + """ + if not np.all(np.asarray(type_of_target) == type_of_target[0]): + raise ValueError(f"All the sequence needs to have the same type_of_target!") + """ + + self.out_dimensionality = out_dimensionality[0] + self.type_of_target = type_of_target[0] + + self.data_type = self.target_validators[0].data_type + self.dtype = self.target_validators[0].dtype self._is_fitted = True @@ -120,116 +90,11 @@ def transform( ) -> np.ndarray: if not self._is_fitted: raise NotFittedError("Cannot call transform on a validator that is not fitted") - - # Check the data here so we catch problems on new test data - y_first_seq = np.array(y[0]) - if len(y_first_seq.shape) == 1: - y_train_first_seq = np.expand_dims(y_first_seq, -1) - - self._check_data(y_train_first_seq) - if self._extend_feature_dims: - for seq_idx in range(len(y)): - y[seq_idx] = np.expand_dims(y[seq_idx], axis=-1) - - """ - # sklearn check array will make sure we have the - # correct numerical features for the array - # Also, a numpy array will be created - y = sklearn.utils.check_array( - y, - force_all_finite=True, - ensure_2d=False, - allow_nd=True, - accept_sparse=False, - accept_large_sparse=False - ) - """ + for seq_idx in range(len(y)): + y[seq_idx] = self.target_validators[seq_idx].transform(y[seq_idx]) return y """ Validator for time series forecasting, currently only consider regression tasks TODO: Considering Classification Validator - """ - def _check_data( - self, - y: SUPPORTED_TARGET_TYPES, - ) -> None: - """ - Perform dimensionality and data type checks on the targets - - Arguments: - y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]): - A set of features whose dimensionality and data type is going to be checked - """ - - if not isinstance( - y, (np.ndarray, pd.DataFrame, list, pd.Series)) and not scipy.sparse.issparse(y): - raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames," - " pd.Series, sparse data and Python Lists as targets, yet, " - "the provided input is of type {}".format( - type(y) - )) - - # Sparse data muss be numerical - # Type ignore on attribute because sparse targets have a dtype - if scipy.sparse.issparse(y) and not np.issubdtype(y.dtype.type, # type: ignore[union-attr] - np.number): - raise ValueError("When providing a sparse matrix as targets, the only supported " - "values are numerical. Please consider using a dense" - " instead." - ) - - if self.data_type is None: - self.data_type = type(y) - if self.data_type != type(y): - self.logger.warning("AutoPyTorch previously received targets of type %s " - "yet the current features have type %s. 
Changing the dtype " - "of inputs to an estimator might cause problems" % ( - str(self.data_type), - str(type(y)), - ), - ) - - # No Nan is supported - has_nan_values = False - if hasattr(y, 'iloc'): - has_nan_values = typing.cast(pd.DataFrame, y).isnull().values.any() - if scipy.sparse.issparse(y): - y = typing.cast(scipy.sparse.spmatrix, y) - has_nan_values = not np.array_equal(y.data, y.data) - else: - # List and array like values are considered here - # np.isnan cannot work on strings, so we have to check for every element - # but NaN, are not equal to themselves: - has_nan_values = not np.array_equal(y, y) - if has_nan_values: - raise ValueError("Target values cannot contain missing/NaN values. " - "This is not supported by scikit-learn. " - ) - - # Pandas Series is not supported for multi-label indicator - # This format checks are done by type of target - try: - self.type_of_target = type_of_target(y[0]) - except Exception as e: - raise ValueError("The provided data could not be interpreted by AutoPyTorch. " - "While determining the type of the targets via type_of_target " - "run into exception: {}.".format(e)) - - supported_output_types = ('binary', - 'continuous', - 'continuous-multioutput', - 'multiclass', - 'multilabel-indicator', - # Notice unknown/multiclass-multioutput are not supported - # This can only happen during testing only as estimators - # should filter out unsupported types. - ) - if self.type_of_target not in supported_output_types: - raise ValueError("Provided targets are not supported by AutoPyTorch. " - "Provided type is {} whereas supported types are {}.".format( - self.type_of_target, - supported_output_types - )) - - + """ \ No newline at end of file diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 9739b0868..c05541e62 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -8,6 +8,7 @@ from autoPyTorch.data.time_series_forecasting_target_validator import TimeSeriesForecastingTargetValidator from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger +# TODO create a minxin class to perform same operations on both feature and target validators class TimeSeriesForecastingInputValidator(TimeSeriesInputValidator): """ diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index d4dce13b5..27c7ac8fd 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -54,7 +54,7 @@ class HoldoutValTypes(IntEnum): 'val_share': 0.33, }, HoldoutValTypes.time_series_hold_out_validation: { - 'val_share': 0.33 + 'val_share': 0.2 }, CrossValTypes.k_fold_cross_validation: { 'num_splits': 3, @@ -151,7 +151,10 @@ def time_series_hold_out_validation(val_share: float, indices: np.ndarray, **kwa Returns: """ - train, val = train_test_split(indices, test_size=val_share, shuffle=False) + # TODO consider how we handle test size properly + test_size = int(val_share * len(indices)) + cv = TimeSeriesSplit(n_splits=2, test_size=test_size, gap=kwargs['n_prediction_steps']) + train, val = list(cv.split(indices))[-1] return train, val @@ -170,6 +173,6 @@ def time_series_cross_validation(num_splits: int, indices: np.ndarray, **kwargs: """ # TODO: we use gap=n_prediction_step here, we need to consider if we want to implement n_prediction_step here or # under DATALOADER!!! 
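Illustrative sketch (toy indices, not part of the patch): the `gap` and `test_size` arguments used in these splitters are available from scikit-learn 0.24 onwards; `gap` leaves `n_prediction_steps` unused points between each training prefix and its validation block:

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

indices = np.arange(10)
cv = TimeSeriesSplit(n_splits=2, test_size=3, gap=1)
for train_fold, val_fold in cv.split(indices):
    print(train_fold, val_fold)
# [0 1 2] [4 5 6]
# [0 1 2 3 4 5] [7 8 9]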
- cv = TimeSeriesSplit(n_splits=num_splits) + cv = TimeSeriesSplit(n_splits=num_splits, gap=kwargs['n_prediction_steps']) splits = list(cv.split(indices)) return splits diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index ab5b94987..c3b917d15 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -1,5 +1,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union, cast import warnings +import bisect import numpy as np @@ -8,6 +9,7 @@ from torch.utils.data.dataset import Dataset, Subset, ConcatDataset + import torchvision.transforms from autoPyTorch.constants import ( @@ -41,50 +43,203 @@ #TIME_SERIES_REGRESSION_INPUT = Tuple[np.ndarray, np.ndarray] #TIME_SERIES_CLASSIFICATION_INPUT = Tuple[np.ndarray, np.ndarray] -""" + class TimeSeriesSequence(BaseDataset): def __init__(self, - train_tensors: Union[np.ndarray, List[List]], + X: Union[np.ndarray, pd.DataFrame], + Y: Union[np.ndarray, pd.Series], + X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, + Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, dataset_name: Optional[str] = None, - val_tensors: Optional[ Union[np.ndarray, List[List]]] = None, - test_tensors: Optional[ Union[np.ndarray, List[List]]] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, + shuffle: bool = False, seed: Optional[int] = 42, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, + n_prediction_steps: int = 1, ): + """ + A dataset representing a time series sequence. + Args: + train_tensors: + dataset_name: + val_tensors: + test_tensors: + resampling_strategy: + resampling_strategy_args: + seed: + train_transforms: + val_transforms: + n_prediction_steps: int, how many steps need to be predicted in advance + """ + train_tensors = (X, Y) + test_tensors = (X_test, Y_test) + self.n_prediction_steps = n_prediction_steps + if dataset_name is not None: self.dataset_name = dataset_name else: self.dataset_name = hash_array_or_matrix(train_tensors[0]) - if not hasattr(train_tensors[0], 'shape'): - type_check(train_tensors, val_tensors) + self.train_tensors = train_tensors - self.val_tensors = val_tensors + self.val_tensors = None self.test_tensors = test_tensors - self.cross_validators = {} - self.holdout_validators = {} - self.rand = np.random.RandomState(seed=seed) - self.shuffle = False - self.task_type: Optional[str] = None + self.rand = np.random.RandomState(seed=seed) + self.shuffle = shuffle self.resampling_strategy = resampling_strategy self.resampling_strategy_args = resampling_strategy_args + # we only allow time series cross validation and holdout validation self.cross_validators = get_cross_validators(CrossValTypes.time_series_cross_validation) self.holdout_validators = get_holdout_validators(HoldoutValTypes.time_series_hold_out_validation) - self.splits = self.get_splits_from_resampling_strategy() - # We also need to be able to transform the data, be it for pre-processing # or for augmentation self.train_transform = train_transforms self.val_transform = val_transforms -""" + + def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]: + """ + get a subsequent of time series data, unlike vanilla tabular dataset, we obtain all the previous sequences + until the given index, this allows us to do further 
transformation when the + + Args: + index (int): what element to yield from all the train/test tensors + train (bool): Whether to apply a train or test transformation, if any + + Returns: + A transformed single point prediction + """ + if index == -1: + index = self.__len__() + + if hasattr(self.train_tensors[0], 'loc'): + X = self.train_tensors[0].iloc[:index + 1] + else: + X = self.train_tensors[0][:index + 1] + + if self.train_transform is not None and train: + X = self.train_transform(X) + elif self.val_transform is not None and not train: + X = self.val_transform(X) + + # In case of prediction, the targets are not provided + Y = self.train_tensors[1] + if Y is not None: + # Y = Y[:index + self.n_prediction_steps] + Y = Y[index] + else: + Y = None + + return X, Y + + def __len__(self) -> int: + return self.train_tensors[0].shape[0] + + def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]]]: + """ + Creates a set of splits based on a resampling strategy provided, apart from the + 'get_splits_from_resampling_strategy' implemented in base_dataset, here we will get self.upper_window_size + with the given value + + Returns + (List[Tuple[List[int], List[int]]]): splits in the [train_indices, val_indices] format + """ + splits = [] + if isinstance(self.resampling_strategy, HoldoutValTypes): + val_share = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( + 'val_share', None) + if self.resampling_strategy_args is not None: + val_share = self.resampling_strategy_args.get('val_share', val_share) + splits.append(self.create_holdout_val_split(holdout_val_type=self.resampling_strategy, + val_share=val_share)) + + if self.val_tensors is not None: + upper_window_size = self.__len__() - self.n_prediction_steps + else: + upper_window_size = int(self.__len__() * val_share) - self.n_prediction_steps + + elif isinstance(self.resampling_strategy, CrossValTypes): + num_splits = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( + 'num_splits', None) + if self.resampling_strategy_args is not None: + num_splits = self.resampling_strategy_args.get('num_splits', num_splits) + # Create the split if it was not created before + splits.extend(self.create_cross_val_splits( + cross_val_type=self.resampling_strategy, + num_splits=cast(int, num_splits), + )) + upper_window_size = (self.__len__() // num_splits) - self.n_prediction_steps + else: + raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") + self.upper_window_size = upper_window_size + return splits + + def create_cross_val_splits( + self, + cross_val_type: CrossValTypes, + num_splits: int + ) -> List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]: + """ + This function creates the cross validation split for the given task. + + It is done once per dataset to have comparable results among pipelines + Args: + cross_val_type (CrossValTypes): + num_splits (int): number of splits to be created + + Returns: + (List[Tuple[List[int], List[int]]]): splits in the [train_indices, val_indices] format + """ + # Create just the split once + # This is gonna be called multiple times, because the current dataset + # is being used for multiple pipelines. That is, to be efficient with memory + # we dump the dataset to memory and read it on a need basis. 
So this function + # should be robust against multiple calls, and it does so by remembering the splits + if not isinstance(cross_val_type, CrossValTypes): + raise NotImplementedError(f'The selected `cross_val_type` "{cross_val_type}" is not implemented.') + kwargs = {"n_prediction_steps": self.n_prediction_steps} + split = self.cross_validators[cross_val_type.name](num_splits, **kwargs) + return split + + def create_holdout_val_split( + self, + holdout_val_type: HoldoutValTypes, + val_share: float, + ) -> Tuple[np.ndarray, np.ndarray]: + """ + This function creates the holdout split for the given task. + + It is done once per dataset to have comparable results among pipelines + Args: + holdout_val_type (HoldoutValTypes): + val_share (float): share of the validation data + + Returns: + (Tuple[np.ndarray, np.ndarray]): Tuple containing (train_indices, val_indices) + """ + if holdout_val_type is None: + raise ValueError( + '`val_share` specified, but `holdout_val_type` not specified.' + ) + if self.val_tensors is not None: + raise ValueError( + '`val_share` specified, but the Dataset was a given a pre-defined split at initialization already.') + if val_share < 0 or val_share > 1: + raise ValueError(f"`val_share` must be between 0 and 1, got {val_share}.") + if not isinstance(holdout_val_type, HoldoutValTypes): + raise NotImplementedError(f'The specified `holdout_val_type` "{holdout_val_type}" is not supported.') + kwargs = {"n_prediction_steps": self.n_prediction_steps} + train, val = self.holdout_validators[holdout_val_type.name](val_share, self._get_indices(), **kwargs) + return train, val -class TimeSeriesForecastingDataset(BaseDataset): +class TimeSeriesForecastingDataset(BaseDataset, ConcatDataset): + datasets: List[TimeSeriesSequence] + cumulative_sizes: List[int] + def __init__(self, X: Union[np.ndarray, List[List]], Y: Union[np.ndarray, pd.Series], @@ -93,7 +248,7 @@ def __init__(self, dataset_name: Optional[str] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, - shuffle: Optional[bool] = False, + shuffle: Optional[bool] = True, seed: Optional[int] = 42, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, @@ -109,93 +264,121 @@ def __init__(self, """ assert X is not Y, "Training and Test data needs to belong two different object!!!" 
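        # Rough outline of the construction below: validate and transform the raw sequences,
        # flatten them into single 2d train/test tensors so the tabular preprocessing can be
        # reused, wrap each sequence in its own TimeSeriesSequence, and concatenate the
        # per-sequence datasets via ConcatDataset so that indices stay consistent across sequences.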
self.n_prediction_steps = n_prediction_steps + if validator is None: + validator = TimeSeriesForecastingInputValidator(is_classification=False) self.validator = validator - if self.validator is not None: - if not isinstance(validator, TimeSeriesForecastingInputValidator): - raise ValueError(f"This dataset only support TimeSeriesForecastingInputValidator " - f"but receive {type(validator)}") - X, Y = self.validator.transform(X, Y) - self.num_features = self.validator.feature_validator.n_feature_dims - self.num_target = self.validator.target_validator.out_dimensionality + if not isinstance(validator, TimeSeriesForecastingInputValidator): + raise ValueError(f"This dataset only support TimeSeriesForecastingInputValidator " + f"but receive {type(validator)}") - if X_test is not None: - X_test, Y_test = self.validator.transform(X_test, Y_test) - else: - self.num_features = np.shape(X[0])[-1] - self.num_target = np.shape(Y[0])[-1] + self.validator.fit(X_train=X, y_train=Y, X_test=X_test, y_test=Y_test,) - self.num_sequences = len(X) - self.sequence_lengths_train = [0] * self.num_sequences - for seq_idx in range(self.num_sequences): - self.sequence_lengths_train[seq_idx] = len(X[seq_idx]) + self.numerical_columns = self.validator.feature_validator.numerical_columns + self.categorical_columns = self.validator.feature_validator.categorical_columns - self.sequence_lengths_val = [0] * self.num_sequences - self.sequence_lengths_test = [0] * self.num_sequences + self.num_features = self.validator.feature_validator.num_features # type: int + self.num_target = self.validator.target_validator.out_dimensionality # type: int - self.categorical_columns = validator.feature_validator.categorical_columns - self.numerical_columns = validator.feature_validator.numerical_columns + X, Y = self.validator.transform(X, Y) + + self.shuffle = shuffle + self.rand = np.random.RandomState(seed=seed) + + self.resampling_strategy = resampling_strategy + self.resampling_strategy_args = resampling_strategy_args + + # We also need to be able to transform the data, be it for pre-processing + # or for augmentation + self.train_transform = train_transforms + self.val_transform = val_transforms + + self.num_sequences = len(X) + self.sequence_lengths = [0] * self.num_sequences + for seq_idx in range(self.num_sequences): + self.sequence_lengths[seq_idx] = len(X[seq_idx]) - num_train_data = np.sum(self.sequence_lengths_train) + num_train_data = np.sum(self.sequence_lengths) X_train_flatten = np.empty([num_train_data, self.num_features]) - y_train_flatten = np.empty([num_train_data, self.num_features]) + Y_train_flatten = np.empty([num_train_data, self.num_target]) start_idx = 0 self.sequences = [] + # flatten the sequences to allow data preprocessing - if shuffle: - warnings.WarningMessage("Time Series Forecasting will not shuffle the data") - for seq_idx, seq_length in enumerate(self.sequence_lengths_train): + for seq_idx, seq_length in enumerate(self.sequence_lengths): end_idx = start_idx + seq_length - X_train_flatten[start_idx: end_idx] = np.array(X[seq_idx]) - y_train_flatten[start_idx: end_idx] = np.array(Y[seq_idx]) + X_train_flatten[start_idx: end_idx] = np.array(X[seq_idx]).reshape([-1, self.num_features]) + Y_train_flatten[start_idx: end_idx] = np.array(Y[seq_idx]).reshape([-1, self.num_target]) start_idx = end_idx - train_tensors = (X_train_flatten, y_train_flatten) + sequence_lengths_test = [0] * self.num_sequences - if X_test is not None and Y_test is not None: + if X_test is not None or Y_test is not None: for seq_idx in 
range(self.num_sequences): - self.sequence_lengths_test[seq_idx] = len(X_test[seq_idx]) - num_test_data = np.sum(self.sequence_lengths_test) + sequence_lengths_test[seq_idx] = len(X_test[seq_idx]) + num_test_data = np.sum(sequence_lengths_test) X_test_flatten = np.empty([num_test_data, self.num_features]) - y_test_flatten = np.empty([num_test_data, self.num_target]) + Y_test_flatten = np.empty([num_test_data, self.num_target]) start_idx = 0 - for seq_idx, seq_length in enumerate(self.sequence_lengths_test): + for seq_idx, seq_length in enumerate(sequence_lengths_test): end_idx = start_idx + seq_length - X_test_flatten[start_idx: end_idx] = np.array(X_test[seq_idx]) - y_test_flatten[start_idx: end_idx] = np.array(Y_test[seq_idx]) + X_test_flatten[start_idx: end_idx] = np.array(X_test[seq_idx]).reshape([-1, self.num_features]) + Y_test_flatten[start_idx: end_idx] = np.array(Y_test[seq_idx]).reshape([-1, self.num_target]) start_idx = end_idx - test_tensors = (X_test_flatten, y_test_flatten) + + if dataset_name is None: + dataset_name_seqs = [None] * self.num_sequences else: - test_tensors = None - """ - super(TimeSeriesForecastingDataset, self).__init__(train_tensors=train_tensors, - dataset_name=dataset_name, - test_tensors=test_tensors, - resampling_strategy=resampling_strategy, - resampling_strategy_args=resampling_strategy_args, - shuffle=False, - seed=seed, - train_transforms=train_transforms, - val_transforms=val_transforms) - """ - if dataset_name is not None: - self.dataset_name = dataset_name + dataset_name_seqs = [f"{dataset_name}_sequence_{i}" for i in range(self.num_sequences)] + + # initialize datasets + sequences_kwargs = {"resampling_strategy": resampling_strategy, + "resampling_strategy_args": resampling_strategy_args, + "train_transforms": self.train_transform, + "val_transforms": self.val_transform, + "n_prediction_steps": n_prediction_steps} + idx_start_train = 0 + idx_start_test = 0 + sequence_datasets = [] + if X_test is None or Y_test is None: + for seq_idx, seq_length_train in enumerate(self.sequence_lengths): + idx_end_train = idx_start_train + seq_length_train + sequence = TimeSeriesSequence(X=X_train_flatten[idx_start_train: idx_end_train], + Y=Y_train_flatten[idx_start_train: idx_end_train], + dataset_name=dataset_name_seqs[seq_idx], + seed=self.rand.randint(0, 2**20), + **sequences_kwargs) + sequence_datasets.append(sequence) + idx_start_train = idx_end_train else: - self.dataset_name = hash_array_or_matrix(train_tensors[0]) - - self.train_tensors = train_tensors + for seq_idx, (seq_length_train, seq_length_test) in enumerate(zip(self.sequence_lengths, sequence_lengths_test)): + idx_end_train = idx_start_train + seq_length_train + idx_end_test = idx_start_test + seq_length_test + sequence = TimeSeriesSequence(X=X_train_flatten[idx_start_train: idx_end_train], + Y=Y_train_flatten[idx_start_train: idx_end_train], + X_test=X_test_flatten[idx_start_test: idx_end_test], + Y_test=Y_test_flatten[idx_start_test: idx_end_test], + dataset_name=dataset_name_seqs[seq_idx], + seed=self.rand.randint(0, 2**20), + **sequences_kwargs) + sequence_datasets.append(sequence) + idx_start_train = idx_end_train + + ConcatDataset.__init__(self, datasets=sequence_datasets) + + self.train_tensors = (X_train_flatten, Y_train_flatten) + if X_test is not None or Y_test is not None: + self.test_tensors = (X_test_flatten, Y_test_flatten) + else: + self.test_tensors = (None, None) self.val_tensors = None - self.test_tensors = test_tensors - self.rand = np.random.RandomState(seed=seed) - 
self.shuffle = False - self.resampling_strategy = resampling_strategy - self.resampling_strategy_args = resampling_strategy_args + self.task_type: Optional[str] = None self.issparse: bool = issparse(self.train_tensors[0]) # TODO find a way to edit input shape! - self.input_shape: Tuple[int] = [np.min(self.sequence_lengths_train), self.num_features] + self.input_shape: Tuple[int] = (np.min(self.sequence_lengths),self.num_features) if len(self.train_tensors) == 2 and self.train_tensors[1] is not None: self.output_type: str = type_of_target(self.train_tensors[1]) @@ -209,12 +392,6 @@ def __init__(self, # TODO: Look for a criteria to define small enough to preprocess self.is_small_preprocess = False - # We also need to be able to transform the data, be it for pre-processing - # or for augmentation - self.train_transform = train_transforms - self.val_transform = val_transforms - - self.task_type = TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING] self.numerical_features: List[int] = list(range(self.num_features)) @@ -225,10 +402,43 @@ def __init__(self, self.splits = self.get_splits_from_resampling_strategy() - # We also need to be able to transform the data, be it for pre-processing - # or for augmentation - self.train_transform = train_transforms - self.val_transform = val_transforms + def __getitem__(self, idx, train=True): + if idx < 0: + if -idx > len(self): + raise ValueError("absolute value of index should not exceed dataset length") + idx = len(self) + idx + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return self.datasets[dataset_idx].__getitem__(sample_idx, train) + + def update_transform(self, transform: Optional[torchvision.transforms.Compose], + train: bool = True, + ) -> 'BaseDataset': + """ + During the pipeline execution, the pipeline object might propose transformations + as a product of the current pipeline configuration being tested. 
+ + This utility allows to return a self with the updated transformation, so that + a dataloader can yield this dataset with the desired transformations + + Args: + transform (torchvision.transforms.Compose): The transformations proposed + by the current pipeline + train (bool): Whether to update the train or validation transform + + Returns: + self: A copy of the update pipeline + """ + if train: + self.train_transform = transform + else: + self.val_transform = transform + for seq in self.datasets: + seq = seq.update_transform(transform, train) + return self def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]]]: """ @@ -239,19 +449,19 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] Returns (List[Tuple[List[int], List[int]]]): splits in the [train_indices, val_indices] format """ - splits= [] + splits = [] if isinstance(self.resampling_strategy, HoldoutValTypes): val_share = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( 'val_share', None) if self.resampling_strategy_args is not None: val_share = self.resampling_strategy_args.get('val_share', val_share) splits.append(self.create_holdout_val_split(holdout_val_type=self.resampling_strategy, - val_share=val_share)) + val_share=val_share)) if self.val_tensors is not None: - upper_window_size = np.min(self.sequence_lengths_train) - self.n_prediction_steps + upper_window_size = np.min(self.sequence_lengths) - self.n_prediction_steps else: - upper_window_size = int(np.min(self.sequence_lengths_train) * val_share) - self.n_prediction_steps + upper_window_size = int(np.min(self.sequence_lengths) * 1 - val_share) - self.n_prediction_steps elif isinstance(self.resampling_strategy, CrossValTypes): num_splits = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( @@ -263,9 +473,10 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] cross_val_type=self.resampling_strategy, num_splits=cast(int, num_splits), )) - upper_window_size = (np.min(self.sequence_lengths_train) // num_splits) - self.n_prediction_steps + upper_window_size = (np.min(self.sequence_lengths) // num_splits) - self.n_prediction_steps else: raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") + self.upper_window_size = upper_window_size return splits @@ -289,13 +500,6 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> dataset_properties.update({'upper_window_size': self.upper_window_size}) return dataset_properties - def update_sequence_lengths_train(self, sequence_length): - if len(sequence_length) != self.num_sequences: - raise ValueError("number of sequence must match!") - if np.sum(sequence_length) != self.train_tensors[0].shape[0]: - raise ValueError("sequence length needs to be consistent with train tensors") - self.sequence_lengths_train = sequence_length - def create_cross_val_splits( self, cross_val_type: CrossValTypes, @@ -318,26 +522,24 @@ def create_cross_val_splits( # is being used for multiple pipelines. That is, to be efficient with memory # we dump the dataset to memory and read it on a need basis. 
So this function # should be robust against multiple calls, and it does so by remembering the splits + if not isinstance(cross_val_type, CrossValTypes): raise NotImplementedError(f'The selected `cross_val_type` "{cross_val_type}" is not implemented.') - kwargs = {"n_prediction_steps": self.n_prediction_steps} - splits = [[() for _ in range(self.num_sequences)] for _ in range(num_splits)] - idx_all = self._get_indices() idx_start = 0 - for idx_seq, seq_length in enumerate(self.sequence_lengths_train): - idx_end = idx_start + seq_length - split = self.cross_validators[cross_val_type.name](num_splits, idx_all[idx_start: idx_end], **kwargs) + splits = [[[] for _ in range(len(self.datasets))] for _ in range(num_splits)] + for idx_seq, dataset in enumerate(self.datasets): + split = dataset.create_cross_val_splits(cross_val_type, num_splits=num_splits) for idx_split in range(num_splits): - splits[idx_split][idx_seq] = split[idx_split] - idx_start = idx_end + splits[idx_split][idx_seq] = idx_start + split[idx_split] + idx_start += self.sequence_lengths[idx_seq] # in this case, splits is stored as : # [ first split, second_split ...] # first_split = [([0], [1]), ([2], [3])] .... splits_merged = [] for i in range(num_splits): split = splits[i] - train_indices = np.concatenate([sp[0] for sp in split]) - test_indices = np.concatenate([sp[1] for sp in split]) + train_indices = np.hstack([sp[0] for sp in split]) + test_indices = np.hstack([sp[1] for sp in split]) splits_merged.append((train_indices, test_indices)) return splits_merged @@ -361,33 +563,26 @@ def create_holdout_val_split( raise ValueError( '`val_share` specified, but `holdout_val_type` not specified.' ) - if self.val_tensors is not None: - raise ValueError( - '`val_share` specified, but the Dataset was a given a pre-defined split at initialization already.') + if val_share < 0 or val_share > 1: raise ValueError(f"`val_share` must be between 0 and 1, got {val_share}.") if not isinstance(holdout_val_type, HoldoutValTypes): raise NotImplementedError(f'The specified `holdout_val_type` "{holdout_val_type}" is not supported.') - kwargs = {"n_prediction_steps": self.n_prediction_steps} - splits = [[() for _ in range(self.num_sequences)] for _ in range(2)] - idx_all = self._get_indices() + splits = [[() for _ in range(len(self.datasets))] for _ in range(2)] idx_start = 0 - for idx_seq, seq_length in enumerate(self.sequence_lengths_train): - idx_end = idx_start + seq_length - split = self.holdout_validators[holdout_val_type.name](val_share, idx_all[idx_start: idx_end], **kwargs) + for idx_seq, dataset in enumerate(self.datasets): + split = dataset.create_holdout_val_split(holdout_val_type, val_share) for idx_split in range(2): - splits[idx_split][idx_seq] = split[idx_split] - idx_start = idx_end + splits[idx_split][idx_seq] = idx_start + split[idx_split] + idx_start += self.sequence_lengths[idx_seq] - train_indices = np.concatenate([sp for sp in splits[0]]) - test_indices = np.concatenate([sp for sp in splits[1]]) + train_indices = np.hstack([sp for sp in splits[0]]) + test_indices = np.hstack([sp for sp in splits[1]]) return train_indices, test_indices - - def _check_time_series_forecasting_inputs(train: np.ndarray, val: Optional[np.ndarray] = None) -> None: if train.ndim != 3 or any(isinstance(i, (list, np.ndarray)) for i in train): diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 
8dce8952d..b2870dd48 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -1,5 +1,7 @@ from typing import Any, Dict, Optional, Tuple +from torch.utils.data.sampler import SubsetRandomSampler + from autoPyTorch.pipeline.components.training.data_loader.time_series_data_loader import TimeSeriesDataLoader @@ -18,21 +20,65 @@ from autoPyTorch.datasets.base_dataset import BaseDataset -from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset -from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent -from autoPyTorch.utils.backend import Backend -from autoPyTorch.utils.common import FitRequirement, custom_collate_fn +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence +from autoPyTorch.utils.common import custom_collate_fn +from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader -class TimeSeriesForecastingDataLoader(TimeSeriesDataLoader): - """This class is an interface to the PyTorch Dataloader. +class ExpandTransformTimeSeries(object): + """Expand Dimensionality so tabular transformations see + a 2d Array, unlike the ExpandTransform defined under tabular dataset, the dimension is expanded + along the last axis + """ + def __call__(self, data: np.ndarray) -> np.ndarray: + if len(data.shape) <= 1: + data = np.expand_dims(data, axis=-1) + return data + + +class SequenceBuilder(object): + """build a time sequence token from the given time sequence + it requires two hyperparameters: sample_interval and window size + let's assume we have a time sequence + x = [0 1 2 3 4 5 6 7 8 9 10].with window_size=3 and sample resolution=2 + then the extracted time series is [6, 8, 10] (or x[-5,-3,-1]) + if window_size=3 and sample_resolution=3 + then the extracted token is [4, 7, 10] (or x[-7,-4,-1]) + + Parameters + ---------- + sample_interval : int, default=1 + sample resolution + + window_size : int, default=1 + sliding window size + """ + def __init__(self, sample_interval: int = 1, window_size: int = 1): + """ + initialization + Args: + sample_interval: int: sample resolution + window_size: int: the size of the sliding window + """ + self.sample_interval = sample_interval + self.window_size = window_size + + def __call__(self, data: np.ndarray) -> np.ndarray: + sample_indices = np.arange(-1 - self.sample_interval * (self.window_size - 1), 0, step=self.sample_interval) + try: + return data[sample_indices] + except IndexError: + raise ValueError("the length of the time sequence token is larger than the possible, please check the " + "sampler setting or reduce the window size") + + +class TimeSeriesForecastingDataLoader(FeatureDataLoader): + """This class is an interface to read time sequence data It gives the possibility to read various types of mapped datasets as described in: https://pytorch.org/docs/stable/data.html - """ - def __init__(self, batch_size: int = 64, window_size: int = 1, @@ -56,30 +102,7 @@ def __init__(self, self.sample_interval = 1 # length of the tail, for instance if a sequence_length = 2, sample_interval =2, n_prediction = 2, # the time sequence should look like: [X, y, X, y, y] [test_data](values in tail is marked with X) - self.tail_length = (self.window_size * self.sample_interval) + self.n_prediction_steps - 1 - - def transform(self, X: np.ndarray) -> np.ndarray: - """The 
transform function calls the transform function of the - underlying model and returns the transformed array. - - Args: - X (np.ndarray): input features - - Returns: - np.ndarray: Transformed features - """ - X.update({'train_data_loader': self.train_data_loader, - 'val_data_loader': self.val_data_loader, - 'X_train': self.datamanager.train_tensors[0], - 'y_train': self.datamanager.train_tensors[1]}) - if self.datamanager.val_tensors is not None and 'X_val' in X: - X.update({'X_val': self.datamanager.val_tensors[0], - 'y_val': self.datamanager.val_tensors[1]}) - if self.datamanager.test_tensors is not None and 'X_test' in X: - X.update({'X_test': self.datamanager.test_tensors[0], - 'y_test': self.datamanager.test_tensors[1]}) - - return X + self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: """ @@ -94,15 +117,15 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: """ sample_interval = X.get('sample_interval', 1) self.sample_interval = sample_interval - self.tail_length = (self.window_size * self.sample_interval) + self.n_prediction_steps - 1 + self.window_size = 5 + + self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 # Make sure there is an optimizer self.check_requirements(X, y) # Incorporate the transform to the dataset - datamanager = X['backend'].load_datamanager() - datamanager = self._update_dataset(datamanager) - + datamanager = X['backend'].load_datamanager() # type: TimeSeriesForecastingDataset self.train_transform = self.build_transform(X, mode='train') self.val_transform = self.build_transform(X, mode='val') self.test_transform = self.build_transform(X, mode='test') @@ -114,24 +137,42 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.val_transform, train=False, ) - if X['dataset_properties']["is_small_preprocess"]: # This parameter indicates that the data has been pre-processed for speed # Overwrite the datamanager with the pre-processes data datamanager.replace_data(X['X_train'], X['X_test'] if 'X_test' in X else None) - X['backend'].save_datamanager(datamanager, overwrite=True) + + self.n_prediction_steps = datamanager.n_prediction_steps train_dataset, val_dataset = datamanager.get_dataset_for_training(split_id=X['split_id']) - self.datamanager = datamanager + train_split, test_split = datamanager.splits[X['split_id']] + valid_indices = [] + idx_start = 0 + # to allow a time sequence data with resolution self.sample_interval and windows size with self.window_size + # we need to drop the first part of each sequence + for seq_idx, seq_length in enumerate(datamanager.sequence_lengths): + idx_end = idx_start + seq_length + full_sequence = np.arange(idx_start, idx_end)[self.subseq_length:] + valid_indices.append(full_sequence) + idx_start = idx_end + + valid_indices = np.hstack([valid_idx for valid_idx in valid_indices]) + _, sampler_indices_train, _ = np.intersect1d(train_split, valid_indices, return_indices=True) + + # test_indices not required as testsets usually lies on the trail of hte sequence + #_, sampler_indices_test, _ = np.intersect1d(test_split, valid_indices) + + sampler_train = SubsetRandomSampler(indices=sampler_indices_train) self.train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=min(self.batch_size, len(train_dataset)), - shuffle=True, + shuffle=False, num_workers=X.get('num_workers', 0), pin_memory=X.get('pin_memory', True), drop_last=X.get('drop_last', 
True), collate_fn=custom_collate_fn, + sampler=sampler_train, ) self.val_data_loader = torch.utils.data.DataLoader( @@ -146,138 +187,34 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: return self - def _update_dataset(self, datamanager: TimeSeriesForecastingDataset): + def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transforms.Compose: """ - update the dataset to build time sequence - """ - num_features = datamanager.num_features - num_sequences = datamanager.num_sequences - num_targets = datamanager.num_target - - X_train, y_train = datamanager.train_tensors - val_tensors = datamanager.val_tensors - test_tensors = datamanager.test_tensors - n_prediction_steps = datamanager.n_prediction_steps - sequence_lengths_train = datamanager.sequence_lengths_train - sequence_lengths_val = datamanager.sequence_lengths_val - sequence_lengths_test = datamanager.sequence_lengths_test - - self.num_sequences = num_sequences - self.num_features = num_features - #X_train = X_train.reshape([-1, population_size, num_features]) - #y_train = y_train.reshape([-1, population_size, num_target]) - - X_train_pieces = [[] for _ in range(num_sequences)] - y_train_pieces = [[] for _ in range(num_sequences)] - - X_val_pieces = [[] for _ in range(num_sequences)] - y_val_pieces = [[] for _ in range(num_sequences)] - - X_test_pieces = [[] for _ in range(num_sequences)] - y_test_pieces = [[] for _ in range(num_sequences)] - - X_val_tail = [[] for _ in range(num_sequences)] - num_data_points_seqs = [0] * num_sequences - start_idx_train = 0 - start_idx_test = 0 - start_idx_val = 0 - for i, seq_length_train in enumerate(sequence_lengths_train): - end_idx_train = start_idx_train + seq_length_train - num_datapoints_train = seq_length_train - (self.window_size - 1) * self.sample_interval - n_prediction_steps + 1 - - X_train_tmp = X_train[start_idx_train: end_idx_train] - y_train_tmp = y_train[start_idx_train: end_idx_train] - - y_train_tmp = y_train_tmp[-num_datapoints_train:] - if test_tensors is not None: - end_idx_test = start_idx_test + sequence_lengths_test[i] - X_test_tmp = test_tensors[0][start_idx_test: end_idx_test] - y_test_tmp = test_tensors[1][start_idx_test: end_idx_test] - - if val_tensors is not None: - end_idx_val = start_idx_val + sequence_lengths_val[i] - X_val_tmp = val_tensors[0][start_idx_val: end_idx_val] - y_val_tmp = val_tensors[1][start_idx_val: end_idx_val] - num_datapoints_val = sequence_lengths_val[i] - - X_val_tmp = np.concatenate([X_train_tmp[-self.tail_length:], X_val_tmp]) - X_test_tmp = np.concatenate([X_val_tmp[-self.tail_length:], X_test_tmp]) - X_val_tmp, y_val_tmp = self._ser2seq(X_val_tmp, y_val_tmp, num_datapoints_val, num_features, num_targets) - - X_val_pieces[i] = X_val_tmp - y_val_pieces[i] = y_val_tmp - - num_datapoints_test = sequence_lengths_test[i] - - X_test_tmp = np.concatenate([X_train_tmp[-self.tail_length:], X_test_tmp]) - X_val_tail[i] = X_test_tmp[-self.tail_length:] if self.tail_length > 1 \ - else np.zeros((0, num_features)).astype(dtype=X_test_tmp.dtype) - - X_test_tmp, y_test_tmp = self._ser2seq(X_test_tmp, y_test_tmp, num_datapoints_test, num_features, num_targets) - X_test_pieces[i] = X_test_tmp - y_test_pieces[i] = y_test_tmp - datamanager.test_tensors = test_tensors - - elif val_tensors is not None: - end_idx_val = start_idx_val + sequence_lengths_val[i] - X_val_tmp = val_tensors[0][start_idx_val: end_idx_val] - y_val_tmp = val_tensors[1][start_idx_val: end_idx_val] - num_datapoints_val = sequence_lengths_val[i] 
- X_val_tmp = np.concatenate([X_train_tmp[-self.tail_length:], X_val_tmp]) - - # used for prediction - X_val_tail[i] = X_val_tmp[-self.tail_length:] if self.tail_length > 1 \ - else np.zeros((0, num_features)).astype(dtype=X_val_tmp.dtype) - X_val_tmp, y_val_tmp = self._ser2seq(X_val_tmp, y_val_tmp, num_datapoints_val, num_features, num_targets) - - X_val_pieces[i] = X_val_tmp - y_val_pieces[i] = y_val_tmp - else: - X_val_tail[i] = X_train_tmp[-self.tail_length:] if self.tail_length > 1 \ - else np.zeros((0, num_features)).astype(dtype=X_train_tmp.dtype) - - X_train_tmp, y_train_tmp = self._ser2seq(X_train_tmp, y_train_tmp, - num_datapoints_train, num_features, num_targets) - X_train_pieces[i] = X_train_tmp - y_train_pieces[i] = y_train_tmp - - num_data_points_seqs[i] = num_datapoints_train - - train_tensors = (np.concatenate(X_train_pieces), np.concatenate(y_train_pieces)) - if test_tensors is not None: - test_tensors = (np.concatenate(X_test_pieces), np.concatenate(y_test_pieces)) - datamanager.test_tensors = test_tensors - if val_tensors is not None: - val_tensors = (np.concatenate(X_val_pieces), np.concatenate(y_val_pieces)) - datamanager.val_tensors = val_tensors - self.X_val_tail = X_val_tail - - datamanager.train_tensors = train_tensors - datamanager.update_sequence_lengths_train(num_data_points_seqs) - datamanager.splits = datamanager.get_splits_from_resampling_strategy() - return datamanager - - def _ser2seq(self, X_in, y_in, num_datapoints, num_features, num_targets): - """ - build a sliding window transformer for the given data - Args: - X_in (np.ndarray): input feature array to be transformed with shape - [time_series_length, population_size, num_features] - y_in (np.ndarray): input target array with shape [time_series_length, population_size, num_targets] - num_datapoints: number of actual data points stored in the dataset - num_features: number of features - num_targets: number of targets + Method to build a transformation that can pre-process input data + + Args: + X (X: Dict[str, Any]): Dependencies needed by current component to perform fit + mode (str): train/val/test + Returns: - X_in_trans: transformed input featuer array with shpae - [num_datapoints * population_size, sequence_length, num_features] - y_in_trans: transformed input target array with shape - [num_datapoints * population_size, num_targets] + A composition of transformations """ - X_in = np.concatenate([np.roll(X_in, shift=i * self.sample_interval, axis=0) for i in range(0, -self.window_size, -1)], - axis=1).astype(np.float32)[:num_datapoints] - X_in = X_in.reshape((-1, self.window_size, num_features)) - y_in = y_in.reshape((-1, num_targets)) - return X_in, y_in + + if mode not in ['train', 'val', 'test']: + raise ValueError("Unsupported mode provided {}. 
".format(mode)) + + candidate_transformations = [] # type: List[Callable] + + candidate_transformations.append((SequenceBuilder(sample_interval=self.sample_interval, + window_size=self.window_size))) + candidate_transformations.append((ExpandTransformTimeSeries())) + + if 'test' in mode or not X['dataset_properties']['is_small_preprocess']: + candidate_transformations.extend(X['preprocess_transforms']) + + # Transform to tensor + candidate_transformations.append(torch.from_numpy) + + return torchvision.transforms.Compose(candidate_transformations) def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: int = np.inf, ) -> torch.utils.data.DataLoader: @@ -285,19 +222,12 @@ def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: Creates a data loader object from the provided data, applying the transformations meant to validation objects """ - if len(X) != len(self.X_val_tail): - raise ValueError(f"Training data has {len(self.X_val_tail)} sequences but test data has {len(X)} sequences") - for i in range(len(X)): - num_points_X_in = np.shape(X[i])[0] - X_tmp = np.concatenate([self.X_val_tail[i], X[i]]) - X[i] = np.concatenate([np.roll(X_tmp, shift=i * self.sample_interval, axis=0) for i in range(0, -self.window_size, -1)], - axis=1).astype(np.float32)[:num_points_X_in] - - X = np.concatenate(X) - + X = X[-self.subseq_length - self.n_prediction_steps:] + if y is not None: + y = y[-self.subseq_length - self.n_prediction_steps:] - dataset = BaseDataset( - train_tensors=(X, y), + dataset = TimeSeriesSequence( + X=X, Y=y, # This dataset is used for loading test data in a batched format train_transforms=self.test_transform, val_transforms=self.test_transform, From 7ae7e26cd945e756685c32ec25451ee6e8ce781f Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 28 Jul 2021 16:42:40 +0200 Subject: [PATCH 043/347] align input features and targets --- autoPyTorch/datasets/time_series_dataset.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index c3b917d15..df31e7e95 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -112,8 +112,8 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]: Returns: A transformed single point prediction """ - if index == -1: - index = self.__len__() + if index < 0 : + index = self.__len__() + 1 - index if hasattr(self.train_tensors[0], 'loc'): X = self.train_tensors[0].iloc[:index + 1] @@ -129,14 +129,14 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]: Y = self.train_tensors[1] if Y is not None: # Y = Y[:index + self.n_prediction_steps] - Y = Y[index] + Y = Y[index + self.n_prediction_steps] else: Y = None return X, Y def __len__(self) -> int: - return self.train_tensors[0].shape[0] + return self.train_tensors[0].shape[0] - self.n_prediction_steps def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]]]: """ @@ -352,6 +352,8 @@ def __init__(self, **sequences_kwargs) sequence_datasets.append(sequence) idx_start_train = idx_end_train + + self.sequence_lengths[seq_idx] = len(sequence) else: for seq_idx, (seq_length_train, seq_length_test) in enumerate(zip(self.sequence_lengths, sequence_lengths_test)): idx_end_train = idx_start_train + seq_length_train @@ -366,6 +368,8 @@ def __init__(self, sequence_datasets.append(sequence) idx_start_train = idx_end_train + 
self.sequence_lengths[seq_idx] = len(sequence) + ConcatDataset.__init__(self, datasets=sequence_datasets) self.train_tensors = (X_train_flatten, Y_train_flatten) From 5c846c99237f89df4d2a67a49c39aa90bca38970 Mon Sep 17 00:00:00 2001 From: Difan Deng Date: Thu, 29 Jul 2021 10:59:36 +0200 Subject: [PATCH 044/347] fix bugs in dataloader and only allow regrssion task types --- autoPyTorch/datasets/time_series_dataset.py | 4 ++++ .../time_series_forecasting_data_loader.py | 18 ++++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index df31e7e95..b97ce923f 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -387,6 +387,9 @@ def __init__(self, if len(self.train_tensors) == 2 and self.train_tensors[1] is not None: self.output_type: str = type_of_target(self.train_tensors[1]) + if self.output_type in ["binary", "multiclass"]: + self.output_type = "continuous" + if STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS: self.output_shape = len(np.unique(self.train_tensors[1])) else: @@ -406,6 +409,7 @@ def __init__(self, self.splits = self.get_splits_from_resampling_strategy() + def __getitem__(self, idx, train=True): if idx < 0: if -idx > len(self): diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index b2870dd48..ac9ad439b 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -19,7 +19,7 @@ import warnings -from autoPyTorch.datasets.base_dataset import BaseDataset +from autoPyTorch.datasets.base_dataset import TransformSubset from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence from autoPyTorch.utils.common import custom_collate_fn from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader @@ -166,7 +166,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.train_data_loader = torch.utils.data.DataLoader( train_dataset, - batch_size=min(self.batch_size, len(train_dataset)), + batch_size=min(self.batch_size, len(sampler_indices_train)), shuffle=False, num_workers=X.get('num_workers', 0), pin_memory=X.get('pin_memory', True), @@ -184,7 +184,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: drop_last=X.get('drop_last', False), collate_fn=custom_collate_fn, ) - return self def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transforms.Compose: @@ -222,7 +221,9 @@ def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: Creates a data loader object from the provided data, applying the transformations meant to validation objects """ + # TODO any better way to deal with prediction data loader for multiple sequences X = X[-self.subseq_length - self.n_prediction_steps:] + if y is not None: y = y[-self.subseq_length - self.n_prediction_steps:] @@ -232,8 +233,13 @@ def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: train_transforms=self.test_transform, val_transforms=self.test_transform, ) + + test_seq_indices = np.arange(len(X))[self.subseq_length:] + + dataset_test = TransformSubset(dataset, 
indices=test_seq_indices, train=False) + return torch.utils.data.DataLoader( - dataset, + dataset_test, batch_size=min(batch_size, len(dataset)), shuffle=False, collate_fn=custom_collate_fn, @@ -287,12 +293,12 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, window_size = UniformIntegerHyperparameter("window_size", lower=1, upper=upper_window_size, - default_value=1) + default_value=(upper_window_size + 1)// 2) elif window_size[0][0] <= upper_window_size < window_size[0][1]: window_size = UniformIntegerHyperparameter("window_size", lower=window_size[0][0], upper=upper_window_size, - default_value=1) + default_value=(window_size[0][0] + upper_window_size) // 2) else: window_size = UniformIntegerHyperparameter("window_size", lower=window_size[0][0], From 6b592517a8398bd4673651d4e36a2c4fa06d9200 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 30 Jul 2021 19:10:40 +0200 Subject: [PATCH 045/347] maint --- .../data_loader/time_series_forecasting_data_loader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index b2870dd48..80668e939 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -118,8 +118,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: sample_interval = X.get('sample_interval', 1) self.sample_interval = sample_interval - self.window_size = 5 - self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 # Make sure there is an optimizer self.check_requirements(X, y) @@ -222,7 +220,9 @@ def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: Creates a data loader object from the provided data, applying the transformations meant to validation objects """ + # TODO any better way to deal with prediction data loader for multiple sequences X = X[-self.subseq_length - self.n_prediction_steps:] + if y is not None: y = y[-self.subseq_length - self.n_prediction_steps:] @@ -287,12 +287,12 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, window_size = UniformIntegerHyperparameter("window_size", lower=1, upper=upper_window_size, - default_value=1) + default_value=(upper_window_size + 1)// 2) elif window_size[0][0] <= upper_window_size < window_size[0][1]: window_size = UniformIntegerHyperparameter("window_size", lower=window_size[0][0], upper=upper_window_size, - default_value=1) + default_value=(window_size[0][0] + upper_window_size) // 2) else: window_size = UniformIntegerHyperparameter("window_size", lower=window_size[0][0], From b99fab15e6757edad882f0f873dd81cb505427d7 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 3 Aug 2021 12:13:32 +0200 Subject: [PATCH 046/347] only allow regression data type for forcasting --- autoPyTorch/api/time_series_regression.py | 2 +- ...time_series_forecasting_target_validator.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/autoPyTorch/api/time_series_regression.py b/autoPyTorch/api/time_series_regression.py index 8493cc4d7..70d3ac48e 100644 --- a/autoPyTorch/api/time_series_regression.py +++ b/autoPyTorch/api/time_series_regression.py @@ -96,7 +96,7 @@ def _get_required_dataset_properties(self, dataset: BaseDataset) -> 
Dict[str, An )) return dataset.get_required_dataset_info() - def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularRegressionPipeline: + def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TimeSeriesRegressionPipeline: return TimeSeriesRegressionPipeline(dataset_properties=dataset_properties) def search(self, diff --git a/autoPyTorch/data/time_series_forecasting_target_validator.py b/autoPyTorch/data/time_series_forecasting_target_validator.py index 9a2d25b7e..9469e0bf9 100644 --- a/autoPyTorch/data/time_series_forecasting_target_validator.py +++ b/autoPyTorch/data/time_series_forecasting_target_validator.py @@ -57,22 +57,30 @@ def fit( self.target_validators[seq_idx].fit(y_train_seq, y_test_seq) out_dimensionality[seq_idx] = self.target_validators[seq_idx].out_dimensionality - type_of_target[seq_idx] = self.target_validators[seq_idx].type_of_target + target_type = self.target_validators[seq_idx].type_of_target + if target_type in ['multiclass', "binary"]: + # for time series forecasting problems, we only support regression + type_of_target[seq_idx] = "continuous" + else: + type_of_target[seq_idx] = target_type else: for seq_idx, y_train_seq in enumerate(y_train): self.target_validators[seq_idx].fit(y_train_seq) out_dimensionality[seq_idx] = self.target_validators[seq_idx].out_dimensionality - type_of_target[seq_idx] = self.target_validators[seq_idx].type_of_target + target_type = self.target_validators[seq_idx].type_of_target + if target_type in ['multiclass', "binary"]: + # for time series forecasting problems, we only support regression + type_of_target[seq_idx] = "continuous" + else: + type_of_target[seq_idx] = target_type if not np.all(np.asarray(out_dimensionality) == out_dimensionality[0]): raise ValueError(f"All the sequence needs to have the same out_dimensionality!") - # TODO consider how to handle "continuous" and "multiple_classes" data type - """ if not np.all(np.asarray(type_of_target) == type_of_target[0]): raise ValueError(f"All the sequence needs to have the same type_of_target!") - """ + self.out_dimensionality = out_dimensionality[0] self.type_of_target = type_of_target[0] From 8cd46afb6bcd5ebd92d86be69a4770fa05ac15d3 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 17 Aug 2021 19:30:12 +0200 Subject: [PATCH 047/347] merge from main repo --- .github/workflows/long_regression_test.yml | 35 ++++++++++++++++ .github/workflows/scheduled_test.yml | 34 ++++++++++++++++ README.md | 46 +++++++++++++--------- requirements.txt | 40 ++++++++++--------- 4 files changed, 117 insertions(+), 38 deletions(-) create mode 100644 .github/workflows/long_regression_test.yml create mode 100644 .github/workflows/scheduled_test.yml diff --git a/.github/workflows/long_regression_test.yml b/.github/workflows/long_regression_test.yml new file mode 100644 index 000000000..135c45fb0 --- /dev/null +++ b/.github/workflows/long_regression_test.yml @@ -0,0 +1,35 @@ +name: Tests + +on: + schedule: + # Every Truesday at 7AM UTC + # TODO teporary set to every day just for the PR + #- cron: '0 07 * * 2' + - cron: '0 07 * * *' + + +jobs: + ubuntu: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.8] + fail-fast: false + + steps: + - uses: actions/checkout@v2 + with: + ref: development + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install test dependencies + run: | + git submodule update --init --recursive + python -m pip install --upgrade pip + pip install 
-e .[test] + - name: Run tests + run: | + python -m pytest --durations=200 cicd/test_preselected_configs.py -vs diff --git a/.github/workflows/scheduled_test.yml b/.github/workflows/scheduled_test.yml new file mode 100644 index 000000000..68f37d72d --- /dev/null +++ b/.github/workflows/scheduled_test.yml @@ -0,0 +1,34 @@ +name: Tests + +on: + schedule: + # Every Monday at 7AM UTC + - cron: '0 07 * * 1' + + +jobs: + ubuntu: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.8] + fail-fast: false + max-parallel: 2 + + steps: + - uses: actions/checkout@v2 + with: + ref: development + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install test dependencies + run: | + git submodule update --init --recursive + python -m pip install --upgrade pip + pip install -e .[test] + - name: Run tests + run: | + python -m pytest --forked --durations=20 --timeout=600 --timeout-method=signal -v test \ No newline at end of file diff --git a/README.md b/README.md index 380ce0c08..792408749 100755 --- a/README.md +++ b/README.md @@ -1,29 +1,29 @@ # Auto-PyTorch -Copyright (C) 2019 [AutoML Group Freiburg](http://www.automl.org/) +Copyright (C) 2021 [AutoML Groups Freiburg and Hannover](http://www.automl.org/) -This an alpha version of Auto-PyTorch with improved API. -So far, Auto-PyTorch supports tabular data (classification, regression). -We plan to enable image data and time-series data. +While early AutoML frameworks focused on optimizing traditional ML pipelines and their hyperparameters, another trend in AutoML is to focus on neural architecture search. To bring the best of these two worlds together, we developed **Auto-PyTorch**, which jointly and robustly optimizes the network architecture and the training hyperparameters to enable fully automated deep learning (AutoDL). +Auto-PyTorch is mainly developed to support tabular data (classification, regression), but can also be applied to image data (classification). +The newest features in Auto-PyTorch for tabular data are described in the paper ["Auto-PyTorch Tabular: Multi-Fidelity MetaLearning for Efficient and Robust AutoDL"](https://arxiv.org/abs/2006.13799) (see below for bibtex ref). -Find the documentation [here](https://automl.github.io/Auto-PyTorch/refactor_development) +## Alpha Status of Next Release +The upcoming release of Auto-PyTorch will further improve usability, robustness and efficiency by using SMAC as the underlying optimization package, changing the code structure and other improvements. If you would like to give it a try, check out the `development` branch or it's [documentation](https://automl.github.io/Auto-PyTorch/development/). ## Installation ### Pip -We recommend using Anaconda for developing as follows: - ```sh -# Following commands assume the user is in a cloned directory of Auto-Pytorch -conda create -n autopytorch python=3.8 -conda activate autopytorch -conda install gxx_linux-64 gcc_linux-64 swig -cat requirements.txt | xargs -n 1 -L 1 pip install -python setup.py install +$ cd install/path +$ git clone https://github.com/automl/Auto-PyTorch.git +$ cd Auto-PyTorch +``` +If you want to contribute to this repository switch to our current development branch +```sh +$ git checkout development ``` ## Contributing @@ -71,7 +71,19 @@ along with this program (see LICENSE file). 
## Reference + +```bibtex + @article{zimmer-tpami21a, + author = {Lucas Zimmer and Marius Lindauer and Frank Hutter}, + title = {Auto-PyTorch Tabular: Multi-Fidelity MetaLearning for Efficient and Robust AutoDL}, + journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + year = {2021}, + note = {IEEE early access; also available under https://arxiv.org/abs/2006.13799}, + pages = {1-12} +} ``` + +```bibtex @incollection{mendoza-automlbook18a, author = {Hector Mendoza and Aaron Klein and Matthias Feurer and Jost Tobias Springenberg and Matthias Urban and Michael Burkart and Max Dippel and Marius Lindauer and Frank Hutter}, title = {Towards Automatically-Tuned Deep Neural Networks}, @@ -81,14 +93,10 @@ along with this program (see LICENSE file). booktitle = {AutoML: Methods, Sytems, Challenges}, publisher = {Springer}, chapter = {7}, - pages = {141--156}, - note = {To appear.}, + pages = {141--156} } ``` -**Note**: Previously, the name of the project was AutoNet. Since this was too generic, we changed the name to AutoPyTorch. AutoNet 2.0 in the reference mention above is indeed AutoPyTorch. - - ## Contact -Auto-PyTorch is developed by the [AutoML Group of the University of Freiburg](http://www.automl.org/). +Auto-PyTorch is developed by the [AutoML Groups of the University of Freiburg and Hannover](http://www.automl.org/). diff --git a/requirements.txt b/requirements.txt index c79104461..c3bbc1b23 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,19 +1,21 @@ -pandas -torch -torchvision -tensorboard -scikit-learn>=0.24.0,<0.25.0 -numpy -scipy -lockfile -imgaug>=0.4.0 -ConfigSpace>=0.4.14,<0.5 -pynisher>=0.6.3 -pyrfr>=0.7,<0.9 -smac>=0.13.1,<0.14 -dask -distributed>=2.2.0 -catboost -lightgbm -flaky -tabulate +setuptools +Cython==0.29.21 +netifaces==0.10.9 +numpy==1.19.5 +pandas==1.2.0 +scipy==1.6.0 +statsmodels==0.12.1 +scikit-learn==0.23.0 +imbalanced-learn==0.7.0 +imblearn==0.0 +ConfigSpace==0.4.17 +pynisher==0.6.3 +hpbandster==0.7.4 +fasteners==0.16 +torch==1.7.1 +torchvision==0.8.2 +tensorboard-logger==0.1.0 +openml==0.11.0 +lightgbm==3.1.1 +catboost==0.24.4 +pexpect==4.8.0 From d1cbc2cefe5752313b3dc9e96fd846d17d119302 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 17 Aug 2021 19:33:26 +0200 Subject: [PATCH 048/347] fix datasetname issues in time series dataset --- autoPyTorch/datasets/time_series_dataset.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index b97ce923f..90e9d64fe 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -329,9 +329,10 @@ def __init__(self, start_idx = end_idx if dataset_name is None: - dataset_name_seqs = [None] * self.num_sequences + self.dataset_name = hash_array_or_matrix(X_train_flatten) else: - dataset_name_seqs = [f"{dataset_name}_sequence_{i}" for i in range(self.num_sequences)] + self.dataset_name = dataset_name + dataset_name_seqs = [f"{dataset_name}_sequence_{i}" for i in range(self.num_sequences)] # initialize datasets sequences_kwargs = {"resampling_strategy": resampling_strategy, From 30b883897077949765bcc3f67937b45f8274c384 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 18 Aug 2021 22:53:15 +0200 Subject: [PATCH 049/347] new tae allowing prediction on time series prediction tasks --- autoPyTorch/api/base_task.py | 6 +- autoPyTorch/api/time_series_forecasting.py | 1 + autoPyTorch/datasets/resampling_strategy.py | 11 +- 
autoPyTorch/datasets/time_series_dataset.py | 35 +- autoPyTorch/evaluation/tae.py | 5 +- ...time_series_forecasting_train_evaluator.py | 310 ++++++++++++++++++ autoPyTorch/evaluation/train_evaluator.py | 12 +- autoPyTorch/optimizer/smbo.py | 14 +- .../time_series_forecasting_data_loader.py | 47 ++- .../pipeline/time_series_forecasting.py | 22 ++ 10 files changed, 423 insertions(+), 40 deletions(-) create mode 100644 autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index cf5c5b464..e697a7d95 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -634,6 +634,7 @@ def _search( precision: int = 32, disable_file_output: List = [], load_models: bool = True, + time_series_prediction: bool = False ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -696,6 +697,8 @@ def _search( disable_file_output (Union[bool, List]): load_models (bool), (default=True): Whether to load the models after fitting AutoPyTorch. + time_series_prediction (bool): + if we want to do time series prediction tasks Returns: self @@ -851,7 +854,8 @@ def _search( ensemble_callback=proc_ensemble, logger_port=self._logger_port, start_num_run=num_run, - search_space_updates=self.search_space_updates + search_space_updates=self.search_space_updates, + time_series_prediction=time_series_prediction ) try: self.run_history, self.trajectory, budget_type = \ diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 27182e547..03ab359b9 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -239,4 +239,5 @@ def search( precision=precision, disable_file_output=disable_file_output, load_models=load_models, + time_series_prediction=True ) diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 27c7ac8fd..44715038c 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -152,8 +152,11 @@ def time_series_hold_out_validation(val_share: float, indices: np.ndarray, **kwa Returns: """ # TODO consider how we handle test size properly - test_size = int(val_share * len(indices)) - cv = TimeSeriesSplit(n_splits=2, test_size=test_size, gap=kwargs['n_prediction_steps']) + # Time Series prediction only requires on set of prediction for each + # This implement needs to be combined with time series forecasting dataloader, where each time an entire time series + # is used for prediction + test_size = kwargs['n_prediction_steps'] + cv = TimeSeriesSplit(n_splits=2, test_size=1, gap=kwargs['n_prediction_steps'] - 1) train, val = list(cv.split(indices))[-1] return train, val @@ -173,6 +176,8 @@ def time_series_cross_validation(num_splits: int, indices: np.ndarray, **kwargs: """ # TODO: we use gap=n_prediction_step here, we need to consider if we want to implement n_prediction_step here or # under DATALOADER!!! - cv = TimeSeriesSplit(n_splits=num_splits, gap=kwargs['n_prediction_steps']) + # TODO do we need cross valriadtion for time series datasets? 
+ test_size = kwargs['n_prediction_steps'] + cv = TimeSeriesSplit(n_splits=num_splits, test_size=1, gap=kwargs['n_prediction_steps'] - 1) splits = list(cv.split(indices)) return splits diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 90e9d64fe..e5d00efa1 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -58,6 +58,7 @@ def __init__(self, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, n_prediction_steps: int = 1, + do_split=True, ): """ A dataset representing a time series sequence. @@ -88,12 +89,14 @@ def __init__(self, self.rand = np.random.RandomState(seed=seed) self.shuffle = shuffle - self.resampling_strategy = resampling_strategy - self.resampling_strategy_args = resampling_strategy_args - # we only allow time series cross validation and holdout validation - self.cross_validators = get_cross_validators(CrossValTypes.time_series_cross_validation) - self.holdout_validators = get_holdout_validators(HoldoutValTypes.time_series_hold_out_validation) + if do_split: + self.resampling_strategy = resampling_strategy + self.resampling_strategy_args = resampling_strategy_args + + # we only allow time series cross validation and holdout validation + self.cross_validators = get_cross_validators(CrossValTypes.time_series_cross_validation) + self.holdout_validators = get_holdout_validators(HoldoutValTypes.time_series_hold_out_validation) # We also need to be able to transform the data, be it for pre-processing # or for augmentation @@ -129,14 +132,14 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]: Y = self.train_tensors[1] if Y is not None: # Y = Y[:index + self.n_prediction_steps] - Y = Y[index + self.n_prediction_steps] + Y = Y[index] else: Y = None return X, Y def __len__(self) -> int: - return self.train_tensors[0].shape[0] - self.n_prediction_steps + return self.train_tensors[0].shape[0] def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]]]: """ @@ -254,6 +257,7 @@ def __init__(self, val_transforms: Optional[torchvision.transforms.Compose] = None, validator: Optional[TimeSeriesForecastingInputValidator] = None, n_prediction_steps: int = 1, + shift_input_data: bool = True, ): """ :param target_variables: The indices of the variables you want to forecast @@ -261,6 +265,9 @@ def __init__(self, :param n_steps: The number of steps you want to forecast into the future :param train: Tuple with one tensor holding the training data :param val: Tuple with one tensor holding the validation data + :param shift_input_data: bool + if the input X and targets needs to be shifted to be aligned: + such that the data until X[t] is applied to predict the value y[t+n_prediction_steps] """ assert X is not Y, "Training and Test data needs to belong two different object!!!" 
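For context, a minimal sketch (not part of the patch): the resampling change above relies on `sklearn.model_selection.TimeSeriesSplit` with `test_size=1` and `gap=n_prediction_steps - 1`, so each hold-out split keeps a single validation anchor that lies `n_prediction_steps` ahead of the last usable training index. This assumes a scikit-learn version that supports the `test_size` and `gap` arguments (0.24+).

```python
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

n_prediction_steps = 3
indices = np.arange(10)  # one flattened sequence of length 10

# mirrors time_series_hold_out_validation after this change
cv = TimeSeriesSplit(n_splits=2, test_size=1, gap=n_prediction_steps - 1)
train, val = list(cv.split(indices))[-1]
print(train)  # [0 1 2 3 4 5 6] -> everything up to the last observable step
print(val)    # [9]             -> a single anchor, n_prediction_steps ahead of index 6
```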
self.n_prediction_steps = n_prediction_steps @@ -295,8 +302,14 @@ def __init__(self, self.num_sequences = len(X) self.sequence_lengths = [0] * self.num_sequences - for seq_idx in range(self.num_sequences): - self.sequence_lengths[seq_idx] = len(X[seq_idx]) + if shift_input_data: + for seq_idx in range(self.num_sequences): + X[seq_idx] = X[seq_idx][:-n_prediction_steps] + Y[seq_idx] = Y[seq_idx][n_prediction_steps:] + self.sequence_lengths[seq_idx] = len(X[seq_idx]) + else: + for seq_idx in range(self.num_sequences): + self.sequence_lengths[seq_idx] = len(X[seq_idx]) num_train_data = np.sum(self.sequence_lengths) X_train_flatten = np.empty([num_train_data, self.num_features]) @@ -343,6 +356,7 @@ def __init__(self, idx_start_train = 0 idx_start_test = 0 sequence_datasets = [] + if X_test is None or Y_test is None: for seq_idx, seq_length_train in enumerate(self.sequence_lengths): idx_end_train = idx_start_train + seq_length_train @@ -377,7 +391,7 @@ def __init__(self, if X_test is not None or Y_test is not None: self.test_tensors = (X_test_flatten, Y_test_flatten) else: - self.test_tensors = (None, None) + self.test_tensors = None self.val_tensors = None self.task_type: Optional[str] = None @@ -410,7 +424,6 @@ def __init__(self, self.splits = self.get_splits_from_resampling_strategy() - def __getitem__(self, idx, train=True): if idx < 0: if -idx > len(self): diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 2e5e90563..5a45e2893 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -112,10 +112,11 @@ def __init__( logger_port: int = None, all_supported_metrics: bool = True, pynisher_context: str = 'spawn', - search_space_updates: typing.Optional[HyperparameterSearchSpaceUpdates] = None + search_space_updates: typing.Optional[HyperparameterSearchSpaceUpdates] = None, + **eval_func_kwargs ): - eval_function = autoPyTorch.evaluation.train_evaluator.eval_function + eval_function = functools.partial(autoPyTorch.evaluation.train_evaluator.eval_function, **eval_func_kwargs) self.worst_possible_result = cost_for_crash diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py new file mode 100644 index 000000000..077785302 --- /dev/null +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -0,0 +1,310 @@ +from autoPyTorch.evaluation.train_evaluator import TrainEvaluator + +from multiprocessing.queues import Queue +from typing import Any, Dict, List, Optional, Tuple, Union, no_type_check, ClassVar + +import warnings + +from ConfigSpace.configuration_space import Configuration + +import numpy as np + +from sklearn.base import BaseEstimator + +from smac.tae import StatusType + +from autoPyTorch.constants import ( + CLASSIFICATION_TASKS, + MULTICLASSMULTIOUTPUT, +) +from autoPyTorch.evaluation.abstract_evaluator import ( + AbstractEvaluator, + fit_and_suppress_warnings +) +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.utils.backend import Backend +from autoPyTorch.utils.common import subsampler +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + +from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline + +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset + + +class TimeSeriesForecastingTrainEvaluator(TrainEvaluator): + def __init__(self, backend: Backend, queue: Queue, + 
metric: autoPyTorchMetric, + budget: float, + budget_type: str = None, + pipeline_config: Optional[Dict[str, Any]] = None, + configuration: Optional[Configuration] = None, + seed: int = 1, + output_y_hat_optimization: bool = True, + num_run: Optional[int] = None, + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + disable_file_output: Union[bool, List] = False, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + keep_models: Optional[bool] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None) -> None: + super(TimeSeriesForecastingTrainEvaluator, self).__init__( + backend=backend, + queue=queue, + configuration=configuration, + metric=metric, + seed=seed, + output_y_hat_optimization=output_y_hat_optimization, + num_run=num_run, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + keep_models=keep_models, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates + ) + self.pipeline_class = TimeSeriesForecastingPipeline + self.datamanager: TimeSeriesForecastingDataset + self.n_prediction_steps = self.datamanager.n_prediction_steps + self.num_sequences = self.datamanager.num_sequences + + self.splits = self.datamanager.splits + if self.splits is None: + raise AttributeError("Must have called create_splits on {}".format(self.datamanager.__class__.__name__)) + self.num_folds: int = len(self.splits) + self.Y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds + # TODO consider if we really need Y_train_targets + #self.Y_train_targets: np.ndarray = np.ones(self.y_train.shape) * np.NaN + self.pipelines: List[Optional[BaseEstimator]] = [None] * self.num_folds + self.indices: List[Optional[Tuple[Union[np.ndarray, List], Union[np.ndarray, List]]]] = [None] * self.num_folds + + self.logger.debug("Search space updates :{}".format(self.search_space_updates)) + self.keep_models = keep_models + + def fit_predict_and_loss(self) -> None: + """Fit, predict and compute the loss for cross-validation and + holdout""" + assert self.splits is not None, "Can't fit pipeline in {} is datamanager.splits is None" \ + .format(self.__class__.__name__) + additional_run_info: Optional[Dict] = None + if self.num_folds == 1: + split_id = 0 + self.logger.info("Starting fit {}".format(split_id)) + + pipeline = self._get_pipeline() + + train_split, test_split = self.splits[split_id] + self.Y_optimization = self.y_train[test_split] + self.Y_actual_train = self.y_train[train_split] + y_train_pred, y_opt_pred, y_valid_pred, y_test_pred = self._fit_and_predict(pipeline, split_id, + train_indices=train_split, + test_indices=test_split, + add_pipeline_to_self=True) + + #As each sequence contains one test split id, and the value to be predicted is the last n_prediction_steps + #we need to expand the current split. + + y_test_split = np.repeat(test_split, self.n_prediction_steps) - \ + np.tile(np.arange(self.n_prediction_steps), len(test_split)) + #train_loss = self._loss(self.y_train[train_split], y_train_pred) + # TODO do we really need train loss? + train_loss = 0. 
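For context, a minimal sketch (not part of the patch): the `np.repeat`/`np.tile` expression above expands each per-sequence test index into the `n_prediction_steps` target positions that the forecast covers; the values below are illustrative.

```python
import numpy as np

n_prediction_steps = 3
test_split = np.array([9, 19])  # one validation anchor per flattened sequence

y_test_split = np.repeat(test_split, n_prediction_steps) - \
    np.tile(np.arange(n_prediction_steps), len(test_split))
print(y_test_split)  # [ 9  8  7 19 18 17] -> the last n_prediction_steps targets of each sequence
```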
+ loss = self._loss(self.y_train[y_test_split], y_opt_pred) + + additional_run_info = pipeline.get_additional_run_info() if hasattr( + pipeline, 'get_additional_run_info') else {} + + status = StatusType.SUCCESS + + self.finish_up( + loss=loss, + train_loss=train_loss, + opt_pred=y_opt_pred, + valid_pred=y_valid_pred, + test_pred=y_test_pred, + additional_run_info=additional_run_info, + file_output=True, + status=status, + ) + + else: + Y_train_pred: List[Optional[np.ndarray]] = [None] * self.num_folds + Y_optimization_pred: List[Optional[np.ndarray]] = [None] * self.num_folds + Y_valid_pred: List[Optional[np.ndarray]] = [None] * self.num_folds + Y_test_pred: List[Optional[np.ndarray]] = [None] * self.num_folds + train_splits: List[Optional[Union[np.ndarray, List]]] = [None] * self.num_folds + + self.pipelines = [self._get_pipeline() for _ in range(self.num_folds)] + + # stores train loss of each fold. + train_losses = [np.NaN] * self.num_folds + # used as weights when averaging train losses. + train_fold_weights = [np.NaN] * self.num_folds + # stores opt (validation) loss of each fold. + opt_losses = [np.NaN] * self.num_folds + # weights for opt_losses. + opt_fold_weights = [np.NaN] * self.num_folds + + for i, (train_split, test_split) in enumerate(self.splits): + + pipeline = self.pipelines[i] + train_pred, opt_pred, valid_pred, test_pred = self._fit_and_predict(pipeline, i, + train_indices=train_split, + test_indices=test_split, + add_pipeline_to_self=False) + #Y_train_pred[i] = train_pred + Y_optimization_pred[i] = opt_pred + Y_valid_pred[i] = valid_pred + Y_test_pred[i] = test_pred + train_splits[i] = train_split + + self.Y_train_targets[train_split] = self.y_train[train_split] + + y_test_split = np.repeat(test_split, self.n_prediction_steps) - \ + np.tile(np.arange(self.n_prediction_steps), len(test_split)) + + self.Y_targets[i] = self.y_train[y_test_split] + # Compute train loss of this fold and store it. train_loss could + # either be a scalar or a dict of scalars with metrics as keys. + #train_loss = self._loss( + # self.Y_train_targets[train_split], + # train_pred, + #) + train_loss = 0. + train_losses[i] = train_loss + # number of training data points for this fold. Used for weighting + # the average. + train_fold_weights[i] = len(train_split) + + + # Compute validation loss of this fold and store it. + optimization_loss = self._loss( + self.Y_targets[i], + opt_pred, + ) + opt_losses[i] = optimization_loss + # number of optimization data points for this fold. + # Used for weighting the average. + opt_fold_weights[i] = len(train_split) + + # Compute weights of each fold based on the number of samples in each + # fold. + train_fold_weights = [w / sum(train_fold_weights) + for w in train_fold_weights] + opt_fold_weights = [w / sum(opt_fold_weights) + for w in opt_fold_weights] + + # train_losses is a list of dicts. It is + # computed using the target metric (self.metric). 
+ train_loss = np.average([train_losses[i][str(self.metric)] + for i in range(self.num_folds)], + weights=train_fold_weights, + ) + + opt_loss = {} + # self.logger.debug("OPT LOSSES: {}".format(opt_losses if opt_losses is not None else None)) + for metric in opt_losses[0].keys(): + opt_loss[metric] = np.average( + [ + opt_losses[i][metric] + for i in range(self.num_folds) + ], + weights=opt_fold_weights, + ) + Y_targets = self.Y_targets + Y_train_targets = self.Y_train_targets + + Y_optimization_preds = np.concatenate( + [Y_optimization_pred[i] for i in range(self.num_folds) + if Y_optimization_pred[i] is not None]) + Y_targets = np.concatenate([ + Y_targets[i] for i in range(self.num_folds) + if Y_targets[i] is not None + ]) + + if self.X_valid is not None: + Y_valid_preds = np.array([Y_valid_pred[i] + for i in range(self.num_folds) + if Y_valid_pred[i] is not None]) + # Average the predictions of several pipelines + if len(Y_valid_preds.shape) == 3: + Y_valid_preds = np.nanmean(Y_valid_preds, axis=0) + else: + Y_valid_preds = None + + if self.X_test is not None: + Y_test_preds = np.array([Y_test_pred[i] + for i in range(self.num_folds) + if Y_test_pred[i] is not None]) + # Average the predictions of several pipelines + if len(Y_test_preds.shape) == 3: + Y_test_preds = np.nanmean(Y_test_preds, axis=0) + else: + Y_test_preds = None + + self.Y_optimization = Y_targets + self.Y_actual_train = Y_train_targets + + self.pipeline = self._get_pipeline() + + status = StatusType.SUCCESS + self.logger.debug("In train evaluator fit_predict_and_loss, loss:{}".format(opt_loss)) + self.finish_up( + loss=opt_loss, + train_loss=train_loss, + opt_pred=Y_optimization_preds, + valid_pred=Y_valid_preds, + test_pred=Y_test_preds, + additional_run_info=additional_run_info, + file_output=True, + status=status, + ) + + def _predict(self, pipeline: BaseEstimator, + test_indices: Union[np.ndarray, List], + train_indices: Union[np.ndarray, List] + ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: + datamanager = self.datamanager + y_pred = np.ones([len(test_indices), self.n_prediction_steps]) + for seq_idx, test_idx in enumerate(test_indices): + y_pred[seq_idx] = self.predict_function(self.datamanager[test_idx][0], pipeline) + + #train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, + # self.y_train[train_indices]) + opt_pred = y_pred.flatten() + + #TODO we consider X_valid and X_test as a multiple sequences??? 
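For context, a minimal sketch (not part of the patch): the cross-validation branch above aggregates the per-fold losses with `np.average`, weighting each fold by its number of training points. All names and numbers below are illustrative.

```python
import numpy as np

opt_losses = [{"mean_squared_error": 0.30}, {"mean_squared_error": 0.10}]  # one dict per fold
opt_fold_weights = [10, 30]  # len(train_split) of each fold

opt_fold_weights = [w / sum(opt_fold_weights) for w in opt_fold_weights]
opt_loss = {
    metric: np.average([fold[metric] for fold in opt_losses], weights=opt_fold_weights)
    for metric in opt_losses[0]
}
print(opt_loss)  # {'mean_squared_error': 0.15}
```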
+ if self.X_valid is not None: + valid_pred = np.ones([len(test_indices), self.n_prediction_steps]) + for seq_idx, val_seq in enumerate(self.datamanager.datasets): + valid_pred[seq_idx] = self.predict_function(val_seq.val_tensors[0], pipeline).flatten() + + valid_pred = valid_pred.flatten() + + else: + valid_pred = None + + if self.X_test is not None: + test_pred = np.ones([len(test_indices), self.n_prediction_steps]) + for seq_idx, test_seq in enumerate(self.datamanager.datasets): + test_pred[seq_idx] = self.predict_function(val_seq.test_seq[0], pipeline) + + test_pred = test_pred.flatten() + else: + test_pred = None + + return np.empty(1), opt_pred, valid_pred, test_pred + + + + + + + + diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index 0945ff9d6..bcf1d688f 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -1,5 +1,7 @@ from multiprocessing.queues import Queue -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union, no_type_check, ClassVar + +import warnings from ConfigSpace.configuration_space import Configuration @@ -22,6 +24,7 @@ from autoPyTorch.utils.common import subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + __all__ = ['TrainEvaluator', 'eval_function'] @@ -290,7 +293,8 @@ def _predict(self, pipeline: BaseEstimator, self.y_train[train_indices]) opt_pred = self.predict_function(subsampler(self.X_train, test_indices), pipeline, - self.y_train[train_indices]) + self.y_train[train_indices]) + if self.X_valid is not None: valid_pred = self.predict_function(self.X_valid, pipeline, @@ -306,7 +310,6 @@ def _predict(self, pipeline: BaseEstimator, return train_pred, opt_pred, valid_pred, test_pred - # create closure for evaluating an algorithm def eval_function( backend: Backend, @@ -327,8 +330,9 @@ def eval_function( all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, instance: str = None, + evaluator_class: ClassVar[AbstractEvaluator] = TrainEvaluator, ) -> None: - evaluator = TrainEvaluator( + evaluator = evaluator_class( backend=backend, queue=queue, metric=metric, diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index f4d6a06ec..8a2699ff8 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -24,6 +24,7 @@ ) from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash +from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.backend import Backend from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -78,7 +79,7 @@ def get_smac_object( class AutoMLSMBO(object): - def __init__(self, + def __init__(self, config_space: ConfigSpace.ConfigurationSpace, dataset_name: str, backend: Backend, @@ -102,7 +103,8 @@ def __init__(self, all_supported_metrics: bool = True, ensemble_callback: typing.Optional[EnsembleBuilderManager] = None, logger_port: typing.Optional[int] = None, - search_space_updates: typing.Optional[HyperparameterSearchSpaceUpdates] = None + search_space_updates: typing.Optional[HyperparameterSearchSpaceUpdates] = None, + time_series_prediction: 
bool = False ): """ Interface to SMAC. This method calls the SMAC optimize method, and allows @@ -151,6 +153,9 @@ def __init__(self, Allows to create a user specified SMAC object ensemble_callback (typing.Optional[EnsembleBuilderManager]): A callback used in this scenario to start ensemble building subtasks + time_series_prediction (bool): + If we want to apply this optimizer to optimize time series prediction tasks (which has a different + tae) """ super(AutoMLSMBO, self).__init__() @@ -194,6 +199,8 @@ def __init__(self, self.search_space_updates = search_space_updates + self.time_series_prediction = time_series_prediction + dataset_name_ = "" if dataset_name is None else dataset_name if logger_port is None: self.logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT @@ -256,6 +263,9 @@ def run_smbo(self, func: typing.Optional[typing.Callable] = None pipeline_config=self.pipeline_config, search_space_updates=self.search_space_updates ) + + if self.time_series_prediction: + ta_kwargs["evaluator_class"] = TimeSeriesForecastingTrainEvaluator ta = ExecuteTaFuncWithQueue self.logger.info("Created TA") diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 90b4cf79a..5a6580516 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, Optional, Tuple, Union from torch.utils.data.sampler import SubsetRandomSampler @@ -157,10 +157,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: valid_indices = np.hstack([valid_idx for valid_idx in valid_indices]) _, sampler_indices_train, _ = np.intersect1d(train_split, valid_indices, return_indices=True) + # test_indices not required as testsets usually lies on the trail of hte sequence #_, sampler_indices_test, _ = np.intersect1d(test_split, valid_indices) - sampler_train = SubsetRandomSampler(indices=sampler_indices_train) + self.sampler_train = SubsetRandomSampler(indices=sampler_indices_train) self.train_data_loader = torch.utils.data.DataLoader( train_dataset, @@ -170,7 +171,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: pin_memory=X.get('pin_memory', True), drop_last=X.get('drop_last', True), collate_fn=custom_collate_fn, - sampler=sampler_train, + sampler=self.sampler_train, ) self.val_data_loader = torch.utils.data.DataLoader( @@ -213,26 +214,38 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform return torchvision.transforms.Compose(candidate_transformations) - def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: int = np.inf, + def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.ndarray] = None, batch_size: int = np.inf, ) -> torch.utils.data.DataLoader: """ Creates a data loader object from the provided data, applying the transformations meant to validation objects """ # TODO any better way to deal with prediction data loader for multiple sequences - X = X[-self.subseq_length - self.n_prediction_steps:] - - if y is not None: - y = y[-self.subseq_length - self.n_prediction_steps:] - - dataset = TimeSeriesSequence( - X=X, Y=y, - # This dataset is used for loading test data in a batched format - 
train_transforms=self.test_transform, - val_transforms=self.test_transform, - ) - - test_seq_indices = np.arange(len(X))[self.subseq_length:] + if isinstance(X, np.ndarray): + X = X[-self.subseq_length - self.n_prediction_steps + 1:] + + if y is not None: + # we want to make sure that X, and y can be mapped one 1 one (as sampling y requires a shifted value) + y = y[-self.subseq_length - self.n_prediction_steps + 1:] + + dataset = TimeSeriesSequence( + X=X, Y=y, + # This dataset is used for loading test data in a batched format + train_transforms=self.test_transform, + val_transforms=self.test_transform, + do_split=False, + ) + + elif isinstance(X, TimeSeriesSequence): + dataset = X + dataset.update_transform(self.test_transform, train=False) + else: + raise ValueError(f"Unsupported type of input X: {type(X)}") + if self.n_prediction_steps == 1: + # test_seq_indices only indicates where to truncate the current + test_seq_indices = [len(dataset) - 1] + else: + test_seq_indices = np.arange(len(dataset))[-self.n_prediction_steps:] dataset_test = TransformSubset(dataset, indices=test_seq_indices, train=False) diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 57691e1e0..9541d0acf 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -223,3 +223,25 @@ def _get_estimator_hyperparameter_name(self) -> str: str: name of the pipeline type """ return "time_series_predictor" + + + def predict(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: + """Predict the output using the selected model. + + Args: + X (np.ndarray): input data to the array + batch_size (Optional[int]): batch_size controls whether the pipeline will be + called on small chunks of the data. Useful when calling the + predict method on the whole array X results in a MemoryError. + + Returns: + np.ndarray: the predicted values given input X + """ + + # Pre-process X + if batch_size is None: + warnings.warn("Batch size not provided. 
" + "Will predict on the whole data in a single iteration") + batch_size = X.shape[0] + loader = self.named_steps['data_loader'].get_loader(X=X, batch_size=batch_size) + return self.named_steps['network'].predict(loader) From fefd5992b1be059edfeafff543aa7b4ec1d791d3 Mon Sep 17 00:00:00 2001 From: Difan Deng Date: Thu, 19 Aug 2021 15:46:00 +0200 Subject: [PATCH 050/347] correctly computing validation loss --- autoPyTorch/api/time_series_forecasting.py | 12 ++++++++++++ autoPyTorch/evaluation/tae.py | 3 ++- .../time_series_forecasting_train_evaluator.py | 18 +++++++++++------- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 03ab359b9..3d058cf98 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -241,3 +241,15 @@ def search( load_models=load_models, time_series_prediction=True ) + + def predict( + self, + X_test: List[np.ndarray], + batch_size: Optional[int] = None, + n_jobs: int = 1 + ) -> np.ndarray: + y_pred = np.ones([len(X_test), self.dataset.n_prediction_steps]) + for seq_idx, seq in enumerate(X_test): + y_pred[seq_idx] = super(TimeSeriesForecastingTask, self).predict(seq, batch_size, n_jobs).flatten() + return y_pred + diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 5a45e2893..620cb4961 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -270,7 +270,8 @@ def run( logger=logger, # Pynisher expects seconds as a time indicator wall_time_in_s=int(cutoff) if cutoff is not None else None, - mem_in_mb=self.memory_limit, + # TODO Figure out how pynisher influences GPU memory usage here + #mem_in_mb=self.memory_limit, capture_output=True, context=context, ) diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 077785302..9a6cdf3b0 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -100,8 +100,14 @@ def fit_predict_and_loss(self) -> None: pipeline = self._get_pipeline() train_split, test_split = self.splits[split_id] - self.Y_optimization = self.y_train[test_split] - self.Y_actual_train = self.y_train[train_split] + + y_optimization = np.ones([len(test_split), self.n_prediction_steps]) + + y_test_split = np.repeat(test_split, self.n_prediction_steps) - \ + np.tile(np.arange(self.n_prediction_steps), len(test_split)) + + self.Y_optimization = self.y_train[y_test_split] + #self.Y_actual_train = self.y_train[train_split] y_train_pred, y_opt_pred, y_valid_pred, y_test_pred = self._fit_and_predict(pipeline, split_id, train_indices=train_split, test_indices=test_split, @@ -110,11 +116,9 @@ def fit_predict_and_loss(self) -> None: #As each sequence contains one test split id, and the value to be predicted is the last n_prediction_steps #we need to expand the current split. - y_test_split = np.repeat(test_split, self.n_prediction_steps) - \ - np.tile(np.arange(self.n_prediction_steps), len(test_split)) #train_loss = self._loss(self.y_train[train_split], y_train_pred) # TODO do we really need train loss? - train_loss = 0. 
+ train_loss = None loss = self._loss(self.y_train[y_test_split], y_opt_pred) additional_run_info = pipeline.get_additional_run_info() if hasattr( @@ -164,7 +168,7 @@ def fit_predict_and_loss(self) -> None: Y_test_pred[i] = test_pred train_splits[i] = train_split - self.Y_train_targets[train_split] = self.y_train[train_split] + #self.Y_train_targets[train_split] = self.y_train[train_split] y_test_split = np.repeat(test_split, self.n_prediction_steps) - \ np.tile(np.arange(self.n_prediction_steps), len(test_split)) @@ -273,7 +277,7 @@ def _predict(self, pipeline: BaseEstimator, datamanager = self.datamanager y_pred = np.ones([len(test_indices), self.n_prediction_steps]) for seq_idx, test_idx in enumerate(test_indices): - y_pred[seq_idx] = self.predict_function(self.datamanager[test_idx][0], pipeline) + y_pred[seq_idx] = self.predict_function(self.datamanager[test_idx][0], pipeline).flatten() #train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, # self.y_train[train_indices]) From efb71cadc517ef3664cb29c066ffe6b824bfbd79 Mon Sep 17 00:00:00 2001 From: Difan Deng Date: Thu, 19 Aug 2021 17:57:05 +0200 Subject: [PATCH 051/347] normalize y in dataset --- autoPyTorch/api/time_series_forecasting.py | 6 +++++- autoPyTorch/datasets/time_series_dataset.py | 15 +++++++++++++++ .../time_series_forecasting_data_loader.py | 2 +- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 3d058cf98..c15763fc7 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -249,7 +249,11 @@ def predict( n_jobs: int = 1 ) -> np.ndarray: y_pred = np.ones([len(X_test), self.dataset.n_prediction_steps]) + y_train_mean = self.dataset.y_train_mean + y_train_std = self.dataset.y_train_std for seq_idx, seq in enumerate(X_test): - y_pred[seq_idx] = super(TimeSeriesForecastingTask, self).predict(seq, batch_size, n_jobs).flatten() + seq_pred = super(TimeSeriesForecastingTask, self).predict(seq, batch_size, n_jobs).flatten() + seq_pred = seq_pred * y_train_std + y_train_mean + y_pred[seq_idx] = seq_pred return y_pred diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index e5d00efa1..d4269f5c2 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -258,6 +258,7 @@ def __init__(self, validator: Optional[TimeSeriesForecastingInputValidator] = None, n_prediction_steps: int = 1, shift_input_data: bool = True, + normalize_y: bool = True, ): """ :param target_variables: The indices of the variables you want to forecast @@ -268,6 +269,8 @@ def __init__(self, :param shift_input_data: bool if the input X and targets needs to be shifted to be aligned: such that the data until X[t] is applied to predict the value y[t+n_prediction_steps] + :param normalize_y: bool + if y values needs to be normalized with mean 0 and variance 1 """ assert X is not Y, "Training and Test data needs to belong two different object!!!" 
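For context, a minimal sketch (not part of the patch): with `normalize_y=True` the dataset stores standardised targets, and the API's `predict()` undoes the scaling before returning forecasts. The round trip, assuming a single global mean/std over the flattened training targets:

```python
import numpy as np

y_train = np.array([10.0, 12.0, 14.0, 16.0])
y_mean, y_std = y_train.mean(), y_train.std()

y_scaled = (y_train - y_mean) / y_std   # what the dataset keeps when normalize_y=True
y_restored = y_scaled * y_std + y_mean  # what predict() applies to the network output
assert np.allclose(y_restored, y_train)
```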
self.n_prediction_steps = n_prediction_steps @@ -347,6 +350,16 @@ def __init__(self, self.dataset_name = dataset_name dataset_name_seqs = [f"{dataset_name}_sequence_{i}" for i in range(self.num_sequences)] + if normalize_y: + self.y_train_mean = np.mean(Y_train_flatten) + self.y_train_std = np.std(Y_train_flatten) + Y_train_flatten = (Y_train_flatten - self.y_train_mean) / self.y_train_std + if Y_test is not None: + Y_test_flatten = (Y_test_flatten - self.y_train_mean) / self.y_train_std + else: + self.y_train_mean = 0 + self.y_train_std = 1 + # initialize datasets sequences_kwargs = {"resampling_strategy": resampling_strategy, "resampling_strategy_args": resampling_strategy_args, @@ -357,6 +370,7 @@ def __init__(self, idx_start_test = 0 sequence_datasets = [] + if X_test is None or Y_test is None: for seq_idx, seq_length_train in enumerate(self.sequence_lengths): idx_end_train = idx_start_train + seq_length_train @@ -424,6 +438,7 @@ def __init__(self, self.splits = self.get_splits_from_resampling_strategy() + def __getitem__(self, idx, train=True): if idx < 0: if -idx > len(self): diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 5a6580516..30a1ef51c 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -97,7 +97,7 @@ def __init__(self, """ super().__init__(batch_size=batch_size) self.window_size: int = window_size - self.upper_seuqnce_length = upper_sequence_length + self.upper_sequence_length = upper_sequence_length self.n_prediction_steps = n_prediction_steps self.sample_interval = 1 # length of the tail, for instance if a sequence_length = 2, sample_interval =2, n_prediction = 2, From 5481e8c2910b34d455e535eca1c742afd7f09f46 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 20 Aug 2021 19:13:37 +0200 Subject: [PATCH 052/347] make window_size compatible with sample_interval --- autoPyTorch/datasets/time_series_dataset.py | 4 ++- .../time_series_forecasting_data_loader.py | 31 ++++++++++++------- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index d4269f5c2..ec773c293 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -401,6 +401,8 @@ def __init__(self, ConcatDataset.__init__(self, datasets=sequence_datasets) + self.seq_length_min = np.min(self.sequence_lengths) + self.train_tensors = (X_train_flatten, Y_train_flatten) if X_test is not None or Y_test is not None: self.test_tensors = (X_test_flatten, Y_test_flatten) @@ -411,7 +413,7 @@ def __init__(self, self.task_type: Optional[str] = None self.issparse: bool = issparse(self.train_tensors[0]) # TODO find a way to edit input shape! 
- self.input_shape: Tuple[int] = (np.min(self.sequence_lengths),self.num_features) + self.input_shape: Tuple[int] = (self.seq_length_min, self.num_features) if len(self.train_tensors) == 2 and self.train_tensors[1] is not None: self.output_type: str = type_of_target(self.train_tensors[1]) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 30a1ef51c..0742bb388 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -53,7 +53,7 @@ class SequenceBuilder(object): window_size : int, default=1 sliding window size """ - def __init__(self, sample_interval: int = 1, window_size: int = 1): + def __init__(self, sample_interval: int = 1, window_size: int = 1, subseq_length=1): """ initialization Args: @@ -62,14 +62,13 @@ def __init__(self, sample_interval: int = 1, window_size: int = 1): """ self.sample_interval = sample_interval self.window_size = window_size + # assuming that subseq_length is 10, e.g., we can only start from -10. sample_interval = -4 + # we will sample the following indices: [-9,-5,-1] + self.first_indices = -(self.sample_interval * ((subseq_length - 1) // self.sample_interval) + 1) def __call__(self, data: np.ndarray) -> np.ndarray: - sample_indices = np.arange(-1 - self.sample_interval * (self.window_size - 1), 0, step=self.sample_interval) - try: - return data[sample_indices] - except IndexError: - raise ValueError("the length of the time sequence token is larger than the possible, please check the " - "sampler setting or reduce the window size") + sample_indices = np.arange(self.first_indices, 0, step=self.sample_interval) + return data[sample_indices] class TimeSeriesForecastingDataLoader(FeatureDataLoader): @@ -102,7 +101,8 @@ def __init__(self, self.sample_interval = 1 # length of the tail, for instance if a sequence_length = 2, sample_interval =2, n_prediction = 2, # the time sequence should look like: [X, y, X, y, y] [test_data](values in tail is marked with X) - self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 + #self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 + self.subseq_length = self.window_size def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: """ @@ -118,12 +118,19 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: sample_interval = X.get('sample_interval', 1) self.sample_interval = sample_interval - self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 + # self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 + # we want models with different sample_interval to have similar length scale + self.subseq_length = self.window_size + # Make sure there is an optimizer self.check_requirements(X, y) # Incorporate the transform to the dataset - datamanager = X['backend'].load_datamanager() # type: TimeSeriesForecastingDataset + datamanager = X['backend'].load_datamanager() # type: TimeSeriesForcecastingDataset + assert self.subseq_length < datamanager.seq_length_min, "dataloader's window size must be smaller than the" \ + "minimal sequence length of the dataset!!" 
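For context, a minimal sketch (not part of the patch): `SequenceBuilder.first_indices` above is chosen so that the backwards sampling always includes the most recent time step (index -1) and never reaches further back than `subseq_length`:

```python
import numpy as np

sample_interval, subseq_length = 4, 10
first_index = -(sample_interval * ((subseq_length - 1) // sample_interval) + 1)
sample_indices = np.arange(first_index, 0, step=sample_interval)
print(sample_indices)  # [-9 -5 -1] -> ends at the newest observation, stays within the last 10 steps

data = np.arange(100).reshape(-1, 1)  # toy sequence: 100 time steps, 1 feature
window = data[sample_indices]         # the down-sampled lookback window handed to the network
```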
+ # TODO, consider bucket setting + self.train_transform = self.build_transform(X, mode='train') self.val_transform = self.build_transform(X, mode='val') self.test_transform = self.build_transform(X, mode='test') @@ -146,6 +153,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: train_split, test_split = datamanager.splits[X['split_id']] valid_indices = [] idx_start = 0 + # to allow a time sequence data with resolution self.sample_interval and windows size with self.window_size # we need to drop the first part of each sequence for seq_idx, seq_length in enumerate(datamanager.sequence_lengths): @@ -203,7 +211,8 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform candidate_transformations = [] # type: List[Callable] candidate_transformations.append((SequenceBuilder(sample_interval=self.sample_interval, - window_size=self.window_size))) + window_size=self.window_size, + subseq_length=self.subseq_length))) candidate_transformations.append((ExpandTransformTimeSeries())) if 'test' in mode or not X['dataset_properties']['is_small_preprocess']: From b585496c319b389931555fd60ec5da836e99fdd1 Mon Sep 17 00:00:00 2001 From: Difan Deng Date: Wed, 1 Sep 2021 16:16:10 +0200 Subject: [PATCH 053/347] rename budget type dataset_size to resolution --- autoPyTorch/api/time_series_forecasting.py | 17 ++++++++++++----- autoPyTorch/evaluation/abstract_evaluator.py | 2 +- autoPyTorch/evaluation/tae.py | 4 ++-- autoPyTorch/optimizer/smbo.py | 6 +++--- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index c15763fc7..4a33b2f8a 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -88,9 +88,9 @@ def __init__( task_type=TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING], ) # here fraction of subset could be number of images, tabular data or resolution of time-series datasets. - #TODO if budget type dataset_size is applied to all datasets, we will put it to configs - self.pipeline_options.update({"min_fraction_subset": 0.1, - "fraction_subset": 1.0}) + #TODO if budget type resolution is applied to all datasets, we will put it to configs + self.pipeline_options.update({"min_resolution": 0.1, + "full_resolution": 1.0}) def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]: if not isinstance(dataset, TimeSeriesForecastingDataset): @@ -124,6 +124,8 @@ def search( precision: int = 32, disable_file_output: List = [], load_models: bool = True, + shift_input_data: bool = True, + normalize_y: bool = True, ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -143,7 +145,7 @@ def search( evaluate a pipeline. budget_type (Optional[str]): Type of budget to be used when fitting the pipeline. - Either 'epochs' or 'runtime' or 'dataset_size'. If not provided, uses + Either 'epochs' or 'runtime' or 'resolution'. If not provided, uses the default in the pipeline config ('epochs') budget (Optional[float]): Budget to fit a single run of the pipeline. If not @@ -187,7 +189,10 @@ def search( disable_file_output (Union[bool, List]): load_models (bool), (default=True): Whether to load the models after fitting AutoPyTorch. 
- + shift_input_data: bool + if the input data needs to be shifted + normalize_y: bool + if the input y values need to be normalized Returns: self @@ -217,6 +222,8 @@ def search( resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, n_prediction_steps=n_prediction_steps, + shift_input_data=shift_input_data, + normalize_y=normalize_y, ) if traditional_per_total_budget > 0.: diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 56124c3a0..ed8f42eef 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -323,7 +323,7 @@ def __init__(self, backend: Backend, # If the budget is epochs, we want to limit that in the fit dictionary if self.budget_type == 'epochs': self.fit_dictionary['epochs'] = budget - if self.budget_type == 'dataset_size': + if self.budget_type == 'resolution': if self.task_type in TIMESERIES_TASKS: self.fit_dictionary['sample_interval'] = int(np.ceil(1.0 / budget)) diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 620cb4961..3dcc50dbb 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -205,7 +205,7 @@ def run_wrapper( elif run_info.budget <= 0 or run_info.budget > 100: raise ValueError('Illegal value for budget, must be >0 and <=100, but is %f' % run_info.budget) - elif self.budget_type == 'dataset_size': + elif self.budget_type == 'resolution': if run_info.budget == 0: run_info = run_info._replace(budget=1.0) elif run_info.budget <= 0 or run_info.budget > 1.: @@ -213,7 +213,7 @@ def run_wrapper( run_info.budget) else: raise ValueError("Illegal value for budget type, must be one of " - "('epochs', 'runtime'), but is : %s" % + "('epochs', 'runtime', 'resolution'), but is : %s" % self.budget_type) remaining_time = self.stats.get_remaing_time_budget() diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 8a2699ff8..8815e72fc 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -315,9 +315,9 @@ def run_smbo(self, func: typing.Optional[typing.Callable] = None if budget_type == 'epochs': initial_budget = self.pipeline_config['min_epochs'] max_budget = self.pipeline_config['epochs'] - elif budget_type == 'dataset_size': - initial_budget = self.pipeline_config.get('min_fraction_subset', 0.1) - max_budget = self.pipeline_config.get('fraction_subset', 1.0) + elif budget_type == 'resolution': + initial_budget = self.pipeline_config.get('min_resolution', 0.1) + max_budget = self.pipeline_config.get('full_resolution', 1.0) else: raise ValueError("Illegal value for budget type, must be one of " "('epochs', 'runtime'), but is : %s" % From 39ab4ac14a607d2b6d3b624abd6f1f15978e28fe Mon Sep 17 00:00:00 2001 From: Difan Deng Date: Thu, 2 Sep 2021 19:29:47 +0200 Subject: [PATCH 054/347] window_size changes with freq --- autoPyTorch/api/time_series_forecasting.py | 45 ++++++++++++++++++--- autoPyTorch/datasets/time_series_dataset.py | 36 ++++++++++++++--- 2 files changed, 69 insertions(+), 12 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 4a33b2f8a..3e7971b75 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -17,12 +17,11 @@ CrossValTypes, HoldoutValTypes, ) -from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset +from autoPyTorch.datasets.time_series_dataset import 
TimeSeriesForecastingDataset, MAX_WIDNOW_SIZE_BASE from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline from autoPyTorch.utils.backend import Backend from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates - class TimeSeriesForecastingTask(BaseTask): """ Time Series Forcasting API to the pipelines. @@ -92,6 +91,13 @@ def __init__( self.pipeline_options.update({"min_resolution": 0.1, "full_resolution": 1.0}) + self.customized_window_size = False + if self.search_space_updates is not None: + for update in self.search_space_updates.updates: + # user has already specified a window_size range + if update.node_name == 'data_loader' and update.hyperparameter == 'window_size': + self.customized_window_size = True + def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]: if not isinstance(dataset, TimeSeriesForecastingDataset): raise ValueError("Dataset is incompatible for the given task,: {}".format( @@ -110,7 +116,8 @@ def search( X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, #target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, - n_prediction_steps: int= 1, + n_prediction_steps: int = 1, + freq: Optional[Union[str, int, List[int]]] = None, dataset_name: Optional[str] = None, budget_type: Optional[str] = None, budget: Optional[float] = None, @@ -138,9 +145,13 @@ def search( A pair of features (X_train) and targets (y_train) used to fit a pipeline. Additionally, a holdout of this pairs (X_test, y_test) can be provided to track the generalization performance of each stage. - target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] - indices indicating which variables need to be predicted, if X is given, either 'target_variables' - and 'y' needs to be given + n_prediction_steps: int + How many steps in advance we need to predict + freq: Optional[Union[str, int, List[int]]] + frequency information, it determines the configuration space of the window size, if it is not given, + we will use the default configuration + dataset_name: Optional[str], + dataset name optimize_metric (str): name of the metric that is used to evaluate a pipeline. budget_type (Optional[str]): @@ -218,6 +229,7 @@ def search( self.dataset = TimeSeriesForecastingDataset( X=X_train, Y=y_train, X_test=X_test, Y_test=y_test, + freq=freq, validator=self.InputValidator, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, @@ -226,6 +238,27 @@ def search( normalize_y=normalize_y, ) + if self.dataset.freq is not None or not self.customized_window_size: + base_window_size = self.dataset.freq + # we don't want base window size to large, which might cause a too long computation time, in which case + # we will use n_prediction_step instead (which is normally smaller than base_window_size) + if base_window_size > self.dataset.upper_window_size or base_window_size > MAX_WIDNOW_SIZE_BASE: + # TODO considering padding to allow larger upper_window_size !!! + base_window_size = min(n_prediction_steps, self.dataset.upper_window_size) + if base_window_size > MAX_WIDNOW_SIZE_BASE: + base_window_size = 50 # TODO this value comes from setting of solar dataset, do we have a better choice? 
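For context, a minimal sketch (not part of the patch): the frequency handling in this commit maps a named frequency to a seasonality and picks the smallest one that still exceeds the forecasting horizon; that value then seeds the `window_size` search space (roughly `[1x, 2x]` of the base with a `1.25x` default), provided it stays below the dataset's upper window size and `MAX_WIDNOW_SIZE_BASE`.

```python
SEASONALITY_MAP = {"hourly": [24, 168, 8766], "daily": 7, "weekly": 365.25 / 7}  # excerpt
n_prediction_steps = 30

freq = SEASONALITY_MAP["hourly"]
if isinstance(freq, list):
    # smallest seasonality that is still longer than the horizon
    freq = min(f for f in freq if f > n_prediction_steps)
print(freq)  # 168 -> window_size would be searched in roughly [168, 336] with default ~210
```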
+ if self.search_space_updates is None: + self.search_space_updates = HyperparameterSearchSpaceUpdates() + + window_size_scales = [1, 2] + + self.search_space_updates.append(node_name="data_loader", + hyperparameter="window_size", + value_range=[window_size_scales[0] * base_window_size, + window_size_scales[1] * base_window_size], + default_value=1.25 * base_window_size, + ) + if traditional_per_total_budget > 0.: self._logger.warning("Time series Forecasting for now does not support traditional classifiers. " "Setting traditional_per_total_budget to 0.") diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index ec773c293..8279ea754 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -43,6 +43,21 @@ #TIME_SERIES_REGRESSION_INPUT = Tuple[np.ndarray, np.ndarray] #TIME_SERIES_CLASSIFICATION_INPUT = Tuple[np.ndarray, np.ndarray] +# seasonality map, maps a frequency value to a number +SEASONALITY_MAP = { + "minutely": [1440, 10080, 525960], + "10_minutes": [144, 1008, 52596], + "half_hourly": [48, 336, 17532], + "hourly": [24, 168, 8766], + "daily": 7, + "weekly": 365.25/7, + "monthly": 12, + "quarterly": 4, + "yearly": 1 +} + +MAX_WIDNOW_SIZE_BASE = 500 + class TimeSeriesSequence(BaseDataset): def __init__(self, @@ -248,6 +263,7 @@ def __init__(self, Y: Union[np.ndarray, pd.Series], X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, + freq: Optional[Union[str, int, List[int]]] = None, dataset_name: Optional[str] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, @@ -261,11 +277,9 @@ def __init__(self, normalize_y: bool = True, ): """ - :param target_variables: The indices of the variables you want to forecast - :param sequence_length: The amount of past data you want to use to forecast future value - :param n_steps: The number of steps you want to forecast into the future - :param train: Tuple with one tensor holding the training data - :param val: Tuple with one tensor holding the validation data + :param freq: Optional[Union[str, int]] frequency of the series sequences, used to determine the (possible) + period + :param n_prediction_steps: The number of steps you want to forecast into the future :param shift_input_data: bool if the input X and targets needs to be shifted to be aligned: such that the data until X[t] is applied to predict the value y[t+n_prediction_steps] @@ -370,7 +384,6 @@ def __init__(self, idx_start_test = 0 sequence_datasets = [] - if X_test is None or Y_test is None: for seq_idx, seq_length_train in enumerate(self.sequence_lengths): idx_end_train = idx_start_train + seq_length_train @@ -440,6 +453,17 @@ def __init__(self, self.splits = self.get_splits_from_resampling_strategy() + if isinstance(freq, str): + if freq not in SEASONALITY_MAP: + Warning("The given freq name is not supported by our dataset, we will use the default " + "configuration space on the hyperparameter window_size, if you want to adapt this value" + "you could pass freq with a numerical value") + freq = SEASONALITY_MAP.get(freq, None) + if isinstance(freq, list): + tmp_freq = min([freq_value for freq_value in freq if freq_value > n_prediction_steps]) + freq = tmp_freq + + self.freq = freq def __getitem__(self, idx, train=True): if idx < 0: From 219e1120ffb9004f422116f6723bd9dad279c276 Mon Sep 17 00:00:00 
2001 From: Difan Deng Date: Fri, 3 Sep 2021 15:22:05 +0200 Subject: [PATCH 055/347] reduce unnecessary checking --- ...me_series_forecasting_feature_validator.py | 110 ------------ ...ime_series_forecasting_target_validator.py | 108 ----------- .../data/time_series_forecasting_validator.py | 169 ++++++++++++++---- autoPyTorch/datasets/resampling_strategy.py | 1 + autoPyTorch/datasets/time_series_dataset.py | 95 ++++------ 5 files changed, 163 insertions(+), 320 deletions(-) delete mode 100644 autoPyTorch/data/time_series_forecasting_feature_validator.py delete mode 100644 autoPyTorch/data/time_series_forecasting_target_validator.py diff --git a/autoPyTorch/data/time_series_forecasting_feature_validator.py b/autoPyTorch/data/time_series_forecasting_feature_validator.py deleted file mode 100644 index bbec1c878..000000000 --- a/autoPyTorch/data/time_series_forecasting_feature_validator.py +++ /dev/null @@ -1,110 +0,0 @@ -from typing import Optional, Union, List -import logging -import copy -import sklearn.utils - -from autoPyTorch.utils.logging_ import PicklableClientLogger - -import numpy as np - -import sklearn.utils -from sklearn.base import BaseEstimator -from sklearn.exceptions import NotFittedError - -from autoPyTorch.data.tabular_validator import TabularFeatureValidator -from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES - - -class TimeSeriesForecastingFeatureValidator(TabularFeatureValidator): - def __init__(self, - logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None, - ) -> None: - super(TimeSeriesForecastingFeatureValidator, self).__init__(logger) - self.feature_validators = None # type: Optional[List[TabularFeatureValidator]] - - def fit(self, - X_train: Union[np.ndarray, List[SUPPORTED_FEAT_TYPES]], - X_test: Optional[Union[np.ndarray, List[SUPPORTED_FEAT_TYPES]]] = None) -> BaseEstimator: - """ - We expect a time series dataset stored in the form :[time_series_sequences] - TODO can we directly read X_train and X_test from panda DataFrame - - Arguments: - X_train (np.ndarray): - A set of data that are going to be validated (type and dimensionality - checks) and used for fitting, it is composed of multiple time series sequences which might have - different length - - X_test (Optional[np.ndarray]): - An optional set of data that is going to be validated - - Returns: - self: - The fitted base estimator - """ - categorical_columns = [[] for _ in range(len(X_train))] - numerical_columns = [[] for _ in range(len(X_train))] - categories = [[] for _ in range(len(X_train))] - num_features = [0] * len(X_train) - - if X_test is not None: - if len(X_train) != len(X_test): - raise ValueError(f"Training data needs to have the same number sequences as the test data") - - self.feature_validators = [TabularFeatureValidator(self.logger) for _ in range(len(X_train))] - if X_test is not None: - for seq_idx, (X_train_seq, X_test_seq) in enumerate(zip(X_train, X_test)): - self.feature_validators[seq_idx].fit(X_train_seq, X_test_seq) - - categorical_columns[seq_idx] = self.feature_validators[seq_idx].categorical_columns - numerical_columns[seq_idx] = self.feature_validators[seq_idx].numerical_columns - categories[seq_idx] = self.feature_validators[seq_idx].categories - num_features[seq_idx] = self.feature_validators[seq_idx].num_features - else: - for seq_idx, X_train_seq in enumerate(X_train): - self.feature_validators[seq_idx].fit(X_train_seq) - - categorical_columns[seq_idx] = self.feature_validators[seq_idx].categorical_columns - 
numerical_columns[seq_idx] = self.feature_validators[seq_idx].numerical_columns - categories[seq_idx] = self.feature_validators[seq_idx].categories - num_features[seq_idx] = self.feature_validators[seq_idx].num_features - - if not np.all(np.asarray(categorical_columns) == categorical_columns[0]): - raise ValueError(f"All the sequence needs to have the same categorical columns!") - if not np.all(np.asarray(categories) == categories[0]): - raise ValueError(f"All the sequence needs to have the same categories!") - if not np.all(np.asarray(numerical_columns) == numerical_columns[0]): - raise ValueError(f"All the sequence needs to have the same Numerical columns!") - if not np.all(np.asarray(num_features) == num_features[0]): - raise ValueError(f"All the sequence needs to have the same number of features!") - - self.categories = categories[0] - self.num_features = num_features[0] - self.categorical_columns = categorical_columns[0] - self.numerical_columns = numerical_columns[0] - - self.feat_type = self.feature_validators[0].feat_type - self.data_type = self.feature_validators[0].data_type - self.dtypes = self.feature_validators[0].dtypes - self.column_order = self.feature_validators[0].column_order - self._is_fitted = True - - return self - - def transform(self, X: np.ndarray) -> np.ndarray: - """ - - Arguments: - X (np.ndarray): - A set of data, that is going to be transformed - - Return: - np.ndarray: - The transformed array - """ - if not self._is_fitted: - raise NotFittedError("Cannot call transform on a validator that is not fitted") - - for seq_idx in range(len(X)): - X[seq_idx] = self.feature_validators[seq_idx].transform(X[seq_idx]) - return X diff --git a/autoPyTorch/data/time_series_forecasting_target_validator.py b/autoPyTorch/data/time_series_forecasting_target_validator.py deleted file mode 100644 index 9469e0bf9..000000000 --- a/autoPyTorch/data/time_series_forecasting_target_validator.py +++ /dev/null @@ -1,108 +0,0 @@ -from autoPyTorch.data.tabular_target_validator import TabularTargetValidator - -import typing -import logging -import numpy as np - -import pandas as pd - -import scipy.sparse - -import sklearn -from sklearn.base import BaseEstimator -from sklearn.exceptions import NotFittedError -from sklearn.utils.multiclass import type_of_target - -from autoPyTorch.data.base_target_validator import BaseTargetValidator, SUPPORTED_TARGET_TYPES -from autoPyTorch.utils.logging_ import PicklableClientLogger - - -class TimeSeriesForecastingTargetValidator(TabularTargetValidator): - def __init__(self, - is_classification: bool = False, - logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger - ]] = None, - ) -> None: - TabularTargetValidator.__init__(self, is_classification, logger) - self.target_validators = None - - def fit( - self, - y_train: SUPPORTED_TARGET_TYPES, - y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, - ) -> BaseEstimator: - """ - Validates and fit a categorical encoder (if needed) to the targets - The supported data types are List, numpy arrays and pandas DataFrames. - - Arguments: - y_train (SUPPORTED_TARGET_TYPES) - A set of targets set aside for training - y_test (typing.Union[SUPPORTED_TARGET_TYPES]) - A hold out set of data used of the targets. It is also used to fit the - categories of the encoder. 
- """ - if y_test is not None: - if len(y_train) != len(y_test): - raise ValueError(f"Training target needs to have the same number sequences as the test target") - - self.target_validators = [TabularTargetValidator(self.is_classification, self.logger) for _ in - range(len(y_train))] - - out_dimensionality = [[] for _ in range(len(y_train))] - type_of_target = [""] * len(y_train) - - if y_test is not None: - for seq_idx, (y_train_seq, y_test_seq) in enumerate(zip(y_train, y_test)): - self.target_validators[seq_idx].fit(y_train_seq, y_test_seq) - - out_dimensionality[seq_idx] = self.target_validators[seq_idx].out_dimensionality - target_type = self.target_validators[seq_idx].type_of_target - if target_type in ['multiclass', "binary"]: - # for time series forecasting problems, we only support regression - type_of_target[seq_idx] = "continuous" - else: - type_of_target[seq_idx] = target_type - - else: - for seq_idx, y_train_seq in enumerate(y_train): - self.target_validators[seq_idx].fit(y_train_seq) - - out_dimensionality[seq_idx] = self.target_validators[seq_idx].out_dimensionality - target_type = self.target_validators[seq_idx].type_of_target - if target_type in ['multiclass', "binary"]: - # for time series forecasting problems, we only support regression - type_of_target[seq_idx] = "continuous" - else: - type_of_target[seq_idx] = target_type - - if not np.all(np.asarray(out_dimensionality) == out_dimensionality[0]): - raise ValueError(f"All the sequence needs to have the same out_dimensionality!") - if not np.all(np.asarray(type_of_target) == type_of_target[0]): - raise ValueError(f"All the sequence needs to have the same type_of_target!") - - - self.out_dimensionality = out_dimensionality[0] - self.type_of_target = type_of_target[0] - - self.data_type = self.target_validators[0].data_type - self.dtype = self.target_validators[0].dtype - - self._is_fitted = True - - return self - - def transform( - self, - y: typing.Union[SUPPORTED_TARGET_TYPES], - ) -> np.ndarray: - if not self._is_fitted: - raise NotFittedError("Cannot call transform on a validator that is not fitted") - for seq_idx in range(len(y)): - y[seq_idx] = self.target_validators[seq_idx].transform(y[seq_idx]) - return y - - """ - Validator for time series forecasting, currently only consider regression tasks - TODO: Considering Classification Validator - """ \ No newline at end of file diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index c05541e62..a4ea5927f 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -3,52 +3,143 @@ # -*- encoding: utf-8 -*- import logging import typing +import numpy as np -from autoPyTorch.data.time_series_forecasting_feature_validator import TimeSeriesForecastingFeatureValidator -from autoPyTorch.data.time_series_forecasting_target_validator import TimeSeriesForecastingTargetValidator -from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger +from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError -# TODO create a minxin class to perform same operations on both feature and target validators +from autoPyTorch.data.base_feature_validator import SUPPORTED_FEAT_TYPES +from autoPyTorch.data.base_target_validator import SUPPORTED_TARGET_TYPES +from autoPyTorch.data.tabular_validator import TabularInputValidator -class TimeSeriesForecastingInputValidator(TimeSeriesInputValidator): + +class 
TimeSeriesForecastingInputValidator(TabularInputValidator): """ - Makes sure the input data complies with Auto-PyTorch requirements. - - This class also perform checks for data integrity and flags the user - via informative errors. - - Attributes: - is_classification (bool): - For classification task, this flag indicates that the target data - should be encoded - feature_validator (FeatureValidator): - A FeatureValidator instance used to validate and encode feature columns to match - sklearn expectations on the data - target_validator (TargetValidator): - A TargetValidator instance used to validate and encode (in case of classification) - the target values + A validator designed for a time series forecasting dataset. + As a time series forecasting dataset might contain several time sequnces with """ + def fit( + self, + X_train: SUPPORTED_FEAT_TYPES, + y_train: SUPPORTED_TARGET_TYPES, + X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None, + y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + ) -> BaseEstimator: + # Check that the data is valid + if len(X_train) != len(y_train): + raise ValueError("Inconsistent number of sequences for features and targets," + " {} for features and {} for targets".format( + len(X_train), + len(y_train), + )) + + if X_test is not None: + if len(X_test) != len(y_test): + raise ValueError("Inconsistent number of test datapoints for features and targets," + " {} for features and {} for targets".format( + len(X_test), + len(y_test), + )) + super().fit(X_train[0], y_train[0], X_test[0], y_test[0]) + else: + super().fit(X_train[0], y_train[0]) + + self.check_input_shapes(X_train, y_train, is_training=True) + + if X_test is not None: + self.check_input_shapes(X_test, y_test, is_training=False) + return self + + @staticmethod + def get_num_features(X): + X_shape = np.shape(X) + return 1 if len(X_shape) == 1 else X_shape[1] - def __init__( + @staticmethod + def check_input_shapes(X, y, is_training: bool = True): + num_features = [0] * len(X) + out_dimensionality = [0] * len(y) + + for i in range(len(X)): + num_features[i] = TimeSeriesForecastingInputValidator.get_num_features(X[i]) + out_dimensionality[i] = TimeSeriesForecastingInputValidator.get_num_features(y[i]) + + if not np.all(np.asarray(num_features) == num_features[0]): + raise ValueError(f"All the sequences need to have the same number of features in " + f"{'train' if is_training else 'test'} set!") + + if not np.all(np.asarray(out_dimensionality) == out_dimensionality[0]): + raise ValueError(f"All the sequences need to have the same number of targets in " + f"{'train' if is_training else 'test'} set!") + + def transform( self, - is_classification: bool = False, - logger_port: typing.Optional[int] = None, - ) -> None: - super().__init__(is_classification=is_classification, logger_port=logger_port) - self.is_classification = is_classification - self.logger_port = logger_port - if self.logger_port is not None: - self.logger: typing.Union[logging.Logger, PicklableClientLogger] = get_named_client_logger( - name='Validation', - port=self.logger_port, - ) + X: SUPPORTED_FEAT_TYPES, + y: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + shift_input_data: bool = True, + n_prediction_steps: int = 1 + ) -> typing.Tuple[np.ndarray, typing.List[int], typing.Optional[np.ndarray]]: + if not self._is_fitted: + raise NotFittedError("Cannot call transform on a validator that is not fitted") + + num_sequences = len(X) + sequence_lengths = [0] * num_sequences + num_features = self.feature_validator.num_features + + if y 
is not None: + num_targets = self.target_validator.out_dimensionality + if shift_input_data: + for seq_idx in range(num_sequences): + X[seq_idx] = X[seq_idx][:-n_prediction_steps] + y[seq_idx] = y[seq_idx][n_prediction_steps:] + sequence_lengths[seq_idx] = len(X[seq_idx]) + else: + for seq_idx in range(num_sequences): + sequence_lengths[seq_idx] = len(X[seq_idx]) + + num_train_data = np.sum(sequence_lengths) + + # a matrix that is concatenated by all the time series sequences + X_flat = np.empty([num_train_data, num_features]) + y_flat = np.empty([num_train_data, num_targets]) + + start_idx = 0 + for seq_idx, seq_length in enumerate(sequence_lengths): + end_idx = start_idx + seq_length + X_flat[start_idx: end_idx] = np.array(X[seq_idx]).reshape([-1, num_features]) + y_flat[start_idx: end_idx] = np.array(y[seq_idx]).reshape([-1, num_targets]) + start_idx = end_idx + + X_transformed = self.feature_validator.transform(X_flat) + y_transformed = self.target_validator.transform(y_flat) + return X_transformed, sequence_lengths, y_transformed else: - self.logger = logging.getLogger('Validation') + if shift_input_data: + for seq_idx in range(num_sequences): + X[seq_idx] = X[seq_idx][:-n_prediction_steps] + sequence_lengths[seq_idx] = len(X[seq_idx]) + else: + for seq_idx in range(num_sequences): + sequence_lengths[seq_idx] = len(X[seq_idx]) + + num_train_data = np.sum(sequence_lengths) + + # a matrix that is concatenated by all the time series sequences + X_flat = np.empty([num_train_data, num_features]) + + start_idx = 0 + # TODO make it parallel with large number of sequences + for seq_idx, seq_length in enumerate(sequence_lengths): + end_idx = start_idx + seq_length + X_flat[start_idx: end_idx] = np.array(X[seq_idx]).reshape([-1, num_features]) + start_idx = end_idx + + X_transformed = self.feature_validator.transform(X_flat) + + return X_transformed, sequence_lengths + + + + - self.feature_validator = TimeSeriesForecastingFeatureValidator(logger=self.logger) - self.target_validator = TimeSeriesForecastingTargetValidator( - is_classification=self.is_classification, - logger=self.logger - ) - self._is_fitted = False \ No newline at end of file diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 44715038c..5ae1ab2ed 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -140,6 +140,7 @@ def k_fold_cross_validation(num_splits: int, indices: np.ndarray, **kwargs: Any) # TODO DO we move these under autoPyTorch/datasets/time_series_dataset.py? +# TODO rewrite this part, as we only need holdout sets def time_series_hold_out_validation(val_share: float, indices: np.ndarray, **kwargs: Any) \ -> Tuple[np.ndarray, np.ndarray]: """ diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 8279ea754..f5d625e44 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -73,7 +73,7 @@ def __init__(self, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, n_prediction_steps: int = 1, - do_split=True, + do_split=False, ): """ A dataset representing a time series sequence. 
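Note on the data layout that the hunks below rely on: the forecasting validator's transform() now flattens the list of per-sequence arrays into a single 2-D matrix and additionally returns the individual sequence lengths, which the dataset uses to slice the matrix back into TimeSeriesSequence objects. The following is a minimal standalone sketch of that flatten-and-shift step; the helper name and the toy data are illustrative only and are not part of the patch:

    import numpy as np

    def flatten_and_shift(X_seqs, y_seqs, n_prediction_steps=1):
        # Align X[t] with y[t + n_prediction_steps]: drop the last horizon steps of X
        # and the first horizon steps of y (this is what shift_input_data=True does).
        X_shifted = [np.asarray(x)[:-n_prediction_steps] for x in X_seqs]
        y_shifted = [np.asarray(y)[n_prediction_steps:] for y in y_seqs]
        sequence_lengths = [len(x) for x in X_shifted]
        # Concatenate all sequences into one matrix; the recorded lengths make it
        # possible to split the preprocessed matrix back into individual sequences.
        X_flat = np.concatenate([x.reshape(len(x), -1) for x in X_shifted])
        y_flat = np.concatenate([y.reshape(len(y), -1) for y in y_shifted])
        return X_flat, sequence_lengths, y_flat

    # two toy sequences of different length, one feature each
    X = [np.arange(5.0), np.arange(8.0)]
    y = [np.arange(5.0) * 2.0, np.arange(8.0) * 2.0]
    X_flat, lengths, y_flat = flatten_and_shift(X, y, n_prediction_steps=1)
    # lengths == [4, 7]; X_flat.shape == (11, 1); the dataset later cuts X_flat back
    # into chunks of 4 and 7 rows to rebuild the individual TimeSeriesSequence objects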
@@ -296,7 +296,8 @@ def __init__(self, raise ValueError(f"This dataset only support TimeSeriesForecastingInputValidator " f"but receive {type(validator)}") - self.validator.fit(X_train=X, y_train=Y, X_test=X_test, y_test=Y_test,) + if not self.validator._is_fitted: + self.validator.fit(X_train=X, y_train=Y, X_test=X_test, y_test=Y_test,) self.numerical_columns = self.validator.feature_validator.numerical_columns self.categorical_columns = self.validator.feature_validator.categorical_columns @@ -304,7 +305,10 @@ def __init__(self, self.num_features = self.validator.feature_validator.num_features # type: int self.num_target = self.validator.target_validator.out_dimensionality # type: int - X, Y = self.validator.transform(X, Y) + + X, sequence_lengths, Y = self.validator.transform(X, Y) + if X_test is not None: + X_test, sequence_lengths_tests, Y_test = self.validator.transform(X_test, Y_test) self.shuffle = shuffle self.rand = np.random.RandomState(seed=seed) @@ -318,58 +322,20 @@ def __init__(self, self.val_transform = val_transforms self.num_sequences = len(X) - self.sequence_lengths = [0] * self.num_sequences - if shift_input_data: - for seq_idx in range(self.num_sequences): - X[seq_idx] = X[seq_idx][:-n_prediction_steps] - Y[seq_idx] = Y[seq_idx][n_prediction_steps:] - self.sequence_lengths[seq_idx] = len(X[seq_idx]) - else: - for seq_idx in range(self.num_sequences): - self.sequence_lengths[seq_idx] = len(X[seq_idx]) - - num_train_data = np.sum(self.sequence_lengths) - X_train_flatten = np.empty([num_train_data, self.num_features]) - Y_train_flatten = np.empty([num_train_data, self.num_target]) - start_idx = 0 - - self.sequences = [] - # flatten the sequences to allow data preprocessing - - for seq_idx, seq_length in enumerate(self.sequence_lengths): - end_idx = start_idx + seq_length - X_train_flatten[start_idx: end_idx] = np.array(X[seq_idx]).reshape([-1, self.num_features]) - Y_train_flatten[start_idx: end_idx] = np.array(Y[seq_idx]).reshape([-1, self.num_target]) - start_idx = end_idx - - sequence_lengths_test = [0] * self.num_sequences - - if X_test is not None or Y_test is not None: - for seq_idx in range(self.num_sequences): - sequence_lengths_test[seq_idx] = len(X_test[seq_idx]) - num_test_data = np.sum(sequence_lengths_test) - X_test_flatten = np.empty([num_test_data, self.num_features]) - Y_test_flatten = np.empty([num_test_data, self.num_target]) - start_idx = 0 - - for seq_idx, seq_length in enumerate(sequence_lengths_test): - end_idx = start_idx + seq_length - X_test_flatten[start_idx: end_idx] = np.array(X_test[seq_idx]).reshape([-1, self.num_features]) - Y_test_flatten[start_idx: end_idx] = np.array(Y_test[seq_idx]).reshape([-1, self.num_target]) - start_idx = end_idx + self.sequence_lengths = sequence_lengths if dataset_name is None: - self.dataset_name = hash_array_or_matrix(X_train_flatten) + self.dataset_name = hash_array_or_matrix(X) else: self.dataset_name = dataset_name dataset_name_seqs = [f"{dataset_name}_sequence_{i}" for i in range(self.num_sequences)] if normalize_y: - self.y_train_mean = np.mean(Y_train_flatten) - self.y_train_std = np.std(Y_train_flatten) - Y_train_flatten = (Y_train_flatten - self.y_train_mean) / self.y_train_std + self.y_train_mean = np.mean(Y) + self.y_train_std = np.std(Y) + Y = (Y - self.y_train_mean) / self.y_train_std if Y_test is not None: - Y_test_flatten = (Y_test_flatten - self.y_train_mean) / self.y_train_std + Y_test = (Y_test - self.y_train_mean) / self.y_train_std else: self.y_train_mean = 0 self.y_train_std = 1 @@ -387,38 
+353,34 @@ def __init__(self, if X_test is None or Y_test is None: for seq_idx, seq_length_train in enumerate(self.sequence_lengths): idx_end_train = idx_start_train + seq_length_train - sequence = TimeSeriesSequence(X=X_train_flatten[idx_start_train: idx_end_train], - Y=Y_train_flatten[idx_start_train: idx_end_train], + sequence = TimeSeriesSequence(X=X[idx_start_train: idx_end_train], + Y=Y[idx_start_train: idx_end_train], dataset_name=dataset_name_seqs[seq_idx], seed=self.rand.randint(0, 2**20), **sequences_kwargs) sequence_datasets.append(sequence) idx_start_train = idx_end_train - - self.sequence_lengths[seq_idx] = len(sequence) else: - for seq_idx, (seq_length_train, seq_length_test) in enumerate(zip(self.sequence_lengths, sequence_lengths_test)): + for seq_idx, (seq_length_train, seq_length_test) in enumerate(zip(self.sequence_lengths, sequence_lengths_tests)): idx_end_train = idx_start_train + seq_length_train idx_end_test = idx_start_test + seq_length_test - sequence = TimeSeriesSequence(X=X_train_flatten[idx_start_train: idx_end_train], - Y=Y_train_flatten[idx_start_train: idx_end_train], - X_test=X_test_flatten[idx_start_test: idx_end_test], - Y_test=Y_test_flatten[idx_start_test: idx_end_test], + sequence = TimeSeriesSequence(X=X[idx_start_train: idx_end_train], + Y=Y[idx_start_train: idx_end_train], + X_test=X_test[idx_start_test: idx_end_test], + Y_test=Y_test[idx_start_test: idx_end_test], dataset_name=dataset_name_seqs[seq_idx], seed=self.rand.randint(0, 2**20), **sequences_kwargs) sequence_datasets.append(sequence) idx_start_train = idx_end_train - self.sequence_lengths[seq_idx] = len(sequence) - ConcatDataset.__init__(self, datasets=sequence_datasets) self.seq_length_min = np.min(self.sequence_lengths) - self.train_tensors = (X_train_flatten, Y_train_flatten) + self.train_tensors = (X, Y) if X_test is not None or Y_test is not None: - self.test_tensors = (X_test_flatten, Y_test_flatten) + self.test_tensors = (X_test, Y_test) else: self.test_tensors = None self.val_tensors = None @@ -590,8 +552,13 @@ def create_cross_val_splits( raise NotImplementedError(f'The selected `cross_val_type` "{cross_val_type}" is not implemented.') idx_start = 0 splits = [[[] for _ in range(len(self.datasets))] for _ in range(num_splits)] + + kwargs = {"n_prediction_steps": self.n_prediction_steps} + splits = [[() for _ in range(self.num_sequences)] for _ in range(num_splits)] + idx_all = self._get_indices() + for idx_seq, dataset in enumerate(self.datasets): - split = dataset.create_cross_val_splits(cross_val_type, num_splits=num_splits) + split = self.cross_validators[cross_val_type.name](num_splits, indices=dataset._get_indices(), **kwargs) for idx_split in range(num_splits): splits[idx_split][idx_seq] = idx_start + split[idx_split] idx_start += self.sequence_lengths[idx_seq] @@ -631,11 +598,14 @@ def create_holdout_val_split( raise ValueError(f"`val_share` must be between 0 and 1, got {val_share}.") if not isinstance(holdout_val_type, HoldoutValTypes): raise NotImplementedError(f'The specified `holdout_val_type` "{holdout_val_type}" is not supported.') + kwargs = {"n_prediction_steps": self.n_prediction_steps} splits = [[() for _ in range(len(self.datasets))] for _ in range(2)] idx_start = 0 for idx_seq, dataset in enumerate(self.datasets): - split = dataset.create_holdout_val_split(holdout_val_type, val_share) + split = self.holdout_validators[holdout_val_type.name](holdout_val_type, + indices=dataset._get_indices(), + **kwargs) for idx_split in range(2): splits[idx_split][idx_seq] = 
idx_start + split[idx_split] idx_start += self.sequence_lengths[idx_seq] @@ -754,6 +724,5 @@ def get_required_dataset_info(self) -> Dict[str, Any]: 'task_type': self.task_type, 'numerical_features': self.numerical_features, 'categorical_features': self.categorical_features, - }) return info From 092e75b61c713cd11f1da29475c0cda1a492885a Mon Sep 17 00:00:00 2001 From: Difan Deng Date: Mon, 6 Sep 2021 19:18:26 +0200 Subject: [PATCH 056/347] maint, allow dummy classifier to do time series prediction --- autoPyTorch/api/base_task.py | 9 +++- autoPyTorch/api/time_series_forecasting.py | 10 +++-- .../data/time_series_forecasting_validator.py | 39 +++++++---------- autoPyTorch/datasets/time_series_dataset.py | 14 ++++--- ...time_series_forecasting_train_evaluator.py | 42 +++++++++++++++---- autoPyTorch/evaluation/train_evaluator.py | 4 +- .../time_series_forecasting_data_loader.py | 2 + .../pipeline/time_series_forecasting.py | 1 - 8 files changed, 74 insertions(+), 47 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index e697a7d95..4519c1859 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -41,6 +41,7 @@ from autoPyTorch.ensemble.singlebest_ensemble import SingleBest from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash +from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator from autoPyTorch.optimizer.smbo import AutoMLSMBO from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.setup.traditional_ml.classifier_models import get_available_classifiers @@ -196,6 +197,8 @@ def __init__( raise ValueError("Expected search space updates to be of instance" " HyperparameterSearchSpaceUpdates got {}".format(type(self.search_space_updates))) + self.time_series_prediction = False + @abstractmethod def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]: """ @@ -514,7 +517,8 @@ def _do_dummy_prediction(self, num_run: int) -> None: stats=stats, memory_limit=memory_limit, disable_file_output=True if len(self._disable_file_output) > 0 else False, - all_supported_metrics=self._all_supported_metrics + all_supported_metrics=self._all_supported_metrics, + evaluator_class=TimeSeriesForecastingTrainEvaluator if self.time_series_prediction else None, ) status, cost, runtime, additional_info = ta.run(num_run, cutoff=self._time_for_task) @@ -586,7 +590,8 @@ def _do_traditional_prediction(self, num_run: int, time_for_traditional: int) -> stats=stats, memory_limit=memory_limit, disable_file_output=True if len(self._disable_file_output) > 0 else False, - all_supported_metrics=self._all_supported_metrics + all_supported_metrics=self._all_supported_metrics, + evaluator_class=TimeSeriesForecastingTrainEvaluator if self.time_series_prediction else None, ) dask_futures.append((classifier, self._dask_client.submit(ta.run, config=classifier, cutoff=time_for_traditional_classifier_sec))) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 3e7971b75..0996b14d8 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -22,6 +22,7 @@ from autoPyTorch.utils.backend import Backend from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + class TimeSeriesForecastingTask(BaseTask): """ Time Series 
Forcasting API to the pipelines. @@ -97,6 +98,7 @@ def __init__( # user has already specified a window_size range if update.node_name == 'data_loader' and update.hyperparameter == 'window_size': self.customized_window_size = True + self.time_series_prediction = True def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]: if not isinstance(dataset, TimeSeriesForecastingDataset): @@ -239,12 +241,12 @@ def search( ) if self.dataset.freq is not None or not self.customized_window_size: - base_window_size = self.dataset.freq + base_window_size = int(np.ceil(self.dataset.freq)) # we don't want base window size to large, which might cause a too long computation time, in which case # we will use n_prediction_step instead (which is normally smaller than base_window_size) if base_window_size > self.dataset.upper_window_size or base_window_size > MAX_WIDNOW_SIZE_BASE: # TODO considering padding to allow larger upper_window_size !!! - base_window_size = min(n_prediction_steps, self.dataset.upper_window_size) + base_window_size = int(np.ceil(min(n_prediction_steps, self.dataset.upper_window_size))) if base_window_size > MAX_WIDNOW_SIZE_BASE: base_window_size = 50 # TODO this value comes from setting of solar dataset, do we have a better choice? if self.search_space_updates is None: @@ -256,7 +258,7 @@ def search( hyperparameter="window_size", value_range=[window_size_scales[0] * base_window_size, window_size_scales[1] * base_window_size], - default_value=1.25 * base_window_size, + default_value=int(np.ceil(1.25 * base_window_size)), ) if traditional_per_total_budget > 0.: @@ -279,7 +281,7 @@ def search( precision=precision, disable_file_output=disable_file_output, load_models=load_models, - time_series_prediction=True + time_series_prediction=self.time_series_prediction ) def predict( diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index a4ea5927f..97563fa80 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -18,35 +18,30 @@ class TimeSeriesForecastingInputValidator(TabularInputValidator): A validator designed for a time series forecasting dataset. 
As a time series forecasting dataset might contain several time sequnces with """ + def fit( - self, - X_train: SUPPORTED_FEAT_TYPES, - y_train: SUPPORTED_TARGET_TYPES, - X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None, - y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + self, + X_train: SUPPORTED_FEAT_TYPES, + y_train: SUPPORTED_TARGET_TYPES, + X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None, + y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, ) -> BaseEstimator: # Check that the data is valid if len(X_train) != len(y_train): raise ValueError("Inconsistent number of sequences for features and targets," - " {} for features and {} for targets".format( - len(X_train), - len(y_train), - )) + " {} for features and {} for targets".format(len(X_train), len(y_train),)) if X_test is not None: if len(X_test) != len(y_test): raise ValueError("Inconsistent number of test datapoints for features and targets," - " {} for features and {} for targets".format( - len(X_test), - len(y_test), - )) + " {} for features and {} for targets".format(len(X_test), len(y_test), )) super().fit(X_train[0], y_train[0], X_test[0], y_test[0]) else: super().fit(X_train[0], y_train[0]) self.check_input_shapes(X_train, y_train, is_training=True) - if X_test is not None: + if X_test is not None: self.check_input_shapes(X_test, y_test, is_training=False) return self @@ -73,11 +68,11 @@ def check_input_shapes(X, y, is_training: bool = True): f"{'train' if is_training else 'test'} set!") def transform( - self, - X: SUPPORTED_FEAT_TYPES, - y: typing.Optional[SUPPORTED_TARGET_TYPES] = None, - shift_input_data: bool = True, - n_prediction_steps: int = 1 + self, + X: SUPPORTED_FEAT_TYPES, + y: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + shift_input_data: bool = True, + n_prediction_steps: int = 1 ) -> typing.Tuple[np.ndarray, typing.List[int], typing.Optional[np.ndarray]]: if not self._is_fitted: raise NotFittedError("Cannot call transform on a validator that is not fitted") @@ -137,9 +132,3 @@ def transform( X_transformed = self.feature_validator.transform(X_flat) return X_transformed, sequence_lengths - - - - - - diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index f5d625e44..dbd87c405 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -290,7 +290,7 @@ def __init__(self, self.n_prediction_steps = n_prediction_steps if validator is None: validator = TimeSeriesForecastingInputValidator(is_classification=False) - self.validator = validator + self.validator : TimeSeriesForecastingInputValidator = validator if not isinstance(validator, TimeSeriesForecastingInputValidator): raise ValueError(f"This dataset only support TimeSeriesForecastingInputValidator " @@ -305,10 +305,13 @@ def __init__(self, self.num_features = self.validator.feature_validator.num_features # type: int self.num_target = self.validator.target_validator.out_dimensionality # type: int - - X, sequence_lengths, Y = self.validator.transform(X, Y) + X, sequence_lengths, Y = self.validator.transform(X, Y, + shift_input_data=shift_input_data, + n_prediction_steps=n_prediction_steps) if X_test is not None: - X_test, sequence_lengths_tests, Y_test = self.validator.transform(X_test, Y_test) + X_test, sequence_lengths_tests, Y_test = self.validator.transform(X_test, Y_test, + shift_input_data=shift_input_data, + n_prediction_steps=n_prediction_steps) self.shuffle = shuffle self.rand = np.random.RandomState(seed=seed) @@ -361,7 +364,8 @@ 
def __init__(self, sequence_datasets.append(sequence) idx_start_train = idx_end_train else: - for seq_idx, (seq_length_train, seq_length_test) in enumerate(zip(self.sequence_lengths, sequence_lengths_tests)): + for seq_idx, (seq_length_train, seq_length_test) in enumerate(zip(self.sequence_lengths, + sequence_lengths_tests)): idx_end_train = idx_start_train + seq_length_train idx_end_test = idx_start_test + seq_length_test sequence = TimeSeriesSequence(X=X[idx_start_train: idx_end_train], diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 9a6cdf3b0..7324ea83d 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -1,13 +1,15 @@ from autoPyTorch.evaluation.train_evaluator import TrainEvaluator +from autoPyTorch.evaluation.abstract_evaluator import DummyClassificationPipeline from multiprocessing.queues import Queue from typing import Any, Dict, List, Optional, Tuple, Union, no_type_check, ClassVar - +from functools import partial import warnings from ConfigSpace.configuration_space import Configuration import numpy as np +import pandas as pd from sklearn.base import BaseEstimator @@ -31,6 +33,26 @@ from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset +class DummyTimeSeriesPredictionPipeline(DummyClassificationPipeline): + def __init__(self, config: Configuration, + random_state: Optional[Union[int, np.random.RandomState]] = None, + init_params: Optional[Dict] = None, + n_prediction_steps: int = 1, + ) -> None: + super(DummyTimeSeriesPredictionPipeline, self).__init__(config, random_state, init_params) + self.n_prediction_steps = n_prediction_steps + + def predict_proba(self, X: Union[np.ndarray, pd.DataFrame], + batch_size: int = 1000) -> np.array: + new_X = np.ones((self.n_prediction_steps, 1)) + return super(DummyTimeSeriesPredictionPipeline, self).predict_proba(new_X) + + def predict(self, X: Union[np.ndarray, pd.DataFrame], + batch_size: int = 1000) -> np.array: + new_X = np.ones((self.n_prediction_steps, 1)) + return super(DummyTimeSeriesPredictionPipeline, self).predict(new_X).astype(np.float32) + + class TimeSeriesForecastingTrainEvaluator(TrainEvaluator): def __init__(self, backend: Backend, queue: Queue, metric: autoPyTorchMetric, @@ -69,11 +91,15 @@ def __init__(self, backend: Backend, queue: Queue, pipeline_config=pipeline_config, search_space_updates=search_space_updates ) - self.pipeline_class = TimeSeriesForecastingPipeline self.datamanager: TimeSeriesForecastingDataset self.n_prediction_steps = self.datamanager.n_prediction_steps self.num_sequences = self.datamanager.num_sequences + if isinstance(self.configuration, int): + self.pipeline_class = partial(DummyTimeSeriesPredictionPipeline, n_prediction_steps=self.n_prediction_steps) + else: + self.pipeline_class = TimeSeriesForecastingPipeline + self.splits = self.datamanager.splits if self.splits is None: raise AttributeError("Must have called create_splits on {}".format(self.datamanager.__class__.__name__)) @@ -103,6 +129,10 @@ def fit_predict_and_loss(self) -> None: y_optimization = np.ones([len(test_split), self.n_prediction_steps]) + # We implement this with the following reasons: + # given a series data, we don't know which value to predict so we predict the last n_predicted values + # However, this makes the shape unaligned with the shape of "self.Y_optimization" + # TODO consider fixed this 
under data loader (use pipline to do a preprocessing) y_test_split = np.repeat(test_split, self.n_prediction_steps) - \ np.tile(np.arange(self.n_prediction_steps), len(test_split)) @@ -277,6 +307,7 @@ def _predict(self, pipeline: BaseEstimator, datamanager = self.datamanager y_pred = np.ones([len(test_indices), self.n_prediction_steps]) for seq_idx, test_idx in enumerate(test_indices): + import pdb y_pred[seq_idx] = self.predict_function(self.datamanager[test_idx][0], pipeline).flatten() #train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, @@ -305,10 +336,3 @@ def _predict(self, pipeline: BaseEstimator, return np.empty(1), opt_pred, valid_pred, test_pred - - - - - - - diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index bcf1d688f..f4e901677 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -330,8 +330,10 @@ def eval_function( all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, instance: str = None, - evaluator_class: ClassVar[AbstractEvaluator] = TrainEvaluator, + evaluator_class: Optional[AbstractEvaluator] = None, ) -> None: + if evaluator_class is None: + evaluator_class = TrainEvaluator evaluator = evaluator_class( backend=backend, queue=queue, diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 0742bb388..78fcab561 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -228,6 +228,8 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd """ Creates a data loader object from the provided data, applying the transformations meant to validation objects + This is a lazy loaded test set, each time only one piece of series data is passed to the dataloader and we + expand the sampling indices inside this function, """ # TODO any better way to deal with prediction data loader for multiple sequences if isinstance(X, np.ndarray): diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 9541d0acf..f4a39f3d0 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -224,7 +224,6 @@ def _get_estimator_hyperparameter_name(self) -> str: """ return "time_series_predictor" - def predict(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: """Predict the output using the selected model. 
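Before the next patch, a short recap of how the window_size hyperparameter range is derived from the dataset frequency, since the pieces are spread over the seasonality map, MAX_WIDNOW_SIZE_BASE and the search() changes above. The helper below is only an illustrative condensation of that logic under simplifying assumptions, not a function from the code base:

    import numpy as np

    # abridged copy of the map defined in time_series_dataset.py
    SEASONALITY_MAP = {
        "minutely": [1440, 10080, 525960],
        "hourly": [24, 168, 8766],
        "daily": 7,
        "weekly": 365.25 / 7,
        "monthly": 12,
        "quarterly": 4,
        "yearly": 1,
    }
    MAX_WINDOW_SIZE_BASE = 500  # named MAX_WIDNOW_SIZE_BASE in the patch

    def window_size_search_space(freq, n_prediction_steps, upper_window_size):
        # Assumes a known frequency name; the real code warns and falls back to the
        # default search space when the name is unknown.
        seasonality = SEASONALITY_MAP[freq] if isinstance(freq, str) else freq
        if isinstance(seasonality, list):
            # keep the smallest period that is still longer than the forecast horizon
            seasonality = min(s for s in seasonality if s > n_prediction_steps)
        base = int(np.ceil(seasonality))
        # Fall back to the horizon when the seasonality-based window is too large.
        if base > upper_window_size or base > MAX_WINDOW_SIZE_BASE:
            base = int(np.ceil(min(n_prediction_steps, upper_window_size)))
            if base > MAX_WINDOW_SIZE_BASE:
                base = 50
        # window_size_scales = [1, 2]; the default sits at 1.25 * base, rounded up.
        return (base, 2 * base), int(np.ceil(1.25 * base))

    value_range, default = window_size_search_space("hourly", n_prediction_steps=12,
                                                    upper_window_size=200)
    # value_range == (24, 48), default == 30

In other words, the window size is searched between one and two seasonal periods, with the default a quarter of a period above the lower bound, and the seasonality-based value is only overridden when it would exceed the dataset's upper window size or the global cap.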
From 62c9e863a73b7f7fbdcf270a0eaa42aae8c47332 Mon Sep 17 00:00:00 2001 From: Difan Deng Date: Fri, 10 Sep 2021 16:19:47 +0200 Subject: [PATCH 057/347] allow scaler --- autoPyTorch/datasets/time_series_dataset.py | 285 ++++++++---------- .../TimeSeriesTransformer.py | 6 +- .../scaling/utils.py | 27 +- .../time_series_forecasting_data_loader.py | 10 +- .../pipeline/time_series_forecasting.py | 2 +- 5 files changed, 148 insertions(+), 182 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index dbd87c405..717059687 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -9,7 +9,6 @@ from torch.utils.data.dataset import Dataset, Subset, ConcatDataset - import torchvision.transforms from autoPyTorch.constants import ( @@ -24,7 +23,7 @@ TIMESERIES_FORECASTING, ) from autoPyTorch.data.base_validator import BaseInputValidator -from autoPyTorch.datasets.base_dataset import BaseDataset, type_check, type_of_target, TransformSubset +from autoPyTorch.datasets.base_dataset import BaseDataset, BASE_DATASET_INPUT, type_of_target from autoPyTorch.datasets.resampling_strategy import ( DEFAULT_RESAMPLING_PARAMETERS, CrossValTypes, @@ -37,7 +36,6 @@ from autoPyTorch.utils.common import FitRequirement from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator from autoPyTorch.utils.common import FitRequirement, hash_array_or_matrix -from autoPyTorch.datasets.tabular_dataset import TabularDataset #TIME_SERIES_FORECASTING_INPUT = Tuple[np.ndarray, np.ndarray] # currently only numpy arrays are supported #TIME_SERIES_REGRESSION_INPUT = Tuple[np.ndarray, np.ndarray] @@ -59,60 +57,32 @@ MAX_WIDNOW_SIZE_BASE = 500 -class TimeSeriesSequence(BaseDataset): +class TimeSeriesSequence(Dataset): def __init__(self, X: Union[np.ndarray, pd.DataFrame], Y: Union[np.ndarray, pd.Series], X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, - dataset_name: Optional[str] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.time_series_hold_out_validation, - resampling_strategy_args: Optional[Dict[str, Any]] = None, - shuffle: bool = False, - seed: Optional[int] = 42, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, n_prediction_steps: int = 1, - do_split=False, ): """ A dataset representing a time series sequence. 
Args: - train_tensors: - dataset_name: - val_tensors: - test_tensors: - resampling_strategy: - resampling_strategy_args: seed: train_transforms: val_transforms: n_prediction_steps: int, how many steps need to be predicted in advance """ - train_tensors = (X, Y) - test_tensors = (X_test, Y_test) + train_tensors = [X, Y] + test_tensors = [X_test, Y_test] self.n_prediction_steps = n_prediction_steps - if dataset_name is not None: - self.dataset_name = dataset_name - else: - self.dataset_name = hash_array_or_matrix(train_tensors[0]) - self.train_tensors = train_tensors self.val_tensors = None self.test_tensors = test_tensors - self.rand = np.random.RandomState(seed=seed) - self.shuffle = shuffle - - if do_split: - self.resampling_strategy = resampling_strategy - self.resampling_strategy_args = resampling_strategy_args - - # we only allow time series cross validation and holdout validation - self.cross_validators = get_cross_validators(CrossValTypes.time_series_cross_validation) - self.holdout_validators = get_holdout_validators(HoldoutValTypes.time_series_hold_out_validation) - # We also need to be able to transform the data, be it for pre-processing # or for augmentation self.train_transform = train_transforms @@ -156,102 +126,29 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]: def __len__(self) -> int: return self.train_tensors[0].shape[0] - def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]]]: - """ - Creates a set of splits based on a resampling strategy provided, apart from the - 'get_splits_from_resampling_strategy' implemented in base_dataset, here we will get self.upper_window_size - with the given value - - Returns - (List[Tuple[List[int], List[int]]]): splits in the [train_indices, val_indices] format - """ - splits = [] - if isinstance(self.resampling_strategy, HoldoutValTypes): - val_share = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( - 'val_share', None) - if self.resampling_strategy_args is not None: - val_share = self.resampling_strategy_args.get('val_share', val_share) - splits.append(self.create_holdout_val_split(holdout_val_type=self.resampling_strategy, - val_share=val_share)) - - if self.val_tensors is not None: - upper_window_size = self.__len__() - self.n_prediction_steps - else: - upper_window_size = int(self.__len__() * val_share) - self.n_prediction_steps - - elif isinstance(self.resampling_strategy, CrossValTypes): - num_splits = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( - 'num_splits', None) - if self.resampling_strategy_args is not None: - num_splits = self.resampling_strategy_args.get('num_splits', num_splits) - # Create the split if it was not created before - splits.extend(self.create_cross_val_splits( - cross_val_type=self.resampling_strategy, - num_splits=cast(int, num_splits), - )) - upper_window_size = (self.__len__() // num_splits) - self.n_prediction_steps - else: - raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") - self.upper_window_size = upper_window_size - return splits - - def create_cross_val_splits( - self, - cross_val_type: CrossValTypes, - num_splits: int - ) -> List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]: - """ - This function creates the cross validation split for the given task. 
- - It is done once per dataset to have comparable results among pipelines - Args: - cross_val_type (CrossValTypes): - num_splits (int): number of splits to be created - - Returns: - (List[Tuple[List[int], List[int]]]): splits in the [train_indices, val_indices] format + def update_transform(self, transform: Optional[torchvision.transforms.Compose], + train: bool = True, + ) -> 'BaseDataset': """ - # Create just the split once - # This is gonna be called multiple times, because the current dataset - # is being used for multiple pipelines. That is, to be efficient with memory - # we dump the dataset to memory and read it on a need basis. So this function - # should be robust against multiple calls, and it does so by remembering the splits - if not isinstance(cross_val_type, CrossValTypes): - raise NotImplementedError(f'The selected `cross_val_type` "{cross_val_type}" is not implemented.') - kwargs = {"n_prediction_steps": self.n_prediction_steps} - split = self.cross_validators[cross_val_type.name](num_splits, **kwargs) - return split + During the pipeline execution, the pipeline object might propose transformations + as a product of the current pipeline configuration being tested. - def create_holdout_val_split( - self, - holdout_val_type: HoldoutValTypes, - val_share: float, - ) -> Tuple[np.ndarray, np.ndarray]: - """ - This function creates the holdout split for the given task. + This utility allows to return a self with the updated transformation, so that + a dataloader can yield this dataset with the desired transformations - It is done once per dataset to have comparable results among pipelines Args: - holdout_val_type (HoldoutValTypes): - val_share (float): share of the validation data + transform (torchvision.transforms.Compose): The transformations proposed + by the current pipeline + train (bool): Whether to update the train or validation transform Returns: - (Tuple[np.ndarray, np.ndarray]): Tuple containing (train_indices, val_indices) + self: A copy of the update pipeline """ - if holdout_val_type is None: - raise ValueError( - '`val_share` specified, but `holdout_val_type` not specified.' 
- ) - if self.val_tensors is not None: - raise ValueError( - '`val_share` specified, but the Dataset was a given a pre-defined split at initialization already.') - if val_share < 0 or val_share > 1: - raise ValueError(f"`val_share` must be between 0 and 1, got {val_share}.") - if not isinstance(holdout_val_type, HoldoutValTypes): - raise NotImplementedError(f'The specified `holdout_val_type` "{holdout_val_type}" is not supported.') - kwargs = {"n_prediction_steps": self.n_prediction_steps} - train, val = self.holdout_validators[holdout_val_type.name](val_share, self._get_indices(), **kwargs) - return train, val + if train: + self.train_transform = transform + else: + self.val_transform = transform + return self class TimeSeriesForecastingDataset(BaseDataset, ConcatDataset): @@ -309,9 +206,11 @@ def __init__(self, shift_input_data=shift_input_data, n_prediction_steps=n_prediction_steps) if X_test is not None: - X_test, sequence_lengths_tests, Y_test = self.validator.transform(X_test, Y_test, + X_test, self.sequence_lengths_tests, Y_test = self.validator.transform(X_test, Y_test, shift_input_data=shift_input_data, n_prediction_steps=n_prediction_steps) + else: + self.sequence_lengths_tests = None self.shuffle = shuffle self.rand = np.random.RandomState(seed=seed) @@ -325,7 +224,7 @@ def __init__(self, self.val_transform = val_transforms self.num_sequences = len(X) - self.sequence_lengths = sequence_lengths + self.sequence_lengths_train = sequence_lengths if dataset_name is None: self.dataset_name = hash_array_or_matrix(X) @@ -344,47 +243,23 @@ def __init__(self, self.y_train_std = 1 # initialize datasets - sequences_kwargs = {"resampling_strategy": resampling_strategy, - "resampling_strategy_args": resampling_strategy_args, - "train_transforms": self.train_transform, + sequences_kwargs = {"train_transforms": self.train_transform, "val_transforms": self.val_transform, "n_prediction_steps": n_prediction_steps} - idx_start_train = 0 - idx_start_test = 0 - sequence_datasets = [] - if X_test is None or Y_test is None: - for seq_idx, seq_length_train in enumerate(self.sequence_lengths): - idx_end_train = idx_start_train + seq_length_train - sequence = TimeSeriesSequence(X=X[idx_start_train: idx_end_train], - Y=Y[idx_start_train: idx_end_train], - dataset_name=dataset_name_seqs[seq_idx], - seed=self.rand.randint(0, 2**20), - **sequences_kwargs) - sequence_datasets.append(sequence) - idx_start_train = idx_end_train - else: - for seq_idx, (seq_length_train, seq_length_test) in enumerate(zip(self.sequence_lengths, - sequence_lengths_tests)): - idx_end_train = idx_start_train + seq_length_train - idx_end_test = idx_start_test + seq_length_test - sequence = TimeSeriesSequence(X=X[idx_start_train: idx_end_train], - Y=Y[idx_start_train: idx_end_train], - X_test=X_test[idx_start_test: idx_end_test], - Y_test=Y_test[idx_start_test: idx_end_test], - dataset_name=dataset_name_seqs[seq_idx], - seed=self.rand.randint(0, 2**20), - **sequences_kwargs) - sequence_datasets.append(sequence) - idx_start_train = idx_end_train + sequence_datasets = self.make_sequences_datasets(X=X, Y=Y, + sequence_lengths_train=self.sequence_lengths_train, + X_test=X_test, Y_test=Y_test, + sequence_lengths_tests=self.sequence_lengths_tests, + **sequences_kwargs) ConcatDataset.__init__(self, datasets=sequence_datasets) - self.seq_length_min = np.min(self.sequence_lengths) + self.seq_length_min = np.min(self.sequence_lengths_train) - self.train_tensors = (X, Y) + self.train_tensors = [X, Y] if X_test is not None or Y_test is not None: - 
self.test_tensors = (X_test, Y_test) + self.test_tensors = [X_test, Y_test] else: self.test_tensors = None self.val_tensors = None @@ -407,7 +282,7 @@ def __init__(self, self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1 # TODO: Look for a criteria to define small enough to preprocess - self.is_small_preprocess = False + self.is_small_preprocess = True self.task_type = TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING] @@ -443,6 +318,84 @@ def __getitem__(self, idx, train=True): sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] return self.datasets[dataset_idx].__getitem__(sample_idx, train) + def make_sequences_datasets(self, + X: np.ndarray, + Y: np.ndarray, + sequence_lengths_train: List[int], + X_test : Optional[np.ndarray] = None, + Y_test: Optional[np.ndarray] = None, + sequence_lengths_tests: Optional[List[int]] = None, + **sequences_kwargs: Optional[Dict]) -> List[TimeSeriesSequence]: + """ + build a series time seequences datasets + Args: + X: np.ndarray (N_all, N_feature) + flattened train feature array with size N_all (the sum of all the series sequences) and N_feature, + number of features + Y: np.ndarray (N_all, N_target) + flattened train target array with size N_all (the sum of all the series sequences) and number of targets + sequence_lengths_train: List[int] + a list containing all the sequences length in the training set + X_test: Optional[np.ndarray (N_all_test, N_feature)] + flattened test feature array with size N_all_test (the sum of all the series sequences) and N_feature, + number of features + Y_test: np.ndarray (N_all_test, N_target) + flattened test target array with size N_all (the sum of all the series sequences) and number of targets + sequence_lengths_test: Optional[List[int]] + a list containing all the sequences length in the test set + sequences_kwargs: Dict + additional arguments for test sets + Returns: + sequence_datasets : List[TimeSeriesSequence] + a + + + """ + sequence_datasets = [] + idx_start_train = 0 + if X_test is None or Y_test is None: + for seq_idx, seq_length_train in enumerate(sequence_lengths_train): + idx_end_train = idx_start_train + seq_length_train + sequence = TimeSeriesSequence(X=X[idx_start_train: idx_end_train], + Y=Y[idx_start_train: idx_end_train], + **sequences_kwargs) + sequence_datasets.append(sequence) + idx_start_train = idx_end_train + else: + idx_start_test = 0 + for seq_idx, (seq_length_train, seq_length_test) in enumerate(zip(sequence_lengths_train, + sequence_lengths_tests)): + idx_end_train = idx_start_train + seq_length_train + idx_end_test = idx_start_test + seq_length_test + sequence = TimeSeriesSequence(X=X[idx_start_train: idx_end_train], + Y=Y[idx_start_train: idx_end_train], + X_test=X_test[idx_start_test: idx_end_test], + Y_test=Y_test[idx_start_test: idx_end_test], + **sequences_kwargs) + sequence_datasets.append(sequence) + idx_start_train = idx_end_train + return sequence_datasets + + def replace_data(self, X_train: BASE_DATASET_INPUT, X_test: Optional[BASE_DATASET_INPUT]) -> 'BaseDataset': + super(TimeSeriesForecastingDataset, self).replace_data(X_train=X_train, X_test=X_test) + self.update_tensros_seqs(X_train, self.sequence_lengths_train, is_train=True) + if X_test is not None: + self.update_tensros_seqs(X_test, self.sequence_lengths_tests, is_train=False) + return self + + def update_tensros_seqs(self, X, sequence_lengths, is_train=True): + idx_start = 0 + if is_train: + for seq, seq_length in zip(self.datasets, sequence_lengths): + idx_end = idx_start + 
seq_length + seq.train_tensors = (X[idx_start: idx_end], seq.train_tensors[1]) + idx_start = idx_end + else: + for seq, seq_length in zip(self.datasets, sequence_lengths): + idx_end = idx_start + seq_length + seq.test_tensors = (X[idx_start: idx_end], seq.test_tensors[1]) + idx_start = idx_end + def update_transform(self, transform: Optional[torchvision.transforms.Compose], train: bool = True, ) -> 'BaseDataset': @@ -488,9 +441,9 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] val_share=val_share)) if self.val_tensors is not None: - upper_window_size = np.min(self.sequence_lengths) - self.n_prediction_steps + upper_window_size = np.min(self.sequence_lengths_train) - self.n_prediction_steps else: - upper_window_size = int(np.min(self.sequence_lengths) * 1 - val_share) - self.n_prediction_steps + upper_window_size = int(np.min(self.sequence_lengths_train) * 1 - val_share) - self.n_prediction_steps elif isinstance(self.resampling_strategy, CrossValTypes): num_splits = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( @@ -502,7 +455,7 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] cross_val_type=self.resampling_strategy, num_splits=cast(int, num_splits), )) - upper_window_size = (np.min(self.sequence_lengths) // num_splits) - self.n_prediction_steps + upper_window_size = (np.min(self.sequence_lengths_train) // num_splits) - self.n_prediction_steps else: raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") @@ -562,10 +515,10 @@ def create_cross_val_splits( idx_all = self._get_indices() for idx_seq, dataset in enumerate(self.datasets): - split = self.cross_validators[cross_val_type.name](num_splits, indices=dataset._get_indices(), **kwargs) + split = self.cross_validators[cross_val_type.name](num_splits, indices=np.arange(len(dataset)), **kwargs) for idx_split in range(num_splits): splits[idx_split][idx_seq] = idx_start + split[idx_split] - idx_start += self.sequence_lengths[idx_seq] + idx_start += self.sequence_lengths_train[idx_seq] # in this case, splits is stored as : # [ first split, second_split ...] # first_split = [([0], [1]), ([2], [3])] .... @@ -608,11 +561,11 @@ def create_holdout_val_split( idx_start = 0 for idx_seq, dataset in enumerate(self.datasets): split = self.holdout_validators[holdout_val_type.name](holdout_val_type, - indices=dataset._get_indices(), + indices=np.arange(len(dataset)), **kwargs) for idx_split in range(2): splits[idx_split][idx_seq] = idx_start + split[idx_split] - idx_start += self.sequence_lengths[idx_seq] + idx_start += self.sequence_lengths_train[idx_seq] train_indices = np.hstack([sp for sp in splits[0]]) test_indices = np.hstack([sp for sp in splits[1]]) diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py index e0e9bedd5..ee54d2ee8 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py @@ -74,8 +74,8 @@ def __call__(self, X: Union[np.ndarray, torch.tensor]) -> Union[np.ndarray, torc raise ValueError("cant call {} without fitting the column transformer first." .format(self.__class__.__name__)) - if len(X.shape) == 2: - # expand batch dimension when called on a single record - X = X[np.newaxis, ...] 
+ #if len(X.shape) == 2: + # # expand batch dimension when called on a single record + # X = X[np.newaxis, ...] return self.preprocessor.transform(X) diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index 2ae09303d..d1740c94b 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -5,6 +5,7 @@ import sklearn from sklearn.base import BaseEstimator +import gluonts # Similar to / inspired by # https://github.com/tslearn-team/tslearn/blob/a3cf3bf/tslearn/preprocessing/preprocessing.py @@ -28,24 +29,36 @@ def transform(self, X: np.ndarray) -> np.ndarray: accept_large_sparse=False ) + if self.mode == "standard": - mean_ = np.mean(X, axis=1, keepdims=True) - std_ = np.std(X, axis=1, keepdims=True) - std_[std_ == 0.0] = 1.0 + #mean_ = np.mean(X, axis=1, keepdims=True) + #std_ = np.std(X, axis=1, keepdims=True) + #std_[std_ == 0.0] = 1.0 + + mean_ = np.mean(X) + std_ = np.std(X) + if std_ == 0.0: + std_ = 1.0 return (X - mean_) / std_ elif self.mode == "min_max": - min_ = np.min(X, axis=1, keepdims=True) - max_ = np.max(X, axis=1, keepdims=True) + #min_ = np.min(X, axis=1, keepdims=True) + #max_ = np.max(X, axis=1, keepdims=True) + min_ = np.min(X) + max_ = np.max(X) + diff_ = max_ - min_ diff_[diff_ == 0.0] = 1.0 return (X - min_) / diff_ elif self.mode == "max_abs": - max_abs_ = np.max(np.abs(X), axis=1, keepdims=True) - max_abs_[max_abs_ == 0.0] = 1.0 + #max_abs_ = np.max(np.abs(X), axis=1, keepdims=True) + #max_abs_[max_abs_ == 0.0] = 1.0 + max_abs_ = np.max(np.abs(X)) + if max_abs_ == 0.0: + max_abs_ = 1.0 return X / max_abs_ diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 78fcab561..4baf92a62 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -147,6 +147,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: # Overwrite the datamanager with the pre-processes data datamanager.replace_data(X['X_train'], X['X_test'] if 'X_test' in X else None) + self.n_prediction_steps = datamanager.n_prediction_steps train_dataset, val_dataset = datamanager.get_dataset_for_training(split_id=X['split_id']) @@ -156,7 +157,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: # to allow a time sequence data with resolution self.sample_interval and windows size with self.window_size # we need to drop the first part of each sequence - for seq_idx, seq_length in enumerate(datamanager.sequence_lengths): + for seq_idx, seq_length in enumerate(datamanager.sequence_lengths_train): idx_end = idx_start + seq_length full_sequence = np.arange(idx_start, idx_end)[self.subseq_length:] valid_indices.append(full_sequence) @@ -228,8 +229,7 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd """ Creates a data loader object from the provided data, applying the transformations meant to validation objects - This is a lazy loaded test set, each time only one piece of series data is passed to the dataloader and we - expand the sampling indices inside 
this function, + This is a lazy loaded test set, each time only one piece of series """ # TODO any better way to deal with prediction data loader for multiple sequences if isinstance(X, np.ndarray): @@ -315,12 +315,12 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, window_size = UniformIntegerHyperparameter("window_size", lower=1, upper=upper_window_size, - default_value=(upper_window_size + 1)// 2) + default_value=upper_window_size) elif window_size[0][0] <= upper_window_size < window_size[0][1]: window_size = UniformIntegerHyperparameter("window_size", lower=window_size[0][0], upper=upper_window_size, - default_value=(window_size[0][0] + upper_window_size) // 2) + default_value=upper_window_size) else: window_size = UniformIntegerHyperparameter("window_size", lower=window_size[0][0], diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index f4a39f3d0..fb00f05d4 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -168,8 +168,8 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L steps.extend([ ("scaler", ScalerChoice(default_dataset_properties)), - ("preprocessing", EarlyPreprocessing()), ("time_series_transformer", TimeSeriesTransformer()), + ("preprocessing", EarlyPreprocessing()), ("network_backbone", NetworkBackboneChoice(default_dataset_properties)), ("network_head", NetworkHeadChoice(default_dataset_properties)), ("network", NetworkComponent()), From acb2f5048f23a510fd0302ca29c3efdd10f47475 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 26 Oct 2021 18:28:48 +0200 Subject: [PATCH 058/347] normalize each sequence individually --- autoPyTorch/api/time_series_forecasting.py | 43 +++- autoPyTorch/datasets/time_series_dataset.py | 217 +++++++++++------- .../TimeSeriesTransformer.py | 6 +- .../scaling/MaxAbsScaler.py | 4 +- .../scaling/MinMaxScaler.py | 6 +- .../scaling/NoScaler.py | 5 +- .../scaling/StandardScaler.py | 5 +- .../scaling/utils.py | 69 ++++-- .../time_series_forecasting_data_loader.py | 28 ++- .../pipeline/time_series_forecasting.py | 3 +- 10 files changed, 258 insertions(+), 128 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 0996b14d8..60a0f346d 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -117,7 +117,7 @@ def search( y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - #target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, + target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, n_prediction_steps: int = 1, freq: Optional[Union[str, int, List[int]]] = None, dataset_name: Optional[str] = None, @@ -147,6 +147,8 @@ def search( A pair of features (X_train) and targets (y_train) used to fit a pipeline. Additionally, a holdout of this pairs (X_test, y_test) can be provided to track the generalization performance of each stage. 
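For instance, three univariate training sequences of different lengths could be passed in the list-of-arrays layout described above. This is only an illustrative sketch; the names and shapes below are not part of the patch.

    import numpy as np

    # one feature array per sequence; lengths may differ across sequences
    X_train = [np.random.randn(100, 1), np.random.randn(80, 1), np.random.randn(120, 1)]
    # one target series per sequence, aligned with the corresponding feature array
    y_train = [x[:, 0] for x in X_train]
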
+ target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, + (used for multi-variable prediction), indicates which value needs to be predicted n_prediction_steps: int How many steps in advance we need to predict freq: Optional[Union[str, int, List[int]]] @@ -216,6 +218,8 @@ def search( # we have to create a logger for at this point for the validator self._logger = self._get_logger(dataset_name) + self.target_variables = target_variables + # Create a validator object to make sure that the data provided by # the user matches the autopytorch requirements self.InputValidator = TimeSeriesForecastingInputValidator( @@ -240,6 +244,8 @@ def search( normalize_y=normalize_y, ) + self.normalize_y = normalize_y + if self.dataset.freq is not None or not self.customized_window_size: base_window_size = int(np.ceil(self.dataset.freq)) # we don't want base window size to large, which might cause a too long computation time, in which case @@ -288,14 +294,39 @@ def predict( self, X_test: List[np.ndarray], batch_size: Optional[int] = None, - n_jobs: int = 1 + n_jobs: int = 1, + y_train:Optional[List[np.ndarray]]=None, + target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, ) -> np.ndarray: + """ + target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, + (used for multi-variable prediction), indicates which value needs to be predicted + """ y_pred = np.ones([len(X_test), self.dataset.n_prediction_steps]) - y_train_mean = self.dataset.y_train_mean - y_train_std = self.dataset.y_train_std for seq_idx, seq in enumerate(X_test): - seq_pred = super(TimeSeriesForecastingTask, self).predict(seq, batch_size, n_jobs).flatten() - seq_pred = seq_pred * y_train_std + y_train_mean + if self.normalize_y: + if pd.DataFrame(seq).shape[-1] > 1: + if target_variables is None and y_train is None: + raise ValueError('For multi-variant prediction task, either target_variables or y_train needs to ' + 'be provided!') + if y_train is None: + y_train = seq[target_variables] + else: + y_train = seq + if self.dataset.shift_input_data: + # if input data is shifted, we must compute the mean and standard deviation with the shifted data. 
+ # This is helpful when the + mean_seq = np.mean(y_train[self.dataset.n_prediction_steps]) + std_seq = np.std(y_train[self.dataset.n_prediction_steps]) + else: + mean_seq = np.mean(y_train) + std_seq = np.std(y_train) + + seq_pred = super(TimeSeriesForecastingTask, self).predict(seq, batch_size, n_jobs).flatten() + + seq_pred = seq_pred * mean_seq + std_seq + else: + seq_pred = super(TimeSeriesForecastingTask, self).predict(seq, batch_size, n_jobs).flatten() y_pred[seq_idx] = seq_pred return y_pred diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 717059687..b633991a8 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -37,21 +37,21 @@ from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator from autoPyTorch.utils.common import FitRequirement, hash_array_or_matrix -#TIME_SERIES_FORECASTING_INPUT = Tuple[np.ndarray, np.ndarray] # currently only numpy arrays are supported -#TIME_SERIES_REGRESSION_INPUT = Tuple[np.ndarray, np.ndarray] -#TIME_SERIES_CLASSIFICATION_INPUT = Tuple[np.ndarray, np.ndarray] +# TIME_SERIES_FORECASTING_INPUT = Tuple[np.ndarray, np.ndarray] # currently only numpy arrays are supported +# TIME_SERIES_REGRESSION_INPUT = Tuple[np.ndarray, np.ndarray] +# TIME_SERIES_CLASSIFICATION_INPUT = Tuple[np.ndarray, np.ndarray] # seasonality map, maps a frequency value to a number SEASONALITY_MAP = { - "minutely": [1440, 10080, 525960], - "10_minutes": [144, 1008, 52596], - "half_hourly": [48, 336, 17532], - "hourly": [24, 168, 8766], - "daily": 7, - "weekly": 365.25/7, - "monthly": 12, - "quarterly": 4, - "yearly": 1 + "minutely": [1440, 10080, 525960], + "10_minutes": [144, 1008, 52596], + "half_hourly": [48, 336, 17532], + "hourly": [24, 168, 8766], + "daily": 7, + "weekly": 365.25 / 7, + "monthly": 12, + "quarterly": 4, + "yearly": 1 } MAX_WIDNOW_SIZE_BASE = 500 @@ -75,13 +75,16 @@ def __init__(self, val_transforms: n_prediction_steps: int, how many steps need to be predicted in advance """ - train_tensors = [X, Y] - test_tensors = [X_test, Y_test] self.n_prediction_steps = n_prediction_steps - self.train_tensors = train_tensors - self.val_tensors = None - self.test_tensors = test_tensors + self.X = X + self.Y = Y + + self.X_val = None + self.Y_val = None + + self.X_test = X_test + self.Y_tet = Y_test # We also need to be able to transform the data, be it for pre-processing # or for augmentation @@ -100,13 +103,13 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]: Returns: A transformed single point prediction """ - if index < 0 : + if index < 0: index = self.__len__() + 1 - index - if hasattr(self.train_tensors[0], 'loc'): - X = self.train_tensors[0].iloc[:index + 1] + if hasattr(self.X, 'loc'): + X = self.X.iloc[:index + 1] else: - X = self.train_tensors[0][:index + 1] + X = self.X[:index + 1] if self.train_transform is not None and train: X = self.train_transform(X) @@ -114,7 +117,7 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]: X = self.val_transform(X) # In case of prediction, the targets are not provided - Y = self.train_tensors[1] + Y = self.Y if Y is not None: # Y = Y[:index + self.n_prediction_steps] Y = Y[index] @@ -124,7 +127,7 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]: return X, Y def __len__(self) -> int: - return self.train_tensors[0].shape[0] + return self.X.shape[0] def update_transform(self, 
transform: Optional[torchvision.transforms.Compose], train: bool = True, @@ -162,7 +165,8 @@ def __init__(self, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, freq: Optional[Union[str, int, List[int]]] = None, dataset_name: Optional[str] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.time_series_hold_out_validation, + resampling_strategy: Union[ + CrossValTypes, HoldoutValTypes] = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, shuffle: Optional[bool] = True, seed: Optional[int] = 42, @@ -187,14 +191,14 @@ def __init__(self, self.n_prediction_steps = n_prediction_steps if validator is None: validator = TimeSeriesForecastingInputValidator(is_classification=False) - self.validator : TimeSeriesForecastingInputValidator = validator + self.validator: TimeSeriesForecastingInputValidator = validator if not isinstance(validator, TimeSeriesForecastingInputValidator): raise ValueError(f"This dataset only support TimeSeriesForecastingInputValidator " f"but receive {type(validator)}") if not self.validator._is_fitted: - self.validator.fit(X_train=X, y_train=Y, X_test=X_test, y_test=Y_test,) + self.validator.fit(X_train=X, y_train=Y, X_test=X_test, y_test=Y_test, ) self.numerical_columns = self.validator.feature_validator.numerical_columns self.categorical_columns = self.validator.feature_validator.categorical_columns @@ -202,13 +206,15 @@ def __init__(self, self.num_features = self.validator.feature_validator.num_features # type: int self.num_target = self.validator.target_validator.out_dimensionality # type: int + self.shift_input_data = shift_input_data + X, sequence_lengths, Y = self.validator.transform(X, Y, shift_input_data=shift_input_data, n_prediction_steps=n_prediction_steps) if X_test is not None: X_test, self.sequence_lengths_tests, Y_test = self.validator.transform(X_test, Y_test, - shift_input_data=shift_input_data, - n_prediction_steps=n_prediction_steps) + shift_input_data=shift_input_data, + n_prediction_steps=n_prediction_steps) else: self.sequence_lengths_tests = None @@ -232,36 +238,30 @@ def __init__(self, self.dataset_name = dataset_name dataset_name_seqs = [f"{dataset_name}_sequence_{i}" for i in range(self.num_sequences)] - if normalize_y: - self.y_train_mean = np.mean(Y) - self.y_train_std = np.std(Y) - Y = (Y - self.y_train_mean) / self.y_train_std - if Y_test is not None: - Y_test = (Y_test - self.y_train_mean) / self.y_train_std - else: - self.y_train_mean = 0 - self.y_train_std = 1 - # initialize datasets sequences_kwargs = {"train_transforms": self.train_transform, "val_transforms": self.val_transform, "n_prediction_steps": n_prediction_steps} - sequence_datasets = self.make_sequences_datasets(X=X, Y=Y, - sequence_lengths_train=self.sequence_lengths_train, - X_test=X_test, Y_test=Y_test, - sequence_lengths_tests=self.sequence_lengths_tests, - **sequences_kwargs) + self.y_train_mean = [0] * len(self.sequence_lengths_train) + self.y_train_std = [1] * len(self.sequence_lengths_train) + + sequence_datasets, train_tensors, test_tensors = self.make_sequences_datasets(X=X, Y=Y, + sequence_lengths_train=self.sequence_lengths_train, + X_test=X_test, Y_test=Y_test, + sequence_lengths_tests=self.sequence_lengths_tests, + normalize_y=normalize_y, + **sequences_kwargs) + + self.normalize_y = normalize_y ConcatDataset.__init__(self, datasets=sequence_datasets) self.seq_length_min = np.min(self.sequence_lengths_train) - self.train_tensors = [X, Y] - if X_test is not None or 
Y_test is not None: - self.test_tensors = [X_test, Y_test] - else: - self.test_tensors = None + self.train_tensors = train_tensors + + self.test_tensors = test_tensors self.val_tensors = None self.task_type: Optional[str] = None @@ -270,16 +270,16 @@ def __init__(self, self.input_shape: Tuple[int] = (self.seq_length_min, self.num_features) if len(self.train_tensors) == 2 and self.train_tensors[1] is not None: - self.output_type: str = type_of_target(self.train_tensors[1]) + self.output_type: str = type_of_target(self.train_tensors[1][0]) if self.output_type in ["binary", "multiclass"]: self.output_type = "continuous" if STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS: - self.output_shape = len(np.unique(self.train_tensors[1])) + self.output_shape = len(np.unique(Y)) else: # self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1 - self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1 + self.output_shape = X.shape[-1] if X.ndim > 1 else 1 # TODO: Look for a criteria to define small enough to preprocess self.is_small_preprocess = True @@ -322,10 +322,12 @@ def make_sequences_datasets(self, X: np.ndarray, Y: np.ndarray, sequence_lengths_train: List[int], - X_test : Optional[np.ndarray] = None, + X_test: Optional[np.ndarray] = None, Y_test: Optional[np.ndarray] = None, sequence_lengths_tests: Optional[List[int]] = None, - **sequences_kwargs: Optional[Dict]) -> List[TimeSeriesSequence]: + normalize_y: bool = True, + **sequences_kwargs: Optional[Dict]) -> \ + Tuple[List[TimeSeriesSequence], Tuple[List, List], Tuple[List, List]]: """ build a series time seequences datasets Args: @@ -343,38 +345,82 @@ def make_sequences_datasets(self, flattened test target array with size N_all (the sum of all the series sequences) and number of targets sequence_lengths_test: Optional[List[int]] a list containing all the sequences length in the test set + normalize_y: bool + if we want to normalize target vaues (normalization is conducted w.r.t. 
each sequence) sequences_kwargs: Dict additional arguments for test sets Returns: sequence_datasets : List[TimeSeriesSequence] a - + train_tensors: Tuple[List[np.ndarray], List[np.ndarray]] + training tensors + test_tensors: Option[Tuple List[np.ndarray, List[np.ndarray]] + test tensors """ sequence_datasets = [] idx_start_train = 0 + idx_start_test = 0 + + X_seq_all = [] + Y_seq_all = [] + + X_test_seq_all = [] + Y_test_seq_all = [] + + for seq_idx, seq_length_train in enumerate(sequence_lengths_train): + idx_end_train = idx_start_train + seq_length_train + + X_seq = X[idx_start_train: idx_end_train] + Y_seq = Y[idx_start_train: idx_end_train] + + if normalize_y: + Y_seq_mean = np.mean(Y_seq) + Y_seq_std = np.std(Y_seq) + Y_seq = (Y_seq - Y_seq_mean) / Y_seq_std + + Y[idx_start_train: idx_end_train] = Y_seq + + if X_test is not None and Y_test is not None: + seq_length_test = sequence_lengths_tests[seq_idx] + idx_end_test = idx_start_test + seq_length_test + + X_test_seq = X_test[idx_start_test: idx_end_test] + Y_test_seq = Y_test[idx_start_test: idx_end_test] + + if normalize_y: + Y_test_seq_mean = np.mean(Y_test_seq) + Y_test_seq_std = np.std(Y_test_seq) + Y_seq = (Y_seq - Y_test_seq_mean) / Y_test_seq_std + + Y_test[idx_start_test: idx_end_test] = Y_seq + else: + X_test_seq = None + Y_test_seq = None + + + sequence = TimeSeriesSequence(X=X_seq, + Y=Y_seq, + X_test=X_test_seq, + Y_test=Y_test_seq, + **sequences_kwargs) + sequence_datasets.append(sequence) + idx_start_train = idx_end_train + + #X_seq_all.append(X_seq) + #Y_seq_all.append(Y_seq) + + #X_test_seq_all.append(X_test_seq) + #Y_test_seq_all.append(Y_test_seq) + #train_tensors = (X_seq_all, Y_seq_all) + train_tensors = (X, Y) if X_test is None or Y_test is None: - for seq_idx, seq_length_train in enumerate(sequence_lengths_train): - idx_end_train = idx_start_train + seq_length_train - sequence = TimeSeriesSequence(X=X[idx_start_train: idx_end_train], - Y=Y[idx_start_train: idx_end_train], - **sequences_kwargs) - sequence_datasets.append(sequence) - idx_start_train = idx_end_train + test_tensors = None else: - idx_start_test = 0 - for seq_idx, (seq_length_train, seq_length_test) in enumerate(zip(sequence_lengths_train, - sequence_lengths_tests)): - idx_end_train = idx_start_train + seq_length_train - idx_end_test = idx_start_test + seq_length_test - sequence = TimeSeriesSequence(X=X[idx_start_train: idx_end_train], - Y=Y[idx_start_train: idx_end_train], - X_test=X_test[idx_start_test: idx_end_test], - Y_test=Y_test[idx_start_test: idx_end_test], - **sequences_kwargs) - sequence_datasets.append(sequence) - idx_start_train = idx_end_train - return sequence_datasets + #test_tensors = (X_test_seq_all, Y_test_seq_all) + test_tensors = (X_test, Y_test) + + return sequence_datasets, train_tensors, test_tensors def replace_data(self, X_train: BASE_DATASET_INPUT, X_test: Optional[BASE_DATASET_INPUT]) -> 'BaseDataset': super(TimeSeriesForecastingDataset, self).replace_data(X_train=X_train, X_test=X_test) @@ -388,12 +434,12 @@ def update_tensros_seqs(self, X, sequence_lengths, is_train=True): if is_train: for seq, seq_length in zip(self.datasets, sequence_lengths): idx_end = idx_start + seq_length - seq.train_tensors = (X[idx_start: idx_end], seq.train_tensors[1]) + seq.X = X[idx_start: idx_end] idx_start = idx_end else: for seq, seq_length in zip(self.datasets, sequence_lengths): idx_end = idx_start + seq_length - seq.test_tensors = (X[idx_start: idx_end], seq.test_tensors[1]) + seq.X_test = X[idx_start: idx_end] idx_start = idx_end def 
update_transform(self, transform: Optional[torchvision.transforms.Compose], @@ -452,8 +498,8 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] num_splits = self.resampling_strategy_args.get('num_splits', num_splits) # Create the split if it was not created before splits.extend(self.create_cross_val_splits( - cross_val_type=self.resampling_strategy, - num_splits=cast(int, num_splits), + cross_val_type=self.resampling_strategy, + num_splits=cast(int, num_splits), )) upper_window_size = (np.min(self.sequence_lengths_train) // num_splits) - self.n_prediction_steps else: @@ -479,13 +525,14 @@ def get_required_dataset_info(self) -> Dict[str, Any]: def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> Dict[str, Any]: dataset_properties = super().get_dataset_properties(dataset_requirements=dataset_requirements) - dataset_properties.update({'upper_window_size': self.upper_window_size}) + dataset_properties.update({'upper_window_size': self.upper_window_size, + 'sequence_lengths_train': self.sequence_lengths_train}) return dataset_properties def create_cross_val_splits( - self, - cross_val_type: CrossValTypes, - num_splits: int + self, + cross_val_type: CrossValTypes, + num_splits: int ) -> List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]: """ This function creates the cross validation split for the given task. @@ -531,9 +578,9 @@ def create_cross_val_splits( return splits_merged def create_holdout_val_split( - self, - holdout_val_type: HoldoutValTypes, - val_share: float, + self, + holdout_val_type: HoldoutValTypes, + val_share: float, ) -> Tuple[np.ndarray, np.ndarray]: """ This function creates the holdout split for the given task. diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py index ee54d2ee8..f1d75e401 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py @@ -13,11 +13,11 @@ class TimeSeriesTransformer(autoPyTorchTimeSeriesPreprocessingComponent): - def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): super().__init__() self.random_state = random_state self.preprocessor: Optional[Pipeline] = None + self.is_training = True self.add_fit_requirements([ FitRequirement('numerical_features', (List,), user_defined=True, dataset_property=True), FitRequirement('categorical_features', (List,), user_defined=True, dataset_property=True)]) @@ -68,6 +68,10 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'time_series_transformer': self}) return X + def eval(self): + self.is_training = False + self.preprocessor.set_params(timeseriesscaler__is_training=False) + def __call__(self, X: Union[np.ndarray, torch.tensor]) -> Union[np.ndarray, torch.tensor]: if self.preprocessor is None: diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py index 4818e20b4..e687f97b5 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py @@ -22,7 +22,9 @@ def __init__(self, 
random_state: Optional[Union[np.random.RandomState, int]] = N def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: self.check_requirements(X, y) - self.preprocessor['numerical'] = TimeSeriesScaler(mode="max_abs") + sequence_lengths_train = X['dataset_properties']['sequence_lengths_train'] + self.preprocessor['numerical'] = TimeSeriesScaler(mode="max_abs", + sequence_lengths_train=sequence_lengths_train) return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py index 1ae908efd..2c105f616 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py @@ -19,9 +19,11 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N self.random_state = random_state def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: - self.check_requirements(X, y) - self.preprocessor["numerical"] = TimeSeriesScaler(mode="min_max") + + sequence_lengths_train = X['dataset_properties']['sequence_lengths_train'] + + self.preprocessor["numerical"] = TimeSeriesScaler(mode="min_max", sequence_lengths_train=sequence_lengths_train) return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py index 423d8aa58..c171b81d0 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py @@ -30,9 +30,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: Returns: instance of self """ - self.check_requirements(X, y) - self.preprocessor["numerical"] = TimeSeriesScaler(mode="none") + + sequence_lengths_train = X['dataset_properties']['sequence_lengths_train'] + self.preprocessor["numerical"] = TimeSeriesScaler(mode="none", sequence_lengths_train=sequence_lengths_train) return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py index 5d73b880f..3cf9bb960 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py @@ -19,10 +19,11 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N self.random_state = random_state def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: - self.check_requirements(X, y) - self.preprocessor['numerical'] = TimeSeriesScaler(mode="standard") + sequence_lengths_train = X['dataset_properties']['sequence_lengths_train'] + self.preprocessor['numerical'] = TimeSeriesScaler(mode="standard", + sequence_lengths_train=sequence_lengths_train) return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index d1740c94b..3d6a39e70 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py 
+++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -1,17 +1,18 @@ -from typing import Any +from typing import Any, List, Callable, Optional import numpy as np import sklearn from sklearn.base import BaseEstimator -import gluonts # Similar to / inspired by # https://github.com/tslearn-team/tslearn/blob/a3cf3bf/tslearn/preprocessing/preprocessing.py class TimeSeriesScaler(BaseEstimator): - def __init__(self, mode: str): + def __init__(self, mode: str, sequence_lengths_train:List[int], is_training=True): self.mode = mode + self.sequence_lengths_train = sequence_lengths_train + self.is_training = is_training def fit(self, X: np.ndarray, y: Any = None) -> "TimeSeriesScaler": """ @@ -19,7 +20,19 @@ def fit(self, X: np.ndarray, y: Any = None) -> "TimeSeriesScaler": """ return self + def eval(self): + self.is_training = False + + def scale_individual_seq(self, X, scaling: Callable): + idx_start = 0 + for seq_length_train in self.sequence_lengths_train: + idx_end = seq_length_train + idx_start + X[idx_start: idx_end] = scaling(X[idx_start: idx_end]) + idx_start = idx_end + return X + def transform(self, X: np.ndarray) -> np.ndarray: + """ X = sklearn.utils.check_array( X, force_all_finite=True, @@ -28,39 +41,55 @@ def transform(self, X: np.ndarray) -> np.ndarray: accept_sparse=False, accept_large_sparse=False ) - - + """ if self.mode == "standard": #mean_ = np.mean(X, axis=1, keepdims=True) #std_ = np.std(X, axis=1, keepdims=True) #std_[std_ == 0.0] = 1.0 - mean_ = np.mean(X) - std_ = np.std(X) - if std_ == 0.0: - std_ = 1.0 + def standard_scaling(x_seq): + mean_ = np.mean(x_seq) + std_ = np.std(x_seq) + if std_ == 0.0: + std_ = 1.0 + return (x_seq - mean_) / std_ - return (X - mean_) / std_ + if self.is_training: + return self.scale_individual_seq(X, standard_scaling) + else: + return standard_scaling(X) elif self.mode == "min_max": #min_ = np.min(X, axis=1, keepdims=True) #max_ = np.max(X, axis=1, keepdims=True) - min_ = np.min(X) - max_ = np.max(X) + def min_max_scaling(x_seq): + min_ = np.min(x_seq) + max_ = np.max(x_seq) - diff_ = max_ - min_ - diff_[diff_ == 0.0] = 1.0 + diff_ = max_ - min_ + if diff_ == 0.0: + diff_ = 1.0 + + return (x_seq - min_) / diff_ + if self.is_training: + return self.scale_individual_seq(X, min_max_scaling) + else: + return min_max_scaling(X) - return (X - min_) / diff_ elif self.mode == "max_abs": #max_abs_ = np.max(np.abs(X), axis=1, keepdims=True) #max_abs_[max_abs_ == 0.0] = 1.0 - max_abs_ = np.max(np.abs(X)) - if max_abs_ == 0.0: - max_abs_ = 1.0 - - return X / max_abs_ + def max_abs_scaling(x_seq): + max_abs_ = np.max(np.abs(x_seq)) + if max_abs_ == 0.0: + max_abs_ = 1.0 + + return x_seq / max_abs_ + if self.is_training: + return self.scale_individual_seq(X, max_abs_scaling) + else: + return max_abs_scaling(X) elif self.mode == "none": return X diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 4baf92a62..01db0cc13 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -23,6 +23,8 @@ from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence from autoPyTorch.utils.common import custom_collate_fn from 
autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ + TimeSeriesTransformer class ExpandTransformTimeSeries(object): @@ -126,7 +128,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.check_requirements(X, y) # Incorporate the transform to the dataset - datamanager = X['backend'].load_datamanager() # type: TimeSeriesForcecastingDataset + datamanager = X['backend'].load_datamanager() # type: TimeSeriesForcecastingDataset assert self.subseq_length < datamanager.seq_length_min, "dataloader's window size must be smaller than the" \ "minimal sequence length of the dataset!!" # TODO, consider bucket setting @@ -146,7 +148,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: # This parameter indicates that the data has been pre-processed for speed # Overwrite the datamanager with the pre-processes data datamanager.replace_data(X['X_train'], X['X_test'] if 'X_test' in X else None) - + self.dataset_small_preprocess = True + self.preprocess_transforms_test = X['preprocess_transforms'] + else: + self.dataset_small_preprocess = False self.n_prediction_steps = datamanager.n_prediction_steps train_dataset, val_dataset = datamanager.get_dataset_for_training(split_id=X['split_id']) @@ -166,7 +171,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: valid_indices = np.hstack([valid_idx for valid_idx in valid_indices]) _, sampler_indices_train, _ = np.intersect1d(train_split, valid_indices, return_indices=True) - # test_indices not required as testsets usually lies on the trail of hte sequence #_, sampler_indices_test, _ = np.intersect1d(test_split, valid_indices) @@ -211,14 +215,14 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform candidate_transformations = [] # type: List[Callable] + #if 'test' in mode or not X['dataset_properties']['is_small_preprocess']: + # candidate_transformations.extend(X['preprocess_transforms']) + candidate_transformations.append((SequenceBuilder(sample_interval=self.sample_interval, window_size=self.window_size, subseq_length=self.subseq_length))) candidate_transformations.append((ExpandTransformTimeSeries())) - if 'test' in mode or not X['dataset_properties']['is_small_preprocess']: - candidate_transformations.extend(X['preprocess_transforms']) - # Transform to tensor candidate_transformations.append(torch.from_numpy) @@ -235,8 +239,17 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd if isinstance(X, np.ndarray): X = X[-self.subseq_length - self.n_prediction_steps + 1:] + if self.dataset_small_preprocess: + for preprocess in self.preprocess_transforms_test: + if isinstance(preprocess, TimeSeriesTransformer): + if preprocess.is_training: + preprocess.eval() + + transform = torchvision.transforms.Compose(self.preprocess_transforms_test) + X = transform(X) + if y is not None: - # we want to make sure that X, and y can be mapped one 1 one (as sampling y requires a shifted value) + # we want to make sure that X, and y can be mapped one to one (as sampling y requires a shifted value) y = y[-self.subseq_length - self.n_prediction_steps + 1:] dataset = TimeSeriesSequence( @@ -244,7 +257,6 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd # This dataset is used for loading test data in a batched format 
train_transforms=self.test_transform, val_transforms=self.test_transform, - do_split=False, ) elif isinstance(X, TimeSeriesSequence): diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index fb00f05d4..5cad6e957 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -222,7 +222,7 @@ def _get_estimator_hyperparameter_name(self) -> str: Returns: str: name of the pipeline type """ - return "time_series_predictor" + return "time_series_forecasting" def predict(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: """Predict the output using the selected model. @@ -242,5 +242,6 @@ def predict(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray warnings.warn("Batch size not provided. " "Will predict on the whole data in a single iteration") batch_size = X.shape[0] + loader = self.named_steps['data_loader'].get_loader(X=X, batch_size=batch_size) return self.named_steps['network'].predict(loader) From b236ca90bef166905f68b023e9fc8ed2e9713dc2 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 27 Oct 2021 13:29:30 +0200 Subject: [PATCH 059/347] bug fixed --- ...time_series_forecasting_train_evaluator.py | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 7324ea83d..1a97cea98 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -15,14 +15,6 @@ from smac.tae import StatusType -from autoPyTorch.constants import ( - CLASSIFICATION_TASKS, - MULTICLASSMULTIOUTPUT, -) -from autoPyTorch.evaluation.abstract_evaluator import ( - AbstractEvaluator, - fit_and_suppress_warnings -) from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.backend import Backend from autoPyTorch.utils.common import subsampler @@ -134,7 +126,7 @@ def fit_predict_and_loss(self) -> None: # However, this makes the shape unaligned with the shape of "self.Y_optimization" # TODO consider fixed this under data loader (use pipline to do a preprocessing) y_test_split = np.repeat(test_split, self.n_prediction_steps) - \ - np.tile(np.arange(self.n_prediction_steps), len(test_split)) + np.tile(np.arange(self.n_prediction_steps)[::-1], len(test_split)) self.Y_optimization = self.y_train[y_test_split] #self.Y_actual_train = self.y_train[train_split] @@ -201,7 +193,7 @@ def fit_predict_and_loss(self) -> None: #self.Y_train_targets[train_split] = self.y_train[train_split] y_test_split = np.repeat(test_split, self.n_prediction_steps) - \ - np.tile(np.arange(self.n_prediction_steps), len(test_split)) + np.tile(np.arange(self.n_prediction_steps)[::-1], len(test_split)) self.Y_targets[i] = self.y_train[y_test_split] # Compute train loss of this fold and store it. 
train_loss could @@ -301,15 +293,15 @@ def fit_predict_and_loss(self) -> None: ) def _predict(self, pipeline: BaseEstimator, + train_indices: Union[np.ndarray, List], test_indices: Union[np.ndarray, List], - train_indices: Union[np.ndarray, List] ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: datamanager = self.datamanager y_pred = np.ones([len(test_indices), self.n_prediction_steps]) for seq_idx, test_idx in enumerate(test_indices): - import pdb y_pred[seq_idx] = self.predict_function(self.datamanager[test_idx][0], pipeline).flatten() + #train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, # self.y_train[train_indices]) opt_pred = y_pred.flatten() @@ -318,7 +310,7 @@ def _predict(self, pipeline: BaseEstimator, if self.X_valid is not None: valid_pred = np.ones([len(test_indices), self.n_prediction_steps]) for seq_idx, val_seq in enumerate(self.datamanager.datasets): - valid_pred[seq_idx] = self.predict_function(val_seq.val_tensors[0], pipeline).flatten() + valid_pred[seq_idx] = self.predict_function(val_seq.X, pipeline).flatten() valid_pred = valid_pred.flatten() @@ -328,7 +320,7 @@ def _predict(self, pipeline: BaseEstimator, if self.X_test is not None: test_pred = np.ones([len(test_indices), self.n_prediction_steps]) for seq_idx, test_seq in enumerate(self.datamanager.datasets): - test_pred[seq_idx] = self.predict_function(val_seq.test_seq[0], pipeline) + test_pred[seq_idx] = self.predict_function(test_seq.X, pipeline) test_pred = test_pred.flatten() else: From 19b4c79d17823e8064c860a1040adb016c0c3c53 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 27 Oct 2021 13:33:48 +0200 Subject: [PATCH 060/347] maint --- autoPyTorch/api/time_series_forecasting.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 60a0f346d..9995f6e85 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -244,8 +244,6 @@ def search( normalize_y=normalize_y, ) - self.normalize_y = normalize_y - if self.dataset.freq is not None or not self.customized_window_size: base_window_size = int(np.ceil(self.dataset.freq)) # we don't want base window size to large, which might cause a too long computation time, in which case @@ -304,7 +302,7 @@ def predict( """ y_pred = np.ones([len(X_test), self.dataset.n_prediction_steps]) for seq_idx, seq in enumerate(X_test): - if self.normalize_y: + if self.dataset.normalize_y: if pd.DataFrame(seq).shape[-1] > 1: if target_variables is None and y_train is None: raise ValueError('For multi-variant prediction task, either target_variables or y_train needs to ' @@ -316,8 +314,8 @@ def predict( if self.dataset.shift_input_data: # if input data is shifted, we must compute the mean and standard deviation with the shifted data. 
# This is helpful when the - mean_seq = np.mean(y_train[self.dataset.n_prediction_steps]) - std_seq = np.std(y_train[self.dataset.n_prediction_steps]) + mean_seq = np.mean(y_train[self.dataset.n_prediction_steps:]) + std_seq = np.std(y_train[self.dataset.n_prediction_steps:]) else: mean_seq = np.mean(y_train) std_seq = np.std(y_train) From bea37c9041faa4e0e1613e42f597f8793fbb23b0 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 27 Oct 2021 19:34:17 +0200 Subject: [PATCH 061/347] maint --- autoPyTorch/api/time_series_forecasting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 9995f6e85..e8e917042 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -322,7 +322,7 @@ def predict( seq_pred = super(TimeSeriesForecastingTask, self).predict(seq, batch_size, n_jobs).flatten() - seq_pred = seq_pred * mean_seq + std_seq + seq_pred = seq_pred * std_seq + mean_seq else: seq_pred = super(TimeSeriesForecastingTask, self).predict(seq, batch_size, n_jobs).flatten() y_pred[seq_idx] = seq_pred From d41b9fa4c549cde37eddf0334400fe0fb75338e2 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 19 Nov 2021 19:27:55 +0100 Subject: [PATCH 062/347] allow multi-head forecasting, new dataloader allowing limiting number of batches per epochs --- .../data/time_series_forecasting_validator.py | 10 +- autoPyTorch/datasets/time_series_dataset.py | 76 +++++++---- .../time_series_forecasting_data_loader.py | 129 ++++++++++++++++-- .../components/training/metrics/base.py | 2 +- 4 files changed, 171 insertions(+), 46 deletions(-) diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 97563fa80..e4528fe5a 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -86,7 +86,7 @@ def transform( if shift_input_data: for seq_idx in range(num_sequences): X[seq_idx] = X[seq_idx][:-n_prediction_steps] - y[seq_idx] = y[seq_idx][n_prediction_steps:] + #y[seq_idx] = y[seq_idx][n_prediction_steps:] sequence_lengths[seq_idx] = len(X[seq_idx]) else: for seq_idx in range(num_sequences): @@ -94,15 +94,19 @@ def transform( num_train_data = np.sum(sequence_lengths) + # a matrix that is concatenated by all the time series sequences X_flat = np.empty([num_train_data, num_features]) - y_flat = np.empty([num_train_data, num_targets]) + y_flat = np.empty([num_train_data + n_prediction_steps*num_sequences, num_targets]) start_idx = 0 for seq_idx, seq_length in enumerate(sequence_lengths): end_idx = start_idx + seq_length X_flat[start_idx: end_idx] = np.array(X[seq_idx]).reshape([-1, num_features]) - y_flat[start_idx: end_idx] = np.array(y[seq_idx]).reshape([-1, num_targets]) + if shift_input_data: + y_flat[start_idx+n_prediction_steps*seq_idx: end_idx + n_prediction_steps* (seq_idx +1)] = np.array(y[seq_idx]).reshape([-1, num_targets]) + else: + y_flat[start_idx: end_idx] = np.array(y[seq_idx]).reshape([-1, num_targets]) start_idx = end_idx X_transformed = self.feature_validator.transform(X_flat) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index b633991a8..ff6ff359e 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -7,6 +7,7 @@ import pandas as pd from scipy.sparse import issparse +import torch from 
torch.utils.data.dataset import Dataset, Subset, ConcatDataset import torchvision.transforms @@ -120,7 +121,10 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]: Y = self.Y if Y is not None: # Y = Y[:index + self.n_prediction_steps] - Y = Y[index] + #Y = Y[index + 1: index + self.n_prediction_steps + 1] + Y = Y[index + 1: index + self.n_prediction_steps + 1] + + Y = torch.from_numpy(Y) else: Y = None @@ -204,7 +208,7 @@ def __init__(self, self.categorical_columns = self.validator.feature_validator.categorical_columns self.num_features = self.validator.feature_validator.num_features # type: int - self.num_target = self.validator.target_validator.out_dimensionality # type: int + self.num_target = self.validator.target_validator.out_dimensionality # type: int self.shift_input_data = shift_input_data @@ -247,11 +251,9 @@ def __init__(self, self.y_train_std = [1] * len(self.sequence_lengths_train) sequence_datasets, train_tensors, test_tensors = self.make_sequences_datasets(X=X, Y=Y, - sequence_lengths_train=self.sequence_lengths_train, - X_test=X_test, Y_test=Y_test, - sequence_lengths_tests=self.sequence_lengths_tests, - normalize_y=normalize_y, - **sequences_kwargs) + X_test=X_test, Y_test=Y_test, + normalize_y=normalize_y, + **sequences_kwargs) self.normalize_y = normalize_y @@ -276,10 +278,12 @@ def __init__(self, self.output_type = "continuous" if STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS: - self.output_shape = len(np.unique(Y)) + num_target = len(np.unique(Y)) + #self.output_shape = len(np.unique(Y)) else: # self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1 - self.output_shape = X.shape[-1] if X.ndim > 1 else 1 + num_target = X.shape[-1] if X.ndim > 1 else 1 + self.output_shape = [self.n_prediction_steps, num_target] # TODO: Look for a criteria to define small enough to preprocess self.is_small_preprocess = True @@ -321,10 +325,8 @@ def __getitem__(self, idx, train=True): def make_sequences_datasets(self, X: np.ndarray, Y: np.ndarray, - sequence_lengths_train: List[int], X_test: Optional[np.ndarray] = None, Y_test: Optional[np.ndarray] = None, - sequence_lengths_tests: Optional[List[int]] = None, normalize_y: bool = True, **sequences_kwargs: Optional[Dict]) -> \ Tuple[List[TimeSeriesSequence], Tuple[List, List], Tuple[List, List]]: @@ -362,43 +364,55 @@ def make_sequences_datasets(self, idx_start_train = 0 idx_start_test = 0 - X_seq_all = [] - Y_seq_all = [] - - X_test_seq_all = [] - Y_test_seq_all = [] - - for seq_idx, seq_length_train in enumerate(sequence_lengths_train): + for seq_idx, seq_length_train in enumerate(self.sequence_lengths_train): idx_end_train = idx_start_train + seq_length_train X_seq = X[idx_start_train: idx_end_train] - Y_seq = Y[idx_start_train: idx_end_train] + if self.shift_input_data: + Y_seq = Y[idx_start_train + seq_idx * self.n_prediction_steps: + idx_end_train + (1 + seq_idx) * self.n_prediction_steps] + else: + Y_seq = Y[idx_start_train: idx_end_train] + if normalize_y: Y_seq_mean = np.mean(Y_seq) Y_seq_std = np.std(Y_seq) Y_seq = (Y_seq - Y_seq_mean) / Y_seq_std - Y[idx_start_train: idx_end_train] = Y_seq + if self.shift_input_data: + Y[idx_start_train + seq_idx * self.n_prediction_steps: + idx_end_train + (1 + seq_idx) * self.n_prediction_steps] = Y_seq + else: + Y[idx_start_train: idx_end_train] = Y_seq if X_test is not None and Y_test is not None: - seq_length_test = sequence_lengths_tests[seq_idx] + seq_length_test = self.sequence_lengths_tests[seq_idx] 
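To make the per-sequence handling in this hunk easier to follow, here is a standalone sketch (illustrative names only, unshifted case, not part of the patch) of how a flattened target array is normalized sequence by sequence; the saved statistics are what later map predictions back via pred * std + mean.

    import numpy as np

    def normalize_per_sequence(y_flat, sequence_lengths):
        # z-score each contiguous sequence of the flattened target array independently
        y_flat = y_flat.astype(float).copy()
        means, stds = [], []
        start = 0
        for length in sequence_lengths:
            end = start + length
            seq = y_flat[start:end]
            mean, std = seq.mean(), seq.std()
            if std == 0.0:  # guard against constant sequences, as the scalers do
                std = 1.0
            y_flat[start:end] = (seq - mean) / std
            means.append(mean)
            stds.append(std)
            start = end
        return y_flat, means, stds

    y = np.concatenate([np.full(4, 10.0), np.arange(6.0)])
    y_norm, means, stds = normalize_per_sequence(y, [4, 6])
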
idx_end_test = idx_start_test + seq_length_test X_test_seq = X_test[idx_start_test: idx_end_test] - Y_test_seq = Y_test[idx_start_test: idx_end_test] + if self.shift_input_data: + Y_test_seq = Y[idx_start_test + seq_idx * self.n_prediction_steps: + idx_end_test + (1 + seq_idx) * self.n_prediction_steps] + else: + Y_test_seq = Y_test[idx_start_test: idx_end_test] + if normalize_y: Y_test_seq_mean = np.mean(Y_test_seq) Y_test_seq_std = np.std(Y_test_seq) Y_seq = (Y_seq - Y_test_seq_mean) / Y_test_seq_std - Y_test[idx_start_test: idx_end_test] = Y_seq + if self.shift_input_data: + Y_test[idx_start_test + seq_idx * self.n_prediction_steps: + idx_end_test + (1 + seq_idx) * self.n_prediction_steps] = Y_seq + else: + Y_test[idx_start_test: idx_end_test] = Y_seq + else: X_test_seq = None Y_test_seq = None - sequence = TimeSeriesSequence(X=X_seq, Y=Y_seq, X_test=X_test_seq, @@ -407,17 +421,19 @@ def make_sequences_datasets(self, sequence_datasets.append(sequence) idx_start_train = idx_end_train - #X_seq_all.append(X_seq) - #Y_seq_all.append(Y_seq) + #self.sequence_lengths_train[seq_idx] = len(sequence) + + # X_seq_all.append(X_seq) + # Y_seq_all.append(Y_seq) - #X_test_seq_all.append(X_test_seq) - #Y_test_seq_all.append(Y_test_seq) - #train_tensors = (X_seq_all, Y_seq_all) + # X_test_seq_all.append(X_test_seq) + # Y_test_seq_all.append(Y_test_seq) + # train_tensors = (X_seq_all, Y_seq_all) train_tensors = (X, Y) if X_test is None or Y_test is None: test_tensors = None else: - #test_tensors = (X_test_seq_all, Y_test_seq_all) + # test_tensors = (X_test_seq_all, Y_test_seq_all) test_tensors = (X_test, Y_test) return sequence_datasets, train_tensors, test_tensors diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 01db0cc13..acf292f9c 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union, Sequence, List from torch.utils.data.sampler import SubsetRandomSampler @@ -27,6 +27,74 @@ TimeSeriesTransformer +class TimeSeriesSampler(SubsetRandomSampler): + def __init__(self, + indices: Sequence[int], + seq_lengths: Sequence[int], + num_instances_per_seqs: List[int], + min_start: int = 0, + generator: Optional[torch.Generator] = None) -> None: + """ + A sampler designed for time series sequence. For the sake of efficiency, it will not sample each possible + sequences from indices. Instead, it samples 'num_instances_per_seqs' for each sequence. This sampler samples + the instances in a Latin-Hypercube likewise way: we divide each sequence in to num_instances_per_seqs interval + and randomly sample one instance from each interval. 
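As a concrete illustration of that interval scheme, the standalone numpy sketch below (arbitrary toy values, not part of the patch) splits the index range of a single sequence into equally sized intervals and draws one index uniformly from each interval, which is what the sampler does for every training sequence.

    import numpy as np

    rng = np.random.default_rng(0)
    seq_start, seq_length = 20, 12       # this sequence owns indices 20..31
    num_instances = 4                    # instances to sample from this sequence

    # interval boundaries, here [20, 23, 26, 29, 32]
    bounds = np.linspace(seq_start, seq_start + seq_length,
                         num_instances + 1, endpoint=True)
    # one uniformly drawn index per interval
    samples = np.floor(bounds[:-1]
                       + rng.random(num_instances) * (bounds[1:] - bounds[:-1])).astype(int)

Sampling one instance per interval keeps the drawn time steps spread over the whole sequence while still bounding the number of training windows per epoch.
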
+ + Parameters + ---------- + indices: Sequence[int] + The set of all the possible indices that can be sampled from + seq_lengths: Sequence[int] + lengths of each sequence, applied to unsqueeze indices + num_instances_per_seqs: List[int] + how many instances are sampled in each sequence + min_start: int + the how many first instances we want to skip (the first few sequences need to be padded with 0) + generator: Optional[torch.Generator] + pytorch generator to control the randomness + """ + super(TimeSeriesSampler, self).__init__(indices, generator) + if len(seq_lengths) != len(num_instances_per_seqs): + raise ValueError(f'the lengths of seq_lengths must equal the lengths of num_instances_per_seqs.' + f'However, they are {len(seq_lengths)} versus {len(num_instances_per_seqs)}') + if np.sum(seq_lengths) != len(indices): + raise ValueError(f'the sum of sequence length must correspond to the number of indices. ' + f'However, they are {np.sum(seq_lengths)} versus {len(indices)}') + seq_intervals = [] + idx_tracker = 0 + for seq_idx, (num_instances, seq_length) in enumerate(zip(num_instances_per_seqs, seq_lengths)): + idx_end = idx_tracker + seq_length + idx_start = idx_tracker + min_start + interval = np.linspace(idx_start, idx_end, num_instances + 1, endpoint=True, dtype=np.int) + seq_intervals.append(interval) + self.seq_lengths = seq_lengths + self.num_instances = np.sum(num_instances_per_seqs) + self.seq_intervals = seq_intervals + + def __iter__(self): + samples = torch.ones(self.num_instances, dtype=torch.int) + idx_samples_start = 0 + idx_seq_tracker = 0 + for idx_seq, (interval, seq_length) in enumerate(zip(self.seq_intervals, self.seq_lengths)): + num_samples = len(interval) - 1 + idx_samples_end = idx_samples_start + num_samples + + samples_shift = torch.rand(num_samples, generator=self.generator) * (interval[1:] - interval[:-1]) + samples_seq = torch.floor(samples_shift + interval[:-1]).int() + idx_seq_tracker + samples[idx_samples_start: idx_samples_end] = samples_seq + + idx_samples_start = idx_samples_end + idx_seq_tracker += seq_length + + return (samples[i] for i in torch.randperm(self.num_instances, generator=self.generator)) + + def __len__(self): + return self.num_instances + + + + + class ExpandTransformTimeSeries(object): """Expand Dimensionality so tabular transformations see a 2d Array, unlike the ExpandTransform defined under tabular dataset, the dimension is expanded @@ -55,7 +123,7 @@ class SequenceBuilder(object): window_size : int, default=1 sliding window size """ - def __init__(self, sample_interval: int = 1, window_size: int = 1, subseq_length=1): + def __init__(self, sample_interval: int = 1, window_size: int = 1, subseq_length=1, padding_value=0.): """ initialization Args: @@ -67,10 +135,23 @@ def __init__(self, sample_interval: int = 1, window_size: int = 1, subseq_length # assuming that subseq_length is 10, e.g., we can only start from -10. 
sample_interval = -4 # we will sample the following indices: [-9,-5,-1] self.first_indices = -(self.sample_interval * ((subseq_length - 1) // self.sample_interval) + 1) + self.padding_value = padding_value def __call__(self, data: np.ndarray) -> np.ndarray: sample_indices = np.arange(self.first_indices, 0, step=self.sample_interval) - return data[sample_indices] + if sample_indices[0] < -1 * len(data): + # we need to pad with 0 + valid_indices = sample_indices[np.where(sample_indices >= -len(data))[0]] + + data_values = data[valid_indices] + if data.ndim == 1: + padding_vector = np.full([len(sample_indices) - len(valid_indices)], self.padding_value) + return np.hstack([padding_vector, data_values]) + else: + padding_vector = np.full([len(sample_indices) - len(valid_indices), data.shape[-1]], self.padding_value) + return np.vstack([padding_vector, data_values]) + else: + return data[sample_indices] class TimeSeriesForecastingDataLoader(FeatureDataLoader): @@ -85,6 +166,7 @@ def __init__(self, window_size: int = 1, #sample_interval: int = 1, upper_sequence_length: int = np.iinfo(np.int32).max, + num_batches_per_epoch: Optional[int] = 50, n_prediction_steps: int = 1) -> None: """ initialize a dataloader @@ -94,6 +176,7 @@ def __init__(self, sample_interval: sample interval ,its value is the interval of the resolution upper_sequence_length: upper limit of sequence length, to avoid a sequence length larger than dataset length or specified by the users + num_batches_per_epoch: how n_prediction_steps: how many stpes to predict in advance """ super().__init__(batch_size=batch_size) @@ -105,6 +188,7 @@ def __init__(self, # the time sequence should look like: [X, y, X, y, y] [test_data](values in tail is marked with X) #self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 self.subseq_length = self.window_size + self.num_batches_per_epoch = num_batches_per_epoch if num_batches_per_epoch is not None else np.inf def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: """ @@ -132,7 +216,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: assert self.subseq_length < datamanager.seq_length_min, "dataloader's window size must be smaller than the" \ "minimal sequence length of the dataset!!" # TODO, consider bucket setting - self.train_transform = self.build_transform(X, mode='train') self.val_transform = self.build_transform(X, mode='val') self.test_transform = self.build_transform(X, mode='test') @@ -160,21 +243,44 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: valid_indices = [] idx_start = 0 + + + + num_instances_dataset = np.size(train_split) + num_instances_train = self.num_batches_per_epoch * self.batch_size + + # get the length of each sequence of training data (after split) + # as we already know that the elements in 'train_split' increases consecutively with a certain number of + # discontinuity where a new sequence is sampled: [0, 1, 2 ,3, 7 ,8 ]. + # A new sequence must start from the index 7. 
We could then split each unique values to represent the length + # of each split + _, seq_train_length = np.unique(train_split - np.arange(len(train_split)), return_counts=True) + num_instances_per_seqs = np.ceil(num_instances_train / num_instances_dataset * seq_train_length) + num_instances_per_seqs = num_instances_per_seqs.astype(seq_train_length.dtype) + # at least one element of each sequence should be selected + + """ # to allow a time sequence data with resolution self.sample_interval and windows size with self.window_size # we need to drop the first part of each sequence for seq_idx, seq_length in enumerate(datamanager.sequence_lengths_train): idx_end = idx_start + seq_length - full_sequence = np.arange(idx_start, idx_end)[self.subseq_length:] + #full_sequence = np.random.choice(np.arange(idx_start, idx_end)[self.subseq_length:], 5) + #full_sequence = np.arange(idx_start, idx_end)[self.subseq_length:] + #full_sequence = np.random.choice(np.arange(idx_start, idx_end)[self.subseq_length:], 5) + full_sequence = np.arange(idx_start, idx_end) valid_indices.append(full_sequence) idx_start = idx_end valid_indices = np.hstack([valid_idx for valid_idx in valid_indices]) _, sampler_indices_train, _ = np.intersect1d(train_split, valid_indices, return_indices=True) - + """ # test_indices not required as testsets usually lies on the trail of hte sequence #_, sampler_indices_test, _ = np.intersect1d(test_split, valid_indices) - self.sampler_train = SubsetRandomSampler(indices=sampler_indices_train) + sampler_indices_train = np.arange(num_instances_dataset) + + self.sampler_train = TimeSeriesSampler(indices=sampler_indices_train, seq_lengths=seq_train_length, + num_instances_per_seqs=num_instances_per_seqs) self.train_data_loader = torch.utils.data.DataLoader( train_dataset, @@ -257,6 +363,7 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd # This dataset is used for loading test data in a batched format train_transforms=self.test_transform, val_transforms=self.test_transform, + n_prediction_steps=0, ) elif isinstance(X, TimeSeriesSequence): @@ -264,11 +371,9 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd dataset.update_transform(self.test_transform, train=False) else: raise ValueError(f"Unsupported type of input X: {type(X)}") - if self.n_prediction_steps == 1: - # test_seq_indices only indicates where to truncate the current - test_seq_indices = [len(dataset) - 1] - else: - test_seq_indices = np.arange(len(dataset))[-self.n_prediction_steps:] + + # we only consider the last sequence as validation set + test_seq_indices = [len(dataset) - 1] dataset_test = TransformSubset(dataset, indices=test_seq_indices, train=False) diff --git a/autoPyTorch/pipeline/components/training/metrics/base.py b/autoPyTorch/pipeline/components/training/metrics/base.py index 5c68a1f00..d8caf6716 100644 --- a/autoPyTorch/pipeline/components/training/metrics/base.py +++ b/autoPyTorch/pipeline/components/training/metrics/base.py @@ -75,7 +75,7 @@ def __call__( elif type_true == 'multilabel-indicator': y_pred[y_pred > 0.5] = 1.0 y_pred[y_pred <= 0.5] = 0.0 - elif type_true == 'continuous-multioutput': + elif type_true in ['continuous-multioutput', 'multiclass-multioutput']: pass else: raise ValueError(type_true) From 10f0ebc20edbdff344f21750f84a1b5d5351fddc Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 23 Nov 2021 19:49:37 +0100 Subject: [PATCH 063/347] loss from sktime --- autoPyTorch/api/time_series_forecasting.py | 116 ++++++++-------- 
autoPyTorch/constants.py | 4 +- autoPyTorch/constants_forecasting.py | 17 +++ .../data/time_series_forecasting_validator.py | 34 ++--- autoPyTorch/datasets/time_series_dataset.py | 70 +++++----- autoPyTorch/evaluation/abstract_evaluator.py | 48 ++++++- ...time_series_forecasting_train_evaluator.py | 126 ++++++++---------- .../setup/network_backbone/MLPForecasting.py | 9 ++ .../network_head/distributed_network_head.py | 0 .../setup/network_head/distribution.py | 0 .../time_series_forecasting_data_loader.py | 8 +- .../pipeline/components/training/losses.py | 44 ++++-- .../components/training/metrics/base.py | 119 +++++++++++++++-- .../components/training/metrics/metrics.py | 101 +++++++++++++- .../components/training/metrics/utils.py | 14 +- .../training/trainer/base_trainer.py | 5 +- autoPyTorch/utils/pipeline.py | 12 +- requirements.txt | 1 + 18 files changed, 510 insertions(+), 218 deletions(-) create mode 100644 autoPyTorch/constants_forecasting.py create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/MLPForecasting.py create mode 100644 autoPyTorch/pipeline/components/setup/network_head/distributed_network_head.py create mode 100644 autoPyTorch/pipeline/components/setup/network_head/distribution.py diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index e8e917042..51801ec46 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -17,10 +17,11 @@ CrossValTypes, HoldoutValTypes, ) -from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, MAX_WIDNOW_SIZE_BASE +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline from autoPyTorch.utils.backend import Backend from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates +from autoPyTorch.constants_forecasting import MAX_WINDOW_SIZE_BASE class TimeSeriesForecastingTask(BaseTask): @@ -49,24 +50,26 @@ class TimeSeriesForecastingTask(BaseTask): Otherwise specifies set of components not to use. 
Incompatible with include components """ + def __init__( - self, - seed: int = 1, - n_jobs: int = 1, - logging_config: Optional[Dict] = None, - ensemble_size: int = 50, - ensemble_nbest: int = 50, - max_models_on_disc: int = 50, - temporary_directory: Optional[str] = None, - output_directory: Optional[str] = None, - delete_tmp_folder_after_terminate: bool = True, - delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.time_series_hold_out_validation, - resampling_strategy_args: Optional[Dict[str, Any]] = None, - backend: Optional[Backend] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + self, + seed: int = 1, + n_jobs: int = 1, + logging_config: Optional[Dict] = None, + ensemble_size: int = 50, + ensemble_nbest: int = 50, + max_models_on_disc: int = 50, + temporary_directory: Optional[str] = None, + output_directory: Optional[str] = None, + delete_tmp_folder_after_terminate: bool = True, + delete_output_folder_after_terminate: bool = True, + include_components: Optional[Dict] = None, + exclude_components: Optional[Dict] = None, + resampling_strategy: Union[ + CrossValTypes, HoldoutValTypes] = HoldoutValTypes.time_series_hold_out_validation, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + backend: Optional[Backend] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, ): super().__init__( seed=seed, @@ -88,7 +91,7 @@ def __init__( task_type=TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING], ) # here fraction of subset could be number of images, tabular data or resolution of time-series datasets. - #TODO if budget type resolution is applied to all datasets, we will put it to configs + # TODO if budget type resolution is applied to all datasets, we will put it to configs self.pipeline_options.update({"min_resolution": 0.1, "full_resolution": 1.0}) @@ -111,30 +114,30 @@ def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TimeSeriesForeca return TimeSeriesForecastingPipeline(dataset_properties=dataset_properties) def search( - self, - optimize_metric: str, - X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, - n_prediction_steps: int = 1, - freq: Optional[Union[str, int, List[int]]] = None, - dataset_name: Optional[str] = None, - budget_type: Optional[str] = None, - budget: Optional[float] = None, - total_walltime_limit: int = 100, - func_eval_time_limit: int = 60, - traditional_per_total_budget: float = 0., - memory_limit: Optional[int] = 4096, - smac_scenario_args: Optional[Dict[str, Any]] = None, - get_smac_object_callback: Optional[Callable] = None, - all_supported_metrics: bool = True, - precision: int = 32, - disable_file_output: List = [], - load_models: bool = True, - shift_input_data: bool = True, - normalize_y: bool = True, + self, + optimize_metric: str, + X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + target_variables: 
Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, + n_prediction_steps: int = 1, + freq: Optional[Union[str, int, List[int]]] = None, + dataset_name: Optional[str] = None, + budget_type: Optional[str] = None, + budget: Optional[float] = None, + total_walltime_limit: int = 100, + func_eval_time_limit: int = 60, + traditional_per_total_budget: float = 0., + memory_limit: Optional[int] = 4096, + smac_scenario_args: Optional[Dict[str, Any]] = None, + get_smac_object_callback: Optional[Callable] = None, + all_supported_metrics: bool = True, + precision: int = 32, + disable_file_output: List = [], + load_models: bool = True, + shift_input_data: bool = True, + normalize_y: bool = True, ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -217,9 +220,8 @@ def search( # we have to create a logger for at this point for the validator self._logger = self._get_logger(dataset_name) - + #TODO we will only consider target variables as int here self.target_variables = target_variables - # Create a validator object to make sure that the data provided by # the user matches the autopytorch requirements self.InputValidator = TimeSeriesForecastingInputValidator( @@ -244,15 +246,15 @@ def search( normalize_y=normalize_y, ) - if self.dataset.freq is not None or not self.customized_window_size: - base_window_size = int(np.ceil(self.dataset.freq)) + if self.dataset.freq_value is not None or not self.customized_window_size: + base_window_size = int(np.ceil(self.dataset.freq_value)) # we don't want base window size to large, which might cause a too long computation time, in which case # we will use n_prediction_step instead (which is normally smaller than base_window_size) - if base_window_size > self.dataset.upper_window_size or base_window_size > MAX_WIDNOW_SIZE_BASE: + if base_window_size > self.dataset.upper_window_size or base_window_size > MAX_WINDOW_SIZE_BASE: # TODO considering padding to allow larger upper_window_size !!! base_window_size = int(np.ceil(min(n_prediction_steps, self.dataset.upper_window_size))) - if base_window_size > MAX_WIDNOW_SIZE_BASE: - base_window_size = 50 # TODO this value comes from setting of solar dataset, do we have a better choice? + if base_window_size > MAX_WINDOW_SIZE_BASE: + base_window_size = 50 # TODO this value comes from setting of solar dataset, do we have a better choice? 
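The window-size heuristic above can be summarised as a small standalone function. The helper name and the numbers below are purely illustrative (they are not part of the API), but the branching mirrors the logic in search():

import numpy as np

def suggest_base_window_size(freq_value: float, n_prediction_steps: int,
                             upper_window_size: int, max_window_size_base: int = 500) -> int:
    base_window_size = int(np.ceil(freq_value))
    if base_window_size > upper_window_size or base_window_size > max_window_size_base:
        # fall back to the forecasting horizon, which is normally smaller
        base_window_size = int(np.ceil(min(n_prediction_steps, upper_window_size)))
        if base_window_size > max_window_size_base:
            base_window_size = 50
    return base_window_size

# a moderate seasonal frequency fits within the bounds and is kept as the window
suggest_base_window_size(24, n_prediction_steps=12, upper_window_size=200)    # -> 24
# a very large frequency exceeds the bounds, so the horizon is used instead
suggest_base_window_size(8766, n_prediction_steps=12, upper_window_size=200)  # -> 12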
if self.search_space_updates is None: self.search_space_updates = HyperparameterSearchSpaceUpdates() @@ -288,13 +290,13 @@ def search( time_series_prediction=self.time_series_prediction ) + def predict( self, X_test: List[np.ndarray], batch_size: Optional[int] = None, n_jobs: int = 1, - y_train:Optional[List[np.ndarray]]=None, - target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, + y_train: Optional[List[np.ndarray]] = None, ) -> np.ndarray: """ target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, @@ -304,11 +306,12 @@ def predict( for seq_idx, seq in enumerate(X_test): if self.dataset.normalize_y: if pd.DataFrame(seq).shape[-1] > 1: - if target_variables is None and y_train is None: - raise ValueError('For multi-variant prediction task, either target_variables or y_train needs to ' - 'be provided!') + if self.target_variables is None and y_train is None: + raise ValueError( + 'For multi-variant prediction task, either target_variables or y_train needs to ' + 'be provided!') if y_train is None: - y_train = seq[target_variables] + y_train = seq[self.target_variables] else: y_train = seq if self.dataset.shift_input_data: @@ -327,4 +330,3 @@ def predict( seq_pred = super(TimeSeriesForecastingTask, self).predict(seq, batch_size, n_jobs).flatten() y_pred[seq_idx] = seq_pred return y_pred - diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py index 1c4ee7d1f..92562be61 100644 --- a/autoPyTorch/constants.py +++ b/autoPyTorch/constants.py @@ -6,7 +6,7 @@ TIMESERIES_REGRESSION = 6 TIMESERIES_FORECASTING = 7 -REGRESSION_TASKS = [TABULAR_REGRESSION, IMAGE_REGRESSION, TIMESERIES_REGRESSION, TIMESERIES_FORECASTING] +REGRESSION_TASKS = [TABULAR_REGRESSION, IMAGE_REGRESSION, TIMESERIES_REGRESSION] CLASSIFICATION_TASKS = [TABULAR_CLASSIFICATION, IMAGE_CLASSIFICATION, TIMESERIES_CLASSIFICATION] FORECASTING_TASKS = [TIMESERIES_FORECASTING] @@ -49,7 +49,7 @@ CONTINUOUSMULTIOUTPUT: 'continuous-multioutput', MULTICLASS: 'multiclass', CONTINUOUS: 'continuous', - MULTICLASSMULTIOUTPUT: 'multiclass-multioutput'} + MULTICLASSMULTIOUTPUT: 'multiclass-multioutput',} STRING_TO_OUTPUT_TYPES = \ {'binary': BINARY, diff --git a/autoPyTorch/constants_forecasting.py b/autoPyTorch/constants_forecasting.py new file mode 100644 index 000000000..2dfe4722a --- /dev/null +++ b/autoPyTorch/constants_forecasting.py @@ -0,0 +1,17 @@ +# The cosntant values for time series forecasting comes from +# https://github.com/rakshitha123/TSForecasting/blob/master/experiments/deep_learning_experiments.py +# seasonality map, maps a frequency value to a number +SEASONALITY_MAP = { + "minutely": [1440, 10080, 525960], + "10_minutes": [144, 1008, 52596], + "half_hourly": [48, 336, 17532], + "hourly": [24, 168, 8766], + "daily": 7, + "weekly": 365.25 / 7, + "monthly": 12, + "quarterly": 4, + "yearly": 1 +} + +MAX_WINDOW_SIZE_BASE = 500 + diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index e4528fe5a..a6af528a3 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -29,7 +29,7 @@ def fit( # Check that the data is valid if len(X_train) != len(y_train): raise ValueError("Inconsistent number of sequences for features and targets," - " {} for features and {} for targets".format(len(X_train), len(y_train),)) + " {} for features and {} for targets".format(len(X_train), len(y_train), )) if X_test is not None: if len(X_test) != len(y_test): 
@@ -81,30 +81,32 @@ def transform( sequence_lengths = [0] * num_sequences num_features = self.feature_validator.num_features + if shift_input_data: + for seq_idx in range(num_sequences): + X[seq_idx] = X[seq_idx][:-n_prediction_steps] + # y[seq_idx] = y[seq_idx][n_prediction_steps:] + sequence_lengths[seq_idx] = len(X[seq_idx]) + else: + for seq_idx in range(num_sequences): + sequence_lengths[seq_idx] = len(X[seq_idx]) + if y is not None: num_targets = self.target_validator.out_dimensionality - if shift_input_data: - for seq_idx in range(num_sequences): - X[seq_idx] = X[seq_idx][:-n_prediction_steps] - #y[seq_idx] = y[seq_idx][n_prediction_steps:] - sequence_lengths[seq_idx] = len(X[seq_idx]) - else: - for seq_idx in range(num_sequences): - sequence_lengths[seq_idx] = len(X[seq_idx]) num_train_data = np.sum(sequence_lengths) - # a matrix that is concatenated by all the time series sequences X_flat = np.empty([num_train_data, num_features]) - y_flat = np.empty([num_train_data + n_prediction_steps*num_sequences, num_targets]) + y_flat = np.empty([num_train_data + n_prediction_steps * num_sequences, num_targets]) start_idx = 0 for seq_idx, seq_length in enumerate(sequence_lengths): end_idx = start_idx + seq_length X_flat[start_idx: end_idx] = np.array(X[seq_idx]).reshape([-1, num_features]) if shift_input_data: - y_flat[start_idx+n_prediction_steps*seq_idx: end_idx + n_prediction_steps* (seq_idx +1)] = np.array(y[seq_idx]).reshape([-1, num_targets]) + y_flat[ + start_idx + n_prediction_steps * seq_idx: end_idx + n_prediction_steps * (seq_idx + 1)] = np.array( + y[seq_idx]).reshape([-1, num_targets]) else: y_flat[start_idx: end_idx] = np.array(y[seq_idx]).reshape([-1, num_targets]) start_idx = end_idx @@ -112,14 +114,6 @@ def transform( X_transformed = self.feature_validator.transform(X_flat) y_transformed = self.target_validator.transform(y_flat) return X_transformed, sequence_lengths, y_transformed - else: - if shift_input_data: - for seq_idx in range(num_sequences): - X[seq_idx] = X[seq_idx][:-n_prediction_steps] - sequence_lengths[seq_idx] = len(X[seq_idx]) - else: - for seq_idx in range(num_sequences): - sequence_lengths[seq_idx] = len(X[seq_idx]) num_train_data = np.sum(sequence_lengths) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index ff6ff359e..4791bbf3f 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -37,25 +37,7 @@ from autoPyTorch.utils.common import FitRequirement from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator from autoPyTorch.utils.common import FitRequirement, hash_array_or_matrix - -# TIME_SERIES_FORECASTING_INPUT = Tuple[np.ndarray, np.ndarray] # currently only numpy arrays are supported -# TIME_SERIES_REGRESSION_INPUT = Tuple[np.ndarray, np.ndarray] -# TIME_SERIES_CLASSIFICATION_INPUT = Tuple[np.ndarray, np.ndarray] - -# seasonality map, maps a frequency value to a number -SEASONALITY_MAP = { - "minutely": [1440, 10080, 525960], - "10_minutes": [144, 1008, 52596], - "half_hourly": [48, 336, 17532], - "hourly": [24, 168, 8766], - "daily": 7, - "weekly": 365.25 / 7, - "monthly": 12, - "quarterly": 4, - "yearly": 1 -} - -MAX_WIDNOW_SIZE_BASE = 500 +from autoPyTorch.constants_forecasting import SEASONALITY_MAP class TimeSeriesSequence(Dataset): @@ -121,7 +103,7 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]: Y = self.Y if Y is not None: # Y = Y[:index + 
self.n_prediction_steps] - #Y = Y[index + 1: index + self.n_prediction_steps + 1] + # Y = Y[index + 1: index + self.n_prediction_steps + 1] Y = Y[index + 1: index + self.n_prediction_steps + 1] Y = torch.from_numpy(Y) @@ -167,6 +149,7 @@ def __init__(self, Y: Union[np.ndarray, pd.Series], X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, + target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, freq: Optional[Union[str, int, List[int]]] = None, dataset_name: Optional[str] = None, resampling_strategy: Union[ @@ -182,6 +165,8 @@ def __init__(self, normalize_y: bool = True, ): """ + :param target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] used for multi-variant forecasting + tasks, the target_variables indicates which values in X corresponds to Y :param freq: Optional[Union[str, int]] frequency of the series sequences, used to determine the (possible) period :param n_prediction_steps: The number of steps you want to forecast into the future @@ -208,9 +193,10 @@ def __init__(self, self.categorical_columns = self.validator.feature_validator.categorical_columns self.num_features = self.validator.feature_validator.num_features # type: int - self.num_target = self.validator.target_validator.out_dimensionality # type: int + self.num_target = self.validator.target_validator.out_dimensionality # type: int self.shift_input_data = shift_input_data + self.target_variables = target_variables X, sequence_lengths, Y = self.validator.transform(X, Y, shift_input_data=shift_input_data, @@ -279,7 +265,7 @@ def __init__(self, if STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS: num_target = len(np.unique(Y)) - #self.output_shape = len(np.unique(Y)) + # self.output_shape = len(np.unique(Y)) else: # self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1 num_target = X.shape[-1] if X.ndim > 1 else 1 @@ -298,17 +284,22 @@ def __init__(self, self.splits = self.get_splits_from_resampling_strategy() + if freq is None: + self.freq = None + self.freq_value = None + if isinstance(freq, str): if freq not in SEASONALITY_MAP: Warning("The given freq name is not supported by our dataset, we will use the default " "configuration space on the hyperparameter window_size, if you want to adapt this value" "you could pass freq with a numerical value") - freq = SEASONALITY_MAP.get(freq, None) + freq_value = SEASONALITY_MAP.get(freq, None) if isinstance(freq, list): tmp_freq = min([freq_value for freq_value in freq if freq_value > n_prediction_steps]) - freq = tmp_freq + freq_value = tmp_freq - self.freq = freq + self.freq: Optional[str] = freq + self.freq_value: Optional[int] = freq_value def __getitem__(self, idx, train=True): if idx < 0: @@ -374,7 +365,6 @@ def make_sequences_datasets(self, else: Y_seq = Y[idx_start_train: idx_end_train] - if normalize_y: Y_seq_mean = np.mean(Y_seq) Y_seq_std = np.std(Y_seq) @@ -397,7 +387,6 @@ def make_sequences_datasets(self, else: Y_test_seq = Y_test[idx_start_test: idx_end_test] - if normalize_y: Y_test_seq_mean = np.mean(Y_test_seq) Y_test_seq_std = np.std(Y_test_seq) @@ -405,7 +394,7 @@ def make_sequences_datasets(self, if self.shift_input_data: Y_test[idx_start_test + seq_idx * self.n_prediction_steps: - idx_end_test + (1 + seq_idx) * self.n_prediction_steps] = Y_seq + idx_end_test + (1 + seq_idx) * self.n_prediction_steps] = Y_seq else: Y_test[idx_start_test: idx_end_test] = Y_seq @@ -421,7 +410,7 @@ def 
make_sequences_datasets(self, sequence_datasets.append(sequence) idx_start_train = idx_end_train - #self.sequence_lengths_train[seq_idx] = len(sequence) + # self.sequence_lengths_train[seq_idx] = len(sequence) # X_seq_all.append(X_seq) # Y_seq_all.append(Y_seq) @@ -578,7 +567,16 @@ def create_cross_val_splits( idx_all = self._get_indices() for idx_seq, dataset in enumerate(self.datasets): - split = self.cross_validators[cross_val_type.name](num_splits, indices=np.arange(len(dataset)), **kwargs) + if self.shift_input_data: + split = self.cross_validators[cross_val_type.name](num_splits, + indices=np.arange(len(dataset)), **kwargs) + else: + # If the data is not shifted, we need to discard the last n_prediction_steps such that we have enough + # y values + split = self.cross_validators[cross_val_type.name](num_splits, + indices=np.arange( + len(dataset) - self.n_prediction_steps), + **kwargs) for idx_split in range(num_splits): splits[idx_split][idx_seq] = idx_start + split[idx_split] idx_start += self.sequence_lengths_train[idx_seq] @@ -623,9 +621,15 @@ def create_holdout_val_split( splits = [[() for _ in range(len(self.datasets))] for _ in range(2)] idx_start = 0 for idx_seq, dataset in enumerate(self.datasets): - split = self.holdout_validators[holdout_val_type.name](holdout_val_type, - indices=np.arange(len(dataset)), - **kwargs) + if self.shift_input_data: + split = self.holdout_validators[holdout_val_type.name](holdout_val_type, + indices=np.arange(len(dataset)), + **kwargs) + else: + split = self.holdout_validators[holdout_val_type.name](holdout_val_type, + indices=np.arange( + len(dataset) - self.n_prediction_steps), + **kwargs) for idx_split in range(2): splits[idx_split][idx_seq] = idx_start + split[idx_split] idx_start += self.sequence_lengths_train[idx_seq] diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index ed8f42eef..b84595009 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -3,6 +3,7 @@ import warnings from multiprocessing.queues import Queue from typing import Any, Dict, List, Optional, Tuple, Union, no_type_check +from functools import partial from ConfigSpace import Configuration @@ -33,6 +34,8 @@ FORECASTING_TASKS, ) from autoPyTorch.datasets.base_dataset import BaseDataset +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset + from autoPyTorch.evaluation.utils import ( VotingRegressorWrapper, convert_multioutput_multiclass_to_multilabel @@ -178,6 +181,26 @@ def get_default_pipeline_options() -> Dict[str, Any]: 'runtime': 1} +class DummyTimeSeriesPredictionPipeline(DummyClassificationPipeline): + def __init__(self, config: Configuration, + random_state: Optional[Union[int, np.random.RandomState]] = None, + init_params: Optional[Dict] = None, + n_prediction_steps: int = 1, + ) -> None: + super(DummyTimeSeriesPredictionPipeline, self).__init__(config, random_state, init_params) + self.n_prediction_steps = n_prediction_steps + + def predict_proba(self, X: Union[np.ndarray, pd.DataFrame], + batch_size: int = 1000) -> np.array: + new_X = X[-self.n_prediction_steps:] + return super(DummyTimeSeriesPredictionPipeline, self).predict_proba(new_X) + + def predict(self, X: Union[np.ndarray, pd.DataFrame], + batch_size: int = 1000) -> np.array: + new_X = X[-self.n_prediction_steps:] + return super(DummyTimeSeriesPredictionPipeline, self).predict(new_X).astype(np.float32) + + def fit_and_suppress_warnings(logger: PicklableClientLogger, 
pipeline: BaseEstimator, X: Dict[str, Any], y: Any ) -> BaseEstimator: @@ -212,7 +235,7 @@ def __init__(self, backend: Backend, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, ) -> None: self.starttime = time.time() @@ -275,7 +298,7 @@ def __init__(self, backend: Backend, else: raise ValueError('task {} not available'.format(self.task_type)) self.predict_function = self._predict_regression - else: + elif self.task_type in CLASSIFICATION_TASKS: if isinstance(self.configuration, int): self.pipeline_class = DummyClassificationPipeline elif isinstance(self.configuration, str): @@ -295,6 +318,23 @@ def __init__(self, backend: Backend, else: raise ValueError('task {} not available'.format(self.task_type)) self.predict_function = self._predict_proba + elif self.task_type in FORECASTING_TASKS: + if not isinstance(self.datamanager, TimeSeriesForecastingDataset): + raise ValueError(f'to perform time series forecastin tasks, the dataset must be ' + f'autopytorch.dataset.TimeSeriesForecastingDataset.' + f'However, it is {type(self.datamanager)}') + n_prediction_steps = self.datamanager.n_prediction_steps + if isinstance(self.configuration, int): + self.pipeline_class = partial(partial(DummyTimeSeriesPredictionPipeline, + n_prediction_steps=n_prediction_steps)) + elif isinstance(self.configuration, str): + raise ValueError("Only tabular classifications tasks " + "are currently supported with traditional methods") + elif isinstance(self.configuration, Configuration): + self.pipeline_class = autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline + else: + raise ValueError('task {} not available'.format(self.task_type)) + self.predict_function = self._predict_regression self.dataset_properties = self.datamanager.get_dataset_properties(get_dataset_requirements(info)) @@ -367,7 +407,7 @@ def _get_pipeline(self) -> BaseEstimator: raise ValueError("Invalid configuration entered") return pipeline - def _loss(self, y_true: np.ndarray, y_hat: np.ndarray) -> Dict[str, float]: + def _loss(self, y_true: np.ndarray, y_hat: np.ndarray, **loss_kwargs: Dict) -> Dict[str, float]: """SMAC follows a minimization goal, so the make_scorer sign is used as a guide to obtain the value to reduce. 
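For intuition, the minimisation convention works out as in the sketch below. The numbers are made up; calculate_score has already applied the metric's sign, so error-style metrics arrive negated:

scores = {
    "accuracy":              {"optimum": 1.0, "score": 0.92},   # greater-is-better metric
    "mean_MASE_forecasting":  {"optimum": 0.0, "score": -1.30},  # sign-flipped error metric
}
err = {name: s["optimum"] - s["score"] for name, s in scores.items()}
# err -> accuracy about 0.08, mean_MASE_forecasting 1.3; both are minimised by SMAC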
@@ -388,7 +428,7 @@ def _loss(self, y_true: np.ndarray, y_hat: np.ndarray) -> Dict[str, float]: else: metrics = [self.metric] score = calculate_score( - y_true, y_hat, self.task_type, metrics) + y_true, y_hat, self.task_type, metrics, **loss_kwargs) err = {metric.name: metric._optimum - score[metric.name] for metric in metrics if metric.name in score.keys()} diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 1a97cea98..6e230604f 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -15,34 +15,14 @@ from smac.tae import StatusType + from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.backend import Backend from autoPyTorch.utils.common import subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates - -from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline - from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset - - -class DummyTimeSeriesPredictionPipeline(DummyClassificationPipeline): - def __init__(self, config: Configuration, - random_state: Optional[Union[int, np.random.RandomState]] = None, - init_params: Optional[Dict] = None, - n_prediction_steps: int = 1, - ) -> None: - super(DummyTimeSeriesPredictionPipeline, self).__init__(config, random_state, init_params) - self.n_prediction_steps = n_prediction_steps - - def predict_proba(self, X: Union[np.ndarray, pd.DataFrame], - batch_size: int = 1000) -> np.array: - new_X = np.ones((self.n_prediction_steps, 1)) - return super(DummyTimeSeriesPredictionPipeline, self).predict_proba(new_X) - - def predict(self, X: Union[np.ndarray, pd.DataFrame], - batch_size: int = 1000) -> np.array: - new_X = np.ones((self.n_prediction_steps, 1)) - return super(DummyTimeSeriesPredictionPipeline, self).predict(new_X).astype(np.float32) +from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline +from autoPyTorch.constants_forecasting import SEASONALITY_MAP class TimeSeriesForecastingTrainEvaluator(TrainEvaluator): @@ -86,24 +66,12 @@ def __init__(self, backend: Backend, queue: Queue, self.datamanager: TimeSeriesForecastingDataset self.n_prediction_steps = self.datamanager.n_prediction_steps self.num_sequences = self.datamanager.num_sequences + self.seq_length_min = np.min(self.num_sequences) + seasonality = SEASONALITY_MAP.get(self.datamanager.freq, 1) + if isinstance(seasonality, list): + seasonality = min(seasonality) # Use to calculate MASE + self.seasonality = int(seasonality) - if isinstance(self.configuration, int): - self.pipeline_class = partial(DummyTimeSeriesPredictionPipeline, n_prediction_steps=self.n_prediction_steps) - else: - self.pipeline_class = TimeSeriesForecastingPipeline - - self.splits = self.datamanager.splits - if self.splits is None: - raise AttributeError("Must have called create_splits on {}".format(self.datamanager.__class__.__name__)) - self.num_folds: int = len(self.splits) - self.Y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds - # TODO consider if we really need Y_train_targets - #self.Y_train_targets: np.ndarray = np.ones(self.y_train.shape) * np.NaN - self.pipelines: List[Optional[BaseEstimator]] = [None] * self.num_folds - self.indices: List[Optional[Tuple[Union[np.ndarray, List], Union[np.ndarray, List]]]] = [None] * 
self.num_folds - - self.logger.debug("Search space updates :{}".format(self.search_space_updates)) - self.keep_models = keep_models def fit_predict_and_loss(self) -> None: """Fit, predict and compute the loss for cross-validation and @@ -111,6 +79,8 @@ def fit_predict_and_loss(self) -> None: assert self.splits is not None, "Can't fit pipeline in {} is datamanager.splits is None" \ .format(self.__class__.__name__) additional_run_info: Optional[Dict] = None + + if self.num_folds == 1: split_id = 0 self.logger.info("Starting fit {}".format(split_id)) @@ -119,29 +89,40 @@ def fit_predict_and_loss(self) -> None: train_split, test_split = self.splits[split_id] - y_optimization = np.ones([len(test_split), self.n_prediction_steps]) + # TODO move these lines to TimeSeriesForecastingDatasets (Create a new object there that inherents from + # np.array while return multiple values by __get_item__)! + # the +1 in the end indicates that X and y are not aligned (y and x with the same index corresponds to + # the same time step). + test_split_base = test_split + np.arange(len(test_split)) * self.n_prediction_steps + 1 - # We implement this with the following reasons: - # given a series data, we don't know which value to predict so we predict the last n_predicted values - # However, this makes the shape unaligned with the shape of "self.Y_optimization" - # TODO consider fixed this under data loader (use pipline to do a preprocessing) - y_test_split = np.repeat(test_split, self.n_prediction_steps) - \ - np.tile(np.arange(self.n_prediction_steps)[::-1], len(test_split)) + y_test_split = np.repeat(test_split_base, self.n_prediction_steps) + \ + np.tile(np.arange(self.n_prediction_steps), len(test_split_base)) self.Y_optimization = self.y_train[y_test_split] + #self.Y_actual_train = self.y_train[train_split] y_train_pred, y_opt_pred, y_valid_pred, y_test_pred = self._fit_and_predict(pipeline, split_id, train_indices=train_split, test_indices=test_split, add_pipeline_to_self=True) - #As each sequence contains one test split id, and the value to be predicted is the last n_prediction_steps - #we need to expand the current split. + # use for computing MASE losses + y_train_forecasting = [] + for seq_idx, test_idx in enumerate(test_split): + # TODO consider multi-variants here + seq = self.datamanager[test_idx][0] + if seq.shape[-1] > 1: + y_train_forecasting.append(seq[self.datamanager.target_variables].squeeze()) + else: + y_train_forecasting.append(seq.squeeze()) + + forecasting_kwargs = {'y_train': y_train_forecasting, + 'sp': self.seasonality, + 'n_prediction_steps': self.n_prediction_steps, + } - #train_loss = self._loss(self.y_train[train_split], y_train_pred) - # TODO do we really need train loss? train_loss = None - loss = self._loss(self.y_train[y_test_split], y_opt_pred) + loss = self._loss(self.y_train[y_test_split], y_opt_pred, **forecasting_kwargs) additional_run_info = pipeline.get_additional_run_info() if hasattr( pipeline, 'get_additional_run_info') else {} @@ -177,9 +158,23 @@ def fit_predict_and_loss(self) -> None: # weights for opt_losses. 
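To make the index arithmetic used above concrete, here is a toy trace assuming two sequences of length 5 in the flattened X array and a forecasting horizon of 2; all numbers are illustrative only:

import numpy as np

n_prediction_steps = 2
test_split = np.array([4, 9])  # last X index of each sequence in the holdout split

# shift into the coordinates of the flattened y array, which stores
# n_prediction_steps extra targets per preceding sequence plus a one-step offset
test_split_base = test_split + np.arange(len(test_split)) * n_prediction_steps + 1
y_test_split = np.repeat(test_split_base, n_prediction_steps) \
    + np.tile(np.arange(n_prediction_steps), len(test_split_base))

# test_split_base -> array([ 5, 12])
# y_test_split    -> array([ 5,  6, 12, 13]), i.e. the final horizon targets of each sequence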
opt_fold_weights = [np.NaN] * self.num_folds - for i, (train_split, test_split) in enumerate(self.splits): + y_train_forecasting_all = [] + for i, (train_split, test_split) in self.splits: + # use for computing MASE losses + y_train_forecasting = [] + for seq_idx, test_idx in enumerate(test_split): + # TODO consider multi-variants here + seq = self.datamanager[test_idx][0] + if seq.shape[-1] > 1: + y_train_forecasting.append(seq[self.datamanager.target_variables].squeeze()) + else: + y_train_forecasting.append(seq.squeeze()) + y_train_forecasting_all.append(y_train_forecasting) + for i, (train_split, test_split) in enumerate(self.splits): pipeline = self.pipelines[i] + test_split_base = test_split + np.arange(len(test_split)) * self.n_prediction_steps + 1 + train_pred, opt_pred, valid_pred, test_pred = self._fit_and_predict(pipeline, i, train_indices=train_split, test_indices=test_split, @@ -192,27 +187,29 @@ def fit_predict_and_loss(self) -> None: #self.Y_train_targets[train_split] = self.y_train[train_split] - y_test_split = np.repeat(test_split, self.n_prediction_steps) - \ - np.tile(np.arange(self.n_prediction_steps)[::-1], len(test_split)) + y_test_split = np.repeat(test_split_base, self.n_prediction_steps) + \ + np.tile(np.arange(self.n_prediction_steps), len(test_split_base)) self.Y_targets[i] = self.y_train[y_test_split] # Compute train loss of this fold and store it. train_loss could # either be a scalar or a dict of scalars with metrics as keys. - #train_loss = self._loss( - # self.Y_train_targets[train_split], - # train_pred, - #) + train_loss = 0. train_losses[i] = train_loss # number of training data points for this fold. Used for weighting # the average. train_fold_weights[i] = len(train_split) + forecasting_kwargs = {'y_train': y_train_forecasting_all[i], + 'sp': self.seasonality, + 'n_prediction_steps': self.n_prediction_steps, + } # Compute validation loss of this fold and store it. optimization_loss = self._loss( self.Y_targets[i], opt_pred, + **forecasting_kwargs ) opt_losses[i] = optimization_loss # number of optimization data points for this fold. @@ -296,15 +293,10 @@ def _predict(self, pipeline: BaseEstimator, train_indices: Union[np.ndarray, List], test_indices: Union[np.ndarray, List], ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: - datamanager = self.datamanager - y_pred = np.ones([len(test_indices), self.n_prediction_steps]) + # TODO consider multile outputs + opt_pred = np.ones([len(test_indices), self.n_prediction_steps]) for seq_idx, test_idx in enumerate(test_indices): - y_pred[seq_idx] = self.predict_function(self.datamanager[test_idx][0], pipeline).flatten() - - - #train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, - # self.y_train[train_indices]) - opt_pred = y_pred.flatten() + opt_pred[seq_idx] = self.predict_function(self.datamanager[test_idx][0], pipeline).squeeze() #TODO we consider X_valid and X_test as a multiple sequences??? 
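For orientation, the per-sequence MASE computed from forecasting_kwargs boils down to the following sketch. It assumes sktime (added to requirements.txt) accepts these keyword arguments, as the metric wrapper above passes them; the history and forecasts are toy numbers:

import numpy as np
from sktime.performance_metrics.forecasting import mean_absolute_scaled_error

y_train = np.array([3.0, 4.0, 5.0, 6.0, 7.0])   # history of one sequence
y_true = np.array([8.0, 9.0])                   # the n_prediction_steps targets
y_pred = np.array([7.5, 9.5])

# sp=1 for non-seasonal data; the evaluator instead passes the seasonality
# looked up from SEASONALITY_MAP for the dataset frequency
mean_absolute_scaled_error(y_true=y_true, y_pred=y_pred, y_train=y_train, sp=1)  # -> 0.5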
if self.X_valid is not None: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/MLPForecasting.py b/autoPyTorch/pipeline/components/setup/network_backbone/MLPForecasting.py new file mode 100644 index 000000000..57a07fff0 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/MLPForecasting.py @@ -0,0 +1,9 @@ +from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone +import torch + + +def seq2tab(x: torch.Tensor): + # https://discuss.pytorch.org/t/how-could-i-flatten-two-dimensions-of-a-tensor/44570/4 + return x.view(-1, *x.shape[2:]) + + diff --git a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head.py b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/distribution.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index acf292f9c..085339e32 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -240,11 +240,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: train_dataset, val_dataset = datamanager.get_dataset_for_training(split_id=X['split_id']) train_split, test_split = datamanager.splits[X['split_id']] - valid_indices = [] - idx_start = 0 - - - num_instances_dataset = np.size(train_split) num_instances_train = self.num_batches_per_epoch * self.batch_size @@ -259,6 +254,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: num_instances_per_seqs = num_instances_per_seqs.astype(seq_train_length.dtype) # at least one element of each sequence should be selected + # TODO consider the case where num_instances_train is greater than num_instances_dataset, + # In which case we simply iterate through all the datasets + """ # to allow a time sequence data with resolution self.sample_interval and windows size with self.window_size # we need to drop the first part of each sequence diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index de4578fbd..632616881 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -12,6 +12,7 @@ """ from typing import Any, Dict, Optional, Type +import torch from torch.nn.modules.loss import ( BCEWithLogitsLoss, CrossEntropyLoss, @@ -21,21 +22,46 @@ from torch.nn.modules.loss import _Loss as Loss from autoPyTorch.constants import BINARY, CLASSIFICATION_TASKS, CONTINUOUS, MULTICLASS, REGRESSION_TASKS, \ - STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, TASK_TYPES_TO_STRING + FORECASTING_TASKS, STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, TASK_TYPES_TO_STRING -losses = dict(classification=dict( - CrossEntropyLoss=dict( - module=CrossEntropyLoss, supported_output_types=[MULTICLASS, BINARY]), - BCEWithLogitsLoss=dict( - module=BCEWithLogitsLoss, supported_output_types=[BINARY])), +class LogProbLoss(Loss): + __constants__ = ['reduction'] + + def __init__(self, reduction: str = 'mean') -> None: + super(Loss, self).__init__(reduction) + 
+ def forward(self, input_dist: torch.distributions.Distribution, target_tensor: torch.Tensor) -> torch.Tensor: + scores = input_dist.log_prob(target_tensor) + if self.reduction == 'mean': + return - scores.mean() + elif self.reduction == 'sum': + return - scores.sum() + else: + return -scores + + +losses = dict( + classification=dict( + CrossEntropyLoss=dict( + module=CrossEntropyLoss, supported_output_types=[MULTICLASS, BINARY]), + BCEWithLogitsLoss=dict( + module=BCEWithLogitsLoss, supported_output_types=[BINARY])), regression=dict( MSELoss=dict( module=MSELoss, supported_output_types=[CONTINUOUS]), L1Loss=dict( - module=L1Loss, supported_output_types=[CONTINUOUS]))) + module=L1Loss, supported_output_types=[CONTINUOUS])), + forecasting=dict( + LogProbLoss=dict( + module=LogProbLoss, supported_output_types=[CONTINUOUS]), + MSELoss=dict( + module=MSELoss, supported_output_types=[CONTINUOUS]), + L1Loss=dict( + module=L1Loss, supported_output_types=[CONTINUOUS]), + )) -default_losses = dict(classification=CrossEntropyLoss, regression=MSELoss) +default_losses = dict(classification=CrossEntropyLoss, regression=MSELoss, forecasting=LogProbLoss) def get_default(task: int) -> Type[Loss]: @@ -51,6 +77,8 @@ def get_default(task: int) -> Type[Loss]: return default_losses['classification'] elif task in REGRESSION_TASKS: return default_losses['regression'] + elif task in FORECASTING_TASKS: + return default_losses['forecasting'] else: raise ValueError("Invalid task type {}".format(TASK_TYPES_TO_STRING[task])) diff --git a/autoPyTorch/pipeline/components/training/metrics/base.py b/autoPyTorch/pipeline/components/training/metrics/base.py index d8caf6716..131fbaf8b 100644 --- a/autoPyTorch/pipeline/components/training/metrics/base.py +++ b/autoPyTorch/pipeline/components/training/metrics/base.py @@ -26,7 +26,7 @@ def __init__(self, def __call__(self, y_true: np.ndarray, y_pred: np.ndarray, - sample_weight: Optional[List[float]] = None + sample_weight: Optional[List[float]] = None, ) -> float: raise NotImplementedError() @@ -37,6 +37,21 @@ def __repr__(self) -> str: return self.name +# This is a mixin for computing time series forecasting losses, the parameters are defined at: +# https://www.sktime.org/en/stable/api_reference/performance_metrics.html +# TODO considering adding more arguments to this function to allow more advanced loss function, e.g. 
asymmetric_error +class ForecastingMetricMixin: + def __call__(self, + y_true: np.ndarray, + y_pred: np.ndarray, + y_train: np.ndarray, + sp: int, + n_prediction_steps: int, + horizon_weight: Optional[List[float]] = None + ): + raise NotImplementedError() + + class _PredictMetric(autoPyTorchMetric): def __call__( self, @@ -75,7 +90,7 @@ def __call__( elif type_true == 'multilabel-indicator': y_pred[y_pred > 0.5] = 1.0 y_pred[y_pred <= 0.5] = 0.0 - elif type_true in ['continuous-multioutput', 'multiclass-multioutput']: + elif type_true == 'continuous-multioutput': pass else: raise ValueError(type_true) @@ -175,15 +190,95 @@ def __call__( return self._sign * self._metric_func(y_true, y_pred, **self._kwargs) +class _ForecastingMetric(ForecastingMetricMixin, autoPyTorchMetric): + def __call__( + self, + y_true: np.ndarray, + y_pred: np.ndarray, + y_train: np.ndarray, + sp: int, + n_prediction_steps: int, + horizon_weight: Optional[List[float]] = None + ) -> float: + """Evaluate time series forecastin losses given input data + The description is nearly the same as the one defined under + https://www.sktime.org/en/stable/api_reference/performance_metrics.html + + Parameters + ---------- + y_true : array-like + Ground truth (correct) target values. + + y_pred : array-like, [n_samples x n_classes] + Forecasted values. + + sp: int + Seasonal periodicity of training data. + + horizon_weight : array-like, optional (default=None) + Forecast horizon weights. + TODO consider weights for each individual prediction, i.e., we could mask the unobserved values + Returns + ------- + score : float + Score function applied to prediction of estimator on X. + """ + type_true = type_of_target(y_true) + if type_true == 'binary' and type_of_target(y_pred) == 'continuous' and \ + len(y_pred.shape) == 1: + # For a pred autoPyTorchMetric, no threshold, nor probability is required + # If y_true is binary, and y_pred is continuous + # it means that a rounding is necessary to obtain the binary class + y_pred = np.around(y_pred, decimals=0) + elif len(y_pred.shape) == 1 or y_pred.shape[1] == 1 or \ + type_true == 'continuous': + # must be regression, all other task types would return at least + # two probabilities + pass + elif type_true in ['binary', 'multiclass']: + y_pred = np.argmax(y_pred, axis=1) + elif type_true == 'multilabel-indicator': + y_pred[y_pred > 0.5] = 1.0 + y_pred[y_pred <= 0.5] = 0.0 + elif type_true in ['continuous-multioutput', 'multiclass-multioutput']: + pass + else: + raise ValueError(type_true) + + agg = self._kwargs['aggregation'] + y_true = y_true.reshape([-1, n_prediction_steps]) + y_pred = y_pred.reshape([-1, n_prediction_steps]) + + if not len(y_pred) == len(y_true) == len(y_train): + raise ValueError(f"The length of y_true, y_pred and y_train must equal, however, they are " + f"{len(y_pred)}, {len(y_true)}, and {y_train} respectively") + + losses_all = np.ones([len(y_true)]) + for seq_idx in range(len(y_true)): + losses_all[seq_idx] = self._sign * self._metric_func(y_true=y_true[seq_idx], + y_pred=y_pred[seq_idx], + y_train=y_train[seq_idx], + sp=sp, + horizon_weight=horizon_weight, + **self._kwargs) + if agg == 'mean': + return np.mean(losses_all) + elif agg == 'median': + return np.median(losses_all) + else: + raise ValueError(f'Unsupported aggregation type {agg}') + + def make_metric( - name: str, - score_func: Callable, - optimum: float = 1.0, - worst_possible_result: float = 0.0, - greater_is_better: bool = True, - needs_proba: bool = False, - needs_threshold: bool = False, - **kwargs: 
Any + name: str, + score_func: Callable, + optimum: float = 1.0, + worst_possible_result: float = 0.0, + greater_is_better: bool = True, + needs_proba: bool = False, + needs_threshold: bool = False, + do_forecasting: bool = False, + **kwargs: Any ) -> autoPyTorchMetric: """Make a autoPyTorchMetric from a performance metric or loss function. Factory inspired by scikit-learn which wraps scikit-learn scoring functions @@ -215,6 +310,8 @@ def make_metric( needs_threshold : boolean, default=False Whether score_func takes a continuous decision certainty. This only works for binary classification. + do_forecasting : boolean, default=False + Whether **kwargs : additional arguments Additional parameters to be passed to score_func. """ @@ -223,5 +320,7 @@ def make_metric( return _ProbaMetric(name, score_func, optimum, worst_possible_result, sign, kwargs) elif needs_threshold: return _ThresholdMetric(name, score_func, optimum, worst_possible_result, sign, kwargs) + elif do_forecasting: + return _ForecastingMetric(name, score_func, optimum, worst_possible_result, sign, kwargs) else: return _PredictMetric(name, score_func, optimum, worst_possible_result, sign, kwargs) diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index b669e4ede..9dbabac66 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -1,6 +1,6 @@ from functools import partial - +import sktime.performance_metrics.forecasting as forecasting_metrics import sklearn.metrics from smac.utils.constants import MAXINT @@ -46,6 +46,97 @@ f1 = make_metric('f1', sklearn.metrics.f1_score) +# Standard Forecasting Scores +mean_MASE_forecasting = make_metric('mean_MASE_forecasting', + forecasting_metrics.mean_absolute_scaled_error, + optimum=0, + worst_possible_result=MAXINT, + greater_is_better=False, + do_forecasting=True, + aggregation='mean', + ) + +median_MASE_forecasting = make_metric('median_absolute_scaled_error_forecasting', + forecasting_metrics.mean_absolute_scaled_error, + optimum=0, + worst_possible_result=MAXINT, + greater_is_better=False, + do_forecasting=True, + aggregation='median', + ) + +mean_MSSE_forecasting = make_metric('mean_MSSE_forecasting', + forecasting_metrics.mean_squared_scaled_error, + optimum=0, + worst_possible_result=MAXINT, + greater_is_better=False, + do_forecasting=True, + aggregation='mean', + ) + +median_MSSE_forecasting = make_metric('median_MSSE_forecasting', + forecasting_metrics.mean_squared_scaled_error, + optimum=0, + worst_possible_result=MAXINT, + greater_is_better=False, + do_forecasting=True, + aggregation='median', + ) + +mean_MAE_forecasting = make_metric('mean_MAE_forecasting', + forecasting_metrics.mean_absolute_error, + optimum=0, + worst_possible_result=MAXINT, + greater_is_better=False, + do_forecasting=True, + aggregation='mean', + ) + +median_MAE_forecasting = make_metric('median_absolute_error_forecasting', + forecasting_metrics.mean_absolute_error, + optimum=0, + worst_possible_result=MAXINT, + greater_is_better=False, + do_forecasting=True, + aggregation='median', + ) + +mean_MAPE_forecasting = make_metric('mean_MAPE_forecasting', + forecasting_metrics.mean_absolute_percentage_error, + optimum=0, + worst_possible_result=MAXINT, + greater_is_better=False, + do_forecasting=True, + aggregation='mean', + ) + +median_MAPE_forecasting = make_metric('median_MAPE_forecasting', + forecasting_metrics.mean_absolute_percentage_error, + 
optimum=0, + worst_possible_result=MAXINT, + greater_is_better=False, + do_forecasting=True, + aggregation='median', + ) + +mean_MSE_forecasting = make_metric('mean_MSE_forecasting', + forecasting_metrics.mean_squared_error, + optimum=0, + worst_possible_result=MAXINT, + greater_is_better=False, + do_forecasting=True, + aggregation='mean', + ) + +median_MSE_forecasting = make_metric('median_MSE_forecasting', + forecasting_metrics.mean_squared_error, + optimum=0, + worst_possible_result=MAXINT, + greater_is_better=False, + do_forecasting=True, + aggregation='median', + ) + # Score functions that need decision values roc_auc = make_metric('roc_auc', sklearn.metrics.roc_auc_score, needs_threshold=True) average_precision = make_metric('average_precision', @@ -75,6 +166,14 @@ log_loss]: CLASSIFICATION_METRICS[scorer.name] = scorer +FORECASTING_METRICS = dict() +for scorer in [mean_MASE_forecasting, median_MASE_forecasting, + mean_MSSE_forecasting, median_MSSE_forecasting, + mean_MAE_forecasting, median_MAE_forecasting, + mean_MAPE_forecasting, median_MAPE_forecasting, + mean_MSE_forecasting, median_MSE_forecasting]: + FORECASTING_METRICS[scorer.name] = scorer + for name, metric in [('precision', sklearn.metrics.precision_score), ('recall', sklearn.metrics.recall_score), ('f1', sklearn.metrics.f1_score)]: diff --git a/autoPyTorch/pipeline/components/training/metrics/utils.py b/autoPyTorch/pipeline/components/training/metrics/utils.py index 3c8208e15..936de5b66 100644 --- a/autoPyTorch/pipeline/components/training/metrics/utils.py +++ b/autoPyTorch/pipeline/components/training/metrics/utils.py @@ -6,11 +6,13 @@ from autoPyTorch.constants import ( CLASSIFICATION_TASKS, REGRESSION_TASKS, + FORECASTING_TASKS, STRING_TO_TASK_TYPES, TASK_TYPES, ) from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric -from autoPyTorch.pipeline.components.training.metrics.metrics import CLASSIFICATION_METRICS, REGRESSION_METRICS +from autoPyTorch.pipeline.components.training.metrics.metrics import CLASSIFICATION_METRICS, \ + REGRESSION_METRICS, FORECASTING_METRICS def sanitize_array(array: np.ndarray) -> np.ndarray: @@ -40,6 +42,8 @@ def get_supported_metrics(dataset_properties: Dict[str, Any]) -> Dict[str, autoP return REGRESSION_METRICS elif STRING_TO_TASK_TYPES[task_type] in CLASSIFICATION_TASKS: return CLASSIFICATION_METRICS + elif STRING_TO_TASK_TYPES[task_type] in FORECASTING_TASKS: + return FORECASTING_METRICS else: raise NotImplementedError(task_type) @@ -108,9 +112,14 @@ def calculate_score( prediction: np.ndarray, task_type: int, metrics: Iterable[autoPyTorchMetric], + **score_kwargs: Dict ) -> Dict[str, float]: score_dict = dict() - if task_type in REGRESSION_TASKS: + if task_type in FORECASTING_TASKS: + cprediction = sanitize_array(prediction) + for metric_ in metrics: + score_dict[metric_.name] = metric_(target, cprediction, **score_kwargs) + elif task_type in REGRESSION_TASKS: cprediction = sanitize_array(prediction) for metric_ in metrics: try: @@ -122,7 +131,6 @@ def calculate_score( continue else: raise e - else: for metric_ in metrics: try: diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py index fac404aef..6ef503931 100644 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py @@ -11,7 +11,7 @@ from torch.utils.tensorboard.writer import SummaryWriter -from autoPyTorch.constants import 
REGRESSION_TASKS +from autoPyTorch.constants import REGRESSION_TASKS, FORECASTING_TASKS from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score from autoPyTorch.utils.implementations import get_loss_weight_strategy @@ -279,7 +279,7 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, return loss_sum / N, {} def cast_targets(self, targets: torch.Tensor) -> torch.Tensor: - if self.task_type in REGRESSION_TASKS: + if self.task_type in REGRESSION_TASKS or FORECASTING_TASKS: targets = targets.float().to(self.device) # make sure that targets will have same shape as outputs (really important for mse loss for example) if targets.ndim == 1: @@ -302,6 +302,7 @@ def train_step(self, data: np.ndarray, targets: np.ndarray) -> Tuple[float, torc """ # prepare data = data.float().to(self.device) + targets = self.cast_targets(targets) data, criterion_kwargs = self.data_preparation(data, targets) diff --git a/autoPyTorch/utils/pipeline.py b/autoPyTorch/utils/pipeline.py index 3fb166e56..46b2da33b 100644 --- a/autoPyTorch/utils/pipeline.py +++ b/autoPyTorch/utils/pipeline.py @@ -135,17 +135,17 @@ def get_configuration_space(info: Dict[str, Any], task_type: int = STRING_TO_TASK_TYPES[info['task_type']] if task_type in REGRESSION_TASKS: - if task_type in FORECASTING_TASKS: - return _get_forecasting_configuration_space(info, - include if include is not None else {}, - exclude if exclude is not None else {}, - search_space_updates=search_space_updates - ) return _get_regression_configuration_space(info, include if include is not None else {}, exclude if exclude is not None else {}, search_space_updates=search_space_updates ) + elif task_type in FORECASTING_TASKS: + return _get_forecasting_configuration_space(info, + include if include is not None else {}, + exclude if exclude is not None else {}, + search_space_updates=search_space_updates + ) else: return _get_classification_configuration_space(info, include if include is not None else {}, diff --git a/requirements.txt b/requirements.txt index c3bbc1b23..f073b7ecd 100755 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ openml==0.11.0 lightgbm==3.1.1 catboost==0.24.4 pexpect==4.8.0 +sktime==0.8.0 From 6d6f2057e3fe8228fbaa1ef99c04f648a0ed55ce Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 24 Nov 2021 17:05:28 +0100 Subject: [PATCH 064/347] metric_kwargs for all loss computation items --- autoPyTorch/api/base_task.py | 2 + autoPyTorch/api/time_series_forecasting.py | 9 ++- autoPyTorch/datasets/time_series_dataset.py | 3 +- autoPyTorch/ensemble/ensemble_builder.py | 18 +++++ autoPyTorch/ensemble/ensemble_selection.py | 5 +- autoPyTorch/ensemble/singlebest_ensemble.py | 3 + autoPyTorch/evaluation/abstract_evaluator.py | 27 ++++--- ...time_series_forecasting_train_evaluator.py | 78 +++++++++++-------- .../pipeline/components/training/losses.py | 1 - .../components/training/metrics/base.py | 7 +- .../components/training/metrics/metrics.py | 10 ++- .../components/training/metrics/utils.py | 9 ++- 12 files changed, 115 insertions(+), 57 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 4519c1859..07ae7d19a 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -173,6 +173,7 @@ def __init__( self.search_space: Optional[ConfigurationSpace] = None self._dataset_requirements: Optional[List[FitRequirement]] = None self._metric: 
Optional[autoPyTorchMetric] = None + self._metrics_kwargs: Dict = {} self._logger: Optional[PicklableClientLogger] = None self.run_history: Optional[RunHistory] = None self.trajectory: Optional[List] = None @@ -812,6 +813,7 @@ def _search( output_type=STRING_TO_OUTPUT_TYPES[dataset.output_type], task_type=STRING_TO_TASK_TYPES[self.task_type], metrics=[self._metric], + metrics_kwargs=self._metrics_kwargs, opt_metric=optimize_metric, ensemble_size=self.ensemble_size, ensemble_nbest=self.ensemble_nbest, diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 51801ec46..43bad0f00 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -21,7 +21,7 @@ from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline from autoPyTorch.utils.backend import Backend from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates -from autoPyTorch.constants_forecasting import MAX_WINDOW_SIZE_BASE +from autoPyTorch.constants_forecasting import MAX_WINDOW_SIZE_BASE, SEASONALITY_MAP class TimeSeriesForecastingTask(BaseTask): @@ -272,6 +272,12 @@ def search( "Setting traditional_per_total_budget to 0.") traditional_per_total_budget = 0. + seasonality = SEASONALITY_MAP.get(self.dataset.freq, 1) + if isinstance(seasonality, list): + seasonality = min(seasonality) # Use to calculate MASE + self._metrics_kwargs = {'sp': seasonality, + 'n_prediction_steps': n_prediction_steps} + return self._search( dataset=self.dataset, optimize_metric=optimize_metric, @@ -290,7 +296,6 @@ def search( time_series_prediction=self.time_series_prediction ) - def predict( self, X_test: List[np.ndarray], diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 4791bbf3f..423f8fce9 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -530,7 +530,8 @@ def get_required_dataset_info(self) -> Dict[str, Any]: def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> Dict[str, Any]: dataset_properties = super().get_dataset_properties(dataset_requirements=dataset_requirements) - dataset_properties.update({'upper_window_size': self.upper_window_size, + dataset_properties.update({'n_prediction_steps': self.n_prediction_steps, + 'upper_window_size': self.upper_window_size, 'sequence_lengths_train': self.sequence_lengths_train}) return dataset_properties diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py index ecf188911..5298540fb 100644 --- a/autoPyTorch/ensemble/ensemble_builder.py +++ b/autoPyTorch/ensemble/ensemble_builder.py @@ -64,6 +64,7 @@ def __init__( ensemble_memory_limit: Optional[int], random_state: int, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + metrics_kwargs: Optional[Dict]=None, ): """ SMAC callback to handle ensemble building Parameters @@ -112,6 +113,8 @@ def __init__( read at most n new prediction files in each iteration logger_port: int port in where to publish a msg + metrics_kwargs: Optional[Dict] + additional information for comuting metrics Returns ------- List[Tuple[int, float, float, float]]: @@ -125,6 +128,7 @@ def __init__( self.task_type = task_type self.output_type = output_type self.metrics = metrics + self.metrics_kwargs = metrics_kwargs self.opt_metric = opt_metric self.ensemble_size = ensemble_size self.ensemble_nbest = ensemble_nbest @@ -240,6 +244,7 @@ def 
build_ensemble( pynisher_context=pynisher_context, logger_port=self.logger_port, unit_test=unit_test, + metric_kwargs=self.metrics_kwargs, )) logger.info( @@ -281,6 +286,7 @@ def fit_and_return_ensemble( pynisher_context: str, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, unit_test: bool = False, + metric_kwargs: Dict = {} ) -> Tuple[ List[Dict[str, float]], int, @@ -341,6 +347,8 @@ def fit_and_return_ensemble( Having this is very bad coding style, but I did not find a way to make unittest.mock work through the pynisher with all spawn contexts. If you know a better solution, please let us know by opening an issue. + metric_kwargs: Dict + additional arguments for computing metrics, this is used for time series forecasting computation Returns ------- List[Tuple[int, float, float, float]] @@ -364,6 +372,7 @@ def fit_and_return_ensemble( random_state=random_state, logger_port=logger_port, unit_test=unit_test, + metric_kwargs=metric_kwargs, ).run( end_at=end_at, iteration=iteration, @@ -393,6 +402,7 @@ def __init__( random_state: Optional[Union[int, np.random.RandomState]] = None, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, unit_test: bool = False, + metric_kwargs: Dict = {} ): """ Constructor @@ -446,6 +456,8 @@ def __init__( Having this is very bad coding style, but I did not find a way to make unittest.mock work through the pynisher with all spawn contexts. If you know a better solution, please let us know by opening an issue. + metric_kwargs: Dict + additional arguments for computing metrics, this is used for time series forecasting computation """ super(EnsembleBuilder, self).__init__() @@ -455,6 +467,7 @@ def __init__( self.task_type = task_type self.output_type = output_type self.metrics = metrics + self.metric_kwargs = metric_kwargs self.opt_metric = opt_metric self.ensemble_size = ensemble_size self.performance_range_threshold = performance_range_threshold @@ -966,6 +979,7 @@ def score_ensemble_preds(self) -> bool: target=self.y_true_ensemble, prediction=y_ensemble, task_type=self.task_type, + **self.metric_kwargs ) try: y_ensemble = self._read_np_fn(y_ens_fn) @@ -974,6 +988,7 @@ def score_ensemble_preds(self) -> bool: target=self.y_true_ensemble, prediction=y_ensemble, task_type=self.task_type, + **self.metric_kwargs ) if np.isfinite(self.read_scores[y_ens_fn]["ens_score"]): @@ -1285,6 +1300,7 @@ def fit_ensemble(self, selected_keys: List[str]) -> Optional[EnsembleSelection]: metric=opt_metric, random_state=self.random_state, task_type=self.task_type, + metric_kwargs=self.metric_kwargs ) try: @@ -1402,6 +1418,7 @@ def _add_ensemble_trajectory(self, train_pred: np.ndarray, test_pred: np.ndarray target=self.y_true_ensemble, prediction=train_pred, task_type=self.task_type, + **self.metric_kwargs ) performance_stamp.update({'train_' + str(key): val for key, val in train_scores.items()}) if self.y_test is not None: @@ -1410,6 +1427,7 @@ def _add_ensemble_trajectory(self, train_pred: np.ndarray, test_pred: np.ndarray target=self.y_test, prediction=test_pred, task_type=self.task_type, + **self.metric_kwargs ) performance_stamp.update( {'test_' + str(key): val for key, val in test_scores.items()}) diff --git a/autoPyTorch/ensemble/ensemble_selection.py b/autoPyTorch/ensemble/ensemble_selection.py index 6f701787a..a1822a742 100644 --- a/autoPyTorch/ensemble/ensemble_selection.py +++ b/autoPyTorch/ensemble/ensemble_selection.py @@ -1,5 +1,5 @@ from collections import Counter -from typing import Any, Dict, List, Tuple, Union +from typing import Any, Dict, List, 
Tuple, Union, Optional import numpy as np @@ -16,9 +16,11 @@ def __init__( metric: autoPyTorchMetric, task_type: int, random_state: np.random.RandomState, + metric_kwargs: Dict = {}, ) -> None: self.ensemble_size = ensemble_size self.metric = metric + self.metric_kwargs = metric_kwargs self.random_state = random_state self.task_type = task_type @@ -118,6 +120,7 @@ def _fit( target=labels, prediction=fant_ensemble_prediction, task_type=self.task_type, + **self.metric_kwargs ) scores[j] = self.metric._optimum - score[self.metric.name] diff --git a/autoPyTorch/ensemble/singlebest_ensemble.py b/autoPyTorch/ensemble/singlebest_ensemble.py index c6fbaf576..c01939a68 100644 --- a/autoPyTorch/ensemble/singlebest_ensemble.py +++ b/autoPyTorch/ensemble/singlebest_ensemble.py @@ -25,8 +25,11 @@ def __init__( run_history: RunHistory, seed: int, backend: Backend, + metric_kwargs: Dict = {}, + ): self.metric = metric + self.metric_kwargs = metric_kwargs self.seed = seed self.backend = backend diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index b84595009..689bb019f 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -181,24 +181,35 @@ def get_default_pipeline_options() -> Dict[str, Any]: 'runtime': 1} -class DummyTimeSeriesPredictionPipeline(DummyClassificationPipeline): +class DummyTimeSeriesForecastingPipeline(DummyClassificationPipeline): def __init__(self, config: Configuration, random_state: Optional[Union[int, np.random.RandomState]] = None, init_params: Optional[Dict] = None, n_prediction_steps: int = 1, ) -> None: - super(DummyTimeSeriesPredictionPipeline, self).__init__(config, random_state, init_params) + super(DummyTimeSeriesForecastingPipeline, self).__init__(config, random_state, init_params) self.n_prediction_steps = n_prediction_steps + def fit(self, X: Dict[str, Any], y: Any, + sample_weight: Optional[np.ndarray] = None) -> object: + self.n_prediction_steps = X['dataset_properties']['n_prediction_steps'] + return super(DummyTimeSeriesForecastingPipeline, self).fit(X, y) + def predict_proba(self, X: Union[np.ndarray, pd.DataFrame], batch_size: int = 1000) -> np.array: new_X = X[-self.n_prediction_steps:] - return super(DummyTimeSeriesPredictionPipeline, self).predict_proba(new_X) + return super(DummyTimeSeriesForecastingPipeline, self).predict_proba(new_X) def predict(self, X: Union[np.ndarray, pd.DataFrame], batch_size: int = 1000) -> np.array: new_X = X[-self.n_prediction_steps:] - return super(DummyTimeSeriesPredictionPipeline, self).predict(new_X).astype(np.float32) + return super(DummyTimeSeriesForecastingPipeline, self).predict(new_X).astype(np.float32) + + @staticmethod + def get_default_pipeline_options() -> Dict[str, Any]: + return {'budget_type': 'epochs', + 'epochs': 1, + 'runtime': 1} def fit_and_suppress_warnings(logger: PicklableClientLogger, pipeline: BaseEstimator, @@ -319,14 +330,8 @@ def __init__(self, backend: Backend, raise ValueError('task {} not available'.format(self.task_type)) self.predict_function = self._predict_proba elif self.task_type in FORECASTING_TASKS: - if not isinstance(self.datamanager, TimeSeriesForecastingDataset): - raise ValueError(f'to perform time series forecastin tasks, the dataset must be ' - f'autopytorch.dataset.TimeSeriesForecastingDataset.' 
- f'However, it is {type(self.datamanager)}') - n_prediction_steps = self.datamanager.n_prediction_steps if isinstance(self.configuration, int): - self.pipeline_class = partial(partial(DummyTimeSeriesPredictionPipeline, - n_prediction_steps=n_prediction_steps)) + self.pipeline_class = DummyTimeSeriesForecastingPipeline elif isinstance(self.configuration, str): raise ValueError("Only tabular classifications tasks " "are currently supported with traditional methods") diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 6e230604f..980b55e32 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -2,7 +2,7 @@ from autoPyTorch.evaluation.abstract_evaluator import DummyClassificationPipeline from multiprocessing.queues import Queue -from typing import Any, Dict, List, Optional, Tuple, Union, no_type_check, ClassVar +from typing import Any, Dict, List, Optional, Tuple, Union, no_type_check, ClassVar, Sequence from functools import partial import warnings @@ -15,8 +15,8 @@ from smac.tae import StatusType - from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.metrics import MASE_LOSSES from autoPyTorch.utils.backend import Backend from autoPyTorch.utils.common import subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -72,7 +72,6 @@ def __init__(self, backend: Backend, queue: Queue, seasonality = min(seasonality) # Use to calculate MASE self.seasonality = int(seasonality) - def fit_predict_and_loss(self) -> None: """Fit, predict and compute the loss for cross-validation and holdout""" @@ -80,7 +79,6 @@ def fit_predict_and_loss(self) -> None: .format(self.__class__.__name__) additional_run_info: Optional[Dict] = None - if self.num_folds == 1: split_id = 0 self.logger.info("Starting fit {}".format(split_id)) @@ -106,19 +104,11 @@ def fit_predict_and_loss(self) -> None: test_indices=test_split, add_pipeline_to_self=True) - # use for computing MASE losses - y_train_forecasting = [] - for seq_idx, test_idx in enumerate(test_split): - # TODO consider multi-variants here - seq = self.datamanager[test_idx][0] - if seq.shape[-1] > 1: - y_train_forecasting.append(seq[self.datamanager.target_variables].squeeze()) - else: - y_train_forecasting.append(seq.squeeze()) + mase_cofficient = self.compute_mase_coefficient(test_split) - forecasting_kwargs = {'y_train': y_train_forecasting, - 'sp': self.seasonality, + forecasting_kwargs = {'sp': self.seasonality, 'n_prediction_steps': self.n_prediction_steps, + 'mase_cofficient': mase_cofficient, } train_loss = None @@ -129,10 +119,12 @@ def fit_predict_and_loss(self) -> None: status = StatusType.SUCCESS + self.Y_optimization *= mase_cofficient + self.finish_up( loss=loss, train_loss=train_loss, - opt_pred=y_opt_pred, + opt_pred=y_opt_pred.flatten() * mase_cofficient, valid_pred=y_valid_pred, test_pred=y_test_pred, additional_run_info=additional_run_info, @@ -158,18 +150,10 @@ def fit_predict_and_loss(self) -> None: # weights for opt_losses. 
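The rescaling applied above is what turns a plain mean-absolute-error metric into MASE: both the stored optimization targets and the predictions are multiplied by the inverse of the in-sample seasonal-naive error of each sequence, so the scale factor never has to be stored alongside the predictions. A rough numpy sketch of the idea (illustrative only; the names below are placeholders, not identifiers from this patch):

import numpy as np

def inverse_seasonal_naive_error(y_past, sp, eps=1e-10):
    # 1 / mean(|y_t - y_{t-sp}|), i.e. the inverse of the MASE denominator
    return 1.0 / max(np.mean(np.abs(y_past[sp:] - y_past[:-sp])), eps)

y_past = np.array([10.0, 12.0, 11.0, 13.0, 12.0, 14.0])   # training part of one sequence
y_true = np.array([13.0, 15.0])                            # held-out horizon
y_pred = np.array([12.0, 16.0])
c = inverse_seasonal_naive_error(y_past, sp=1)
mase = np.mean(np.abs(y_true * c - y_pred * c))            # equals MAE(y_true, y_pred) / naive error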
opt_fold_weights = [np.NaN] * self.num_folds - y_train_forecasting_all = [] - for i, (train_split, test_split) in self.splits: - # use for computing MASE losses - y_train_forecasting = [] - for seq_idx, test_idx in enumerate(test_split): - # TODO consider multi-variants here - seq = self.datamanager[test_idx][0] - if seq.shape[-1] > 1: - y_train_forecasting.append(seq[self.datamanager.target_variables].squeeze()) - else: - y_train_forecasting.append(seq.squeeze()) - y_train_forecasting_all.append(y_train_forecasting) + mase_coefficient_all = [] + for train_split, test_split in self.splits: + mase_coefficient = self.compute_mase_coefficient(test_split) + mase_coefficient_all.append(mase_coefficient) for i, (train_split, test_split) in enumerate(self.splits): pipeline = self.pipelines[i] @@ -200,7 +184,7 @@ def fit_predict_and_loss(self) -> None: # the average. train_fold_weights[i] = len(train_split) - forecasting_kwargs = {'y_train': y_train_forecasting_all[i], + forecasting_kwargs = {'mase_cofficient': mase_coefficient_all[i], 'sp': self.seasonality, 'n_prediction_steps': self.n_prediction_steps, } @@ -244,10 +228,10 @@ def fit_predict_and_loss(self) -> None: Y_train_targets = self.Y_train_targets Y_optimization_preds = np.concatenate( - [Y_optimization_pred[i] for i in range(self.num_folds) + [Y_optimization_pred[i] * mase_coefficient_all[i] for i in range(self.num_folds) if Y_optimization_pred[i] is not None]) Y_targets = np.concatenate([ - Y_targets[i] for i in range(self.num_folds) + Y_targets[i] * mase_coefficient_all[i] for i in range(self.num_folds) if Y_targets[i] is not None ]) @@ -281,7 +265,7 @@ def fit_predict_and_loss(self) -> None: self.finish_up( loss=opt_loss, train_loss=train_loss, - opt_pred=Y_optimization_preds, + opt_pred=Y_optimization_preds.flatten(), valid_pred=Y_valid_preds, test_pred=Y_test_preds, additional_run_info=additional_run_info, @@ -289,6 +273,36 @@ def fit_predict_and_loss(self) -> None: status=status, ) + def compute_mase_coefficient(self, test_split: Sequence) -> np.ndarray: + """ + Compute the denominator for Mean Absolute Scaled Losses, + For detail, please check sktime.performance_metrics.forecasting._functions.mean_absolute_scaled_error + + Parameters: + ---------- + test_split: Sequence + test splits, consistent of int + Return: + ---------- + mase_coefficient: np.ndarray(self.num_sequence * self.n_prediction_steps) + inverse of the mase_denominator + """ + mase_coefficient = np.ones(len(test_split)) + if any(mase_loss in self.additional_metrics for mase_loss in MASE_LOSSES) or self.metric in MASE_LOSSES: + from sktime.performance_metrics.forecasting._functions import EPS, mean_absolute_error + for seq_idx, test_idx in enumerate(test_split): + seq = self.datamanager[test_idx][0] + if seq.shape[-1] > 1: + seq = seq[self.datamanager.target_variables].squeeze() + else: + seq = seq.squeeze() + mase_denominator = mean_absolute_error(seq[self.seasonality:], + seq[:-self.seasonality], + multioutput="uniform_average") + mase_coefficient[seq_idx] = 1.0 / np.maximum(mase_denominator, EPS) + mase_coefficient = np.repeat(mase_coefficient, self.n_prediction_steps) + return mase_coefficient + def _predict(self, pipeline: BaseEstimator, train_indices: Union[np.ndarray, List], test_indices: Union[np.ndarray, List], diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index 632616881..abe9dcea1 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ 
b/autoPyTorch/pipeline/components/training/losses.py @@ -82,7 +82,6 @@ def get_default(task: int) -> Type[Loss]: else: raise ValueError("Invalid task type {}".format(TASK_TYPES_TO_STRING[task])) - def get_supported_losses(task: int, output_type: int) -> Dict[str, Type[Loss]]: """ Utility function to get supported losses for a given task and output type diff --git a/autoPyTorch/pipeline/components/training/metrics/base.py b/autoPyTorch/pipeline/components/training/metrics/base.py index 131fbaf8b..f57085e5a 100644 --- a/autoPyTorch/pipeline/components/training/metrics/base.py +++ b/autoPyTorch/pipeline/components/training/metrics/base.py @@ -44,7 +44,6 @@ class ForecastingMetricMixin: def __call__(self, y_true: np.ndarray, y_pred: np.ndarray, - y_train: np.ndarray, sp: int, n_prediction_steps: int, horizon_weight: Optional[List[float]] = None @@ -195,7 +194,6 @@ def __call__( self, y_true: np.ndarray, y_pred: np.ndarray, - y_train: np.ndarray, sp: int, n_prediction_steps: int, horizon_weight: Optional[List[float]] = None @@ -249,15 +247,14 @@ def __call__( y_true = y_true.reshape([-1, n_prediction_steps]) y_pred = y_pred.reshape([-1, n_prediction_steps]) - if not len(y_pred) == len(y_true) == len(y_train): + if not len(y_pred) == len(y_true): raise ValueError(f"The length of y_true, y_pred and y_train must equal, however, they are " - f"{len(y_pred)}, {len(y_true)}, and {y_train} respectively") + f"{len(y_pred)} and {len(y_true)} respectively") losses_all = np.ones([len(y_true)]) for seq_idx in range(len(y_true)): losses_all[seq_idx] = self._sign * self._metric_func(y_true=y_true[seq_idx], y_pred=y_pred[seq_idx], - y_train=y_train[seq_idx], sp=sp, horizon_weight=horizon_weight, **self._kwargs) diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index 9dbabac66..f8a19b4a0 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -47,8 +47,11 @@ sklearn.metrics.f1_score) # Standard Forecasting Scores + +# To avoid storing unnecessary scale values here, we scale all the values under +# AutoPytorch.evaluation.time_series_forecasting_train_evaluator mean_MASE_forecasting = make_metric('mean_MASE_forecasting', - forecasting_metrics.mean_absolute_scaled_error, + forecasting_metrics.mean_absolute_error, optimum=0, worst_possible_result=MAXINT, greater_is_better=False, @@ -57,7 +60,7 @@ ) median_MASE_forecasting = make_metric('median_absolute_scaled_error_forecasting', - forecasting_metrics.mean_absolute_scaled_error, + forecasting_metrics.mean_absolute_error, optimum=0, worst_possible_result=MAXINT, greater_is_better=False, @@ -65,6 +68,9 @@ aggregation='median', ) +MASE_LOSSES = [mean_MASE_forecasting, median_MASE_forecasting] + + mean_MSSE_forecasting = make_metric('mean_MSSE_forecasting', forecasting_metrics.mean_squared_scaled_error, optimum=0, diff --git a/autoPyTorch/pipeline/components/training/metrics/utils.py b/autoPyTorch/pipeline/components/training/metrics/utils.py index 936de5b66..803ae74ed 100644 --- a/autoPyTorch/pipeline/components/training/metrics/utils.py +++ b/autoPyTorch/pipeline/components/training/metrics/utils.py @@ -12,7 +12,7 @@ ) from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.metrics import CLASSIFICATION_METRICS, \ - REGRESSION_METRICS, FORECASTING_METRICS + REGRESSION_METRICS, FORECASTING_METRICS, MASE_LOSSES def 
sanitize_array(array: np.ndarray) -> np.ndarray: @@ -118,7 +118,12 @@ def calculate_score( if task_type in FORECASTING_TASKS: cprediction = sanitize_array(prediction) for metric_ in metrics: - score_dict[metric_.name] = metric_(target, cprediction, **score_kwargs) + if metric_ in MASE_LOSSES and 'mase_cofficient' in score_kwargs: + target_scaled = target * score_kwargs['mase_cofficient'] + cprediction_scaled = cprediction * score_kwargs['mase_cofficient'] + score_dict[metric_.name] = metric_(target_scaled, cprediction_scaled, **score_kwargs) + else: + score_dict[metric_.name] = metric_(target, cprediction, **score_kwargs) elif task_type in REGRESSION_TASKS: cprediction = sanitize_array(prediction) for metric_ in metrics: From e61a931f30064d5cf9b4b6dc144e09b23ce0dfbf Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 24 Nov 2021 19:56:13 +0100 Subject: [PATCH 065/347] maint --- autoPyTorch/api/time_series_forecasting.py | 5 +++ autoPyTorch/datasets/time_series_dataset.py | 10 ++++++ .../scaling/utils.py | 2 +- .../setup/network_head/distribution.py | 36 +++++++++++++++++++ .../time_series_forecasting_data_loader.py | 5 +-- .../pipeline/components/training/losses.py | 7 +++- autoPyTorch/utils/pipeline.py | 26 +++++++++---- 7 files changed, 77 insertions(+), 14 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 43bad0f00..dbb0a120f 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -138,6 +138,7 @@ def search( load_models: bool = True, shift_input_data: bool = True, normalize_y: bool = True, + train_with_log_prob: bool = True ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -211,6 +212,9 @@ def search( if the input data needs to be shifted normalize_y: bool if the input y values need to be normalized + train_with_log_prob: bool + if the network is trained with log_prob losses, this will create a network header that is different + from the current version. Returns: self @@ -244,6 +248,7 @@ def search( n_prediction_steps=n_prediction_steps, shift_input_data=shift_input_data, normalize_y=normalize_y, + train_with_log_prob=train_with_log_prob, ) if self.dataset.freq_value is not None or not self.customized_window_size: diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 423f8fce9..9ac3a0b5d 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -163,6 +163,7 @@ def __init__(self, n_prediction_steps: int = 1, shift_input_data: bool = True, normalize_y: bool = True, + train_with_log_prob: bool = True, ): """ :param target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] used for multi-variant forecasting @@ -175,6 +176,9 @@ def __init__(self, such that the data until X[t] is applied to predict the value y[t+n_prediction_steps] :param normalize_y: bool if y values needs to be normalized with mean 0 and variance 1 + :param train_with_log_prob: bool + if the dataset is trained with log_prob losses, this needs to be specified in the very beginning such that the + header's configspace can be built beforehand. """ assert X is not Y, "Training and Test data needs to belong two different object!!!"
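The train_with_log_prob option documented above refers to training with a distributional network head whose loss is the negative log-likelihood of the targets (the LogProbLoss touched in losses.py further down in this patch). A minimal, illustrative sketch of that pattern for a Gaussian output; the class and variable names are hypothetical and not part of this code base:

import torch
import torch.nn as nn
from torch.distributions import Normal

class GaussianProjection(nn.Module):
    # maps backbone features to the parameters of a Normal distribution
    def __init__(self, in_features: int, n_prediction_steps: int):
        super().__init__()
        self.loc = nn.Linear(in_features, n_prediction_steps)
        self.scale = nn.Linear(in_features, n_prediction_steps)

    def forward(self, features: torch.Tensor) -> Normal:
        loc = self.loc(features)
        scale = nn.functional.softplus(self.scale(features)) + 1e-6  # keep the scale strictly positive
        return Normal(loc, scale)

features = torch.randn(8, 32)                 # batch of backbone outputs
dist = GaussianProjection(32, 5)(features)
targets = torch.randn(8, 5)
loss = -dist.log_prob(targets).mean()         # the log-prob training objective
loss.backward()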
self.n_prediction_steps = n_prediction_steps @@ -301,6 +305,11 @@ def __init__(self, self.freq: Optional[str] = freq self.freq_value: Optional[int] = freq_value + # TODO in the future, if training losses types are considered as a type of hyperparameters, we need to remove + # this line and create conditional configspace under + # autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice . + self.train_with_log_prob = train_with_log_prob + def __getitem__(self, idx, train=True): if idx < 0: if -idx > len(self): @@ -525,6 +534,7 @@ def get_required_dataset_info(self) -> Dict[str, Any]: 'numerical_columns': self.numerical_columns, 'categorical_columns': self.categorical_columns, 'upper_window_size': self.upper_window_size, + 'train_with_log_prob': self.train_with_log_prob }) return info diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index 3d6a39e70..cec1255dd 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -9,7 +9,7 @@ # Similar to / inspired by # https://github.com/tslearn-team/tslearn/blob/a3cf3bf/tslearn/preprocessing/preprocessing.py class TimeSeriesScaler(BaseEstimator): - def __init__(self, mode: str, sequence_lengths_train:List[int], is_training=True): + def __init__(self, mode: str, sequence_lengths_train: List[int], is_training=True): self.mode = mode self.sequence_lengths_train = sequence_lengths_train self.is_training = is_training diff --git a/autoPyTorch/pipeline/components/setup/network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/distribution.py index e69de29bb..b836e61eb 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/distribution.py @@ -0,0 +1,36 @@ +# This part mainly follows the implementation in gluonts: +# https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/torch/modules/distribution_output.py +# However, we don't simply follow their implementation mainly due to the different network backbone. 
+# Additionally, we rescale the output in the later phases to avoid + +from typing import Callable, Dict, Optional, Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.distributions import ( + AffineTransform, + Beta, + Distribution, + Gamma, + NegativeBinomial, + Normal, + Poisson, + StudentT, + TransformedDistribution, +) + + +class ProjectionLayer(nn.Module): + """ + A projection layer that + """ + def __init__(self, + in_features: int, + n_prediction_steps: int, + args_dims: [int], + domain_map: Callable[..., Tuple[torch.Tensor]], + **kwargs,): + super().__init__(**kwargs) + diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 085339e32..8e455c79c 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -21,7 +21,7 @@ from autoPyTorch.datasets.base_dataset import TransformSubset from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence -from autoPyTorch.utils.common import custom_collate_fn +from autoPyTorch.utils.common import custom_collate_fn from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ TimeSeriesTransformer @@ -92,9 +92,6 @@ def __len__(self): return self.num_instances - - - class ExpandTransformTimeSeries(object): """Expand Dimensionality so tabular transformations see a 2d Array, unlike the ExpandTransform defined under tabular dataset, the dimension is expanded diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index abe9dcea1..300a3faac 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -29,7 +29,7 @@ class LogProbLoss(Loss): __constants__ = ['reduction'] def __init__(self, reduction: str = 'mean') -> None: - super(Loss, self).__init__(reduction) + super(LogProbLoss, self).__init__(reduction) def forward(self, input_dist: torch.distributions.Distribution, target_tensor: torch.Tensor) -> torch.Tensor: scores = input_dist.log_prob(target_tensor) @@ -82,6 +82,7 @@ def get_default(task: int) -> Type[Loss]: else: raise ValueError("Invalid task type {}".format(TASK_TYPES_TO_STRING[task])) + def get_supported_losses(task: int, output_type: int) -> Dict[str, Type[Loss]]: """ Utility function to get supported losses for a given task and output type @@ -102,6 +103,10 @@ def get_supported_losses(task: int, output_type: int) -> Dict[str, Type[Loss]]: for key, value in losses['regression'].items(): if output_type in value['supported_output_types']: supported_losses[key] = value['module'] + elif task in FORECASTING_TASKS: + for key, value in losses['forecasting'].items(): + if output_type in value['supported_output_types']: + supported_losses[key] = value['module'] return supported_losses diff --git a/autoPyTorch/utils/pipeline.py b/autoPyTorch/utils/pipeline.py index 46b2da33b..4ad4a612d 100644 --- a/autoPyTorch/utils/pipeline.py +++ b/autoPyTorch/utils/pipeline.py @@ -66,8 +66,10 @@ def get_dataset_requirements(info: Dict[str, Any], if task_type in REGRESSION_TASKS: return 
_get_regression_dataset_requirements(info, include, exclude) - else: + elif task_type in CLASSIFICATION_TASKS: return _get_classification_dataset_requirements(info, include, exclude) + else: + return _get_forecasting_dataset_requirements(info, include, exclude) def _get_regression_dataset_requirements(info: Dict[str, Any], include: Dict[str, List[str]], @@ -86,13 +88,6 @@ def _get_regression_dataset_requirements(info: Dict[str, Any], include: Dict[str include=include, exclude=exclude ).get_dataset_requirements() - elif task_type in FORECASTING_TASKS: - return TimeSeriesForecastingPipeline( - dataset_properties=info, - include=include, - exclude=exclude - ).get_dataset_requirements() - else: raise ValueError("Task_type not supported") @@ -127,6 +122,21 @@ def _get_classification_dataset_requirements(info: Dict[str, Any], raise ValueError("Task_type not supported") +def _get_forecasting_dataset_requirements(info: Dict[str, Any], include: Dict[str, List[str]], + exclude: Dict[str, List[str]]) -> List[FitRequirement]: + task_type = STRING_TO_TASK_TYPES[info['task_type']] + + if task_type in FORECASTING_TASKS: + return TimeSeriesForecastingPipeline( + dataset_properties=info, + include=include, + exclude=exclude + ).get_dataset_requirements() + + else: + raise ValueError("Task_type not supported") + + def get_configuration_space(info: Dict[str, Any], include: Optional[Dict] = None, exclude: Optional[Dict] = None, From 98a513f49a961835dafeff7d8e8985bdd3869154 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 25 Nov 2021 19:17:04 +0100 Subject: [PATCH 066/347] fix conflit --- autoPyTorch/api/time_series_classification.py | 3 +- autoPyTorch/api/time_series_forecasting.py | 2 +- autoPyTorch/api/time_series_regression.py | 3 +- autoPyTorch/datasets/resampling_strategy.py | 118 +--- autoPyTorch/datasets/time_series_dataset.py | 13 +- ...time_series_forecasting_train_evaluator.py | 2 +- autoPyTorch/optimizer/smbo.py | 22 +- autoPyTorch/pipeline/base_pipeline.py | 1 + .../components/setup/network/base_network.py | 8 +- .../setup/network_backbone/LSTMBackbone.py | 65 +-- .../network_backbone/base_network_backbone.py | 46 +- .../time_series_forecasting_data_loader.py | 68 +-- .../pipeline/time_series_classification.py | 72 ++- .../pipeline/time_series_forecasting.py | 115 ++-- .../pipeline/time_series_regression.py | 70 ++- autoPyTorch/utils/backend.py | 512 ------------------ autoPyTorch/utils/pipeline.py | 68 +-- .../example_time_series_classification.py | 103 ---- ...ple_time_series_classification_pipeline.py | 153 ------ ..._series_classification_sequential_mnist.py | 104 ---- examples/example_time_series_regression.py | 103 ---- ...example_time_series_regression_pipeline.py | 149 ----- test/conftest.py | 4 - 23 files changed, 305 insertions(+), 1499 deletions(-) delete mode 100644 autoPyTorch/utils/backend.py delete mode 100644 examples/example_time_series_classification.py delete mode 100644 examples/example_time_series_classification_pipeline.py delete mode 100644 examples/example_time_series_classification_sequential_mnist.py delete mode 100644 examples/example_time_series_regression.py delete mode 100644 examples/example_time_series_regression_pipeline.py diff --git a/autoPyTorch/api/time_series_classification.py b/autoPyTorch/api/time_series_classification.py index 0d6241edf..caa5a5d0a 100644 --- a/autoPyTorch/api/time_series_classification.py +++ b/autoPyTorch/api/time_series_classification.py @@ -1,3 +1,4 @@ +#TODO Note: This API is still under construction! 
import os import uuid from typing import Any, Callable, Dict, List, Optional, Union @@ -19,7 +20,7 @@ ) from autoPyTorch.datasets.time_series_dataset import TimeSeriesDataset from autoPyTorch.pipeline.time_series_classification import TimeSeriesClassificationPipeline -from autoPyTorch.utils.backend import Backend +from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 637a5b476..5e6b342f4 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -19,7 +19,7 @@ ) from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline -from autoPyTorch.utils.backend import Backend +from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.constants_forecasting import MAX_WINDOW_SIZE_BASE, SEASONALITY_MAP diff --git a/autoPyTorch/api/time_series_regression.py b/autoPyTorch/api/time_series_regression.py index 70d3ac48e..aefaed97b 100644 --- a/autoPyTorch/api/time_series_regression.py +++ b/autoPyTorch/api/time_series_regression.py @@ -1,3 +1,4 @@ +#TODO Note: This API is still under construction! import os import uuid from typing import Any, Callable, Dict, List, Optional, Union @@ -19,7 +20,7 @@ from autoPyTorch.datasets.time_series_dataset import TimeSeriesDataset from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline from autoPyTorch.pipeline.time_series_regression import TimeSeriesRegressionPipeline -from autoPyTorch.utils.backend import Backend +from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 241bf775a..e12b314a7 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -105,119 +105,6 @@ def is_stratified(self) -> bool: } -def get_cross_validators(*cross_val_types: CrossValTypes) -> Dict[str, CROSS_VAL_FN]: - cross_validators = {} # type: Dict[str, CROSS_VAL_FN] - for cross_val_type in cross_val_types: - cross_val_fn = globals()[cross_val_type.name] - cross_validators[cross_val_type.name] = cross_val_fn - return cross_validators - - -def get_holdout_validators(*holdout_val_types: HoldoutValTypes) -> Dict[str, HOLDOUT_FN]: - holdout_validators = {} # type: Dict[str, HOLDOUT_FN] - for holdout_val_type in holdout_val_types: - holdout_val_fn = globals()[holdout_val_type.name] - holdout_validators[holdout_val_type.name] = holdout_val_fn - return holdout_validators - - -def is_stratified(val_type: Union[str, CrossValTypes, HoldoutValTypes]) -> bool: - if isinstance(val_type, str): - return val_type.lower().startswith("stratified") - else: - return val_type.name.lower().startswith("stratified") - - -def holdout_validation(val_share: float, indices: np.ndarray, **kwargs: Any) -> Tuple[np.ndarray, np.ndarray]: - train, val = train_test_split(indices, test_size=val_share, shuffle=False) - return train, val - - -def stratified_holdout_validation(val_share: float, indices: np.ndarray, **kwargs: Any) \ - -> Tuple[np.ndarray, 
np.ndarray]: - train, val = train_test_split(indices, test_size=val_share, shuffle=False, stratify=kwargs["stratify"]) - return train, val - - -def shuffle_split_cross_validation(num_splits: int, indices: np.ndarray, **kwargs: Any) \ - -> List[Tuple[np.ndarray, np.ndarray]]: - cv = ShuffleSplit(n_splits=num_splits) - splits = list(cv.split(indices)) - return splits - - -def stratified_shuffle_split_cross_validation(num_splits: int, indices: np.ndarray, **kwargs: Any) \ - -> List[Tuple[np.ndarray, np.ndarray]]: - cv = StratifiedShuffleSplit(n_splits=num_splits) - splits = list(cv.split(indices, kwargs["stratify"])) - return splits - - -def stratified_k_fold_cross_validation(num_splits: int, indices: np.ndarray, **kwargs: Any) \ - -> List[Tuple[np.ndarray, np.ndarray]]: - cv = StratifiedKFold(n_splits=num_splits) - splits = list(cv.split(indices, kwargs["stratify"])) - return splits - - -def k_fold_cross_validation(num_splits: int, indices: np.ndarray, **kwargs: Any) -> List[Tuple[np.ndarray, np.ndarray]]: - """ - Standard k fold cross validation. - - :param indices: array of indices to be split - :param num_splits: number of cross validation splits - :return: list of tuples of training and validation indices - """ - cv = KFold(n_splits=num_splits) - splits = list(cv.split(indices)) - return splits - - -# TODO DO we move these under autoPyTorch/datasets/time_series_dataset.py? -# TODO rewrite this part, as we only need holdout sets -def time_series_hold_out_validation(val_share: float, indices: np.ndarray, **kwargs: Any) \ - -> Tuple[np.ndarray, np.ndarray]: - """ - Return holdout indices respecting hte temporal ordering of the data - Args: - val_share: - indices: List of all possible indices - **kwargs: - - Returns: - """ - # TODO consider how we handle test size properly - # Time Series prediction only requires on set of prediction for each - # This implement needs to be combined with time series forecasting dataloader, where each time an entire time series - # is used for prediction - test_size = kwargs['n_prediction_steps'] - cv = TimeSeriesSplit(n_splits=2, test_size=1, gap=kwargs['n_prediction_steps'] - 1) - train, val = list(cv.split(indices))[-1] - return train, val - - -def time_series_cross_validation(num_splits: int, indices: np.ndarray, **kwargs: Any) \ - -> List[Tuple[np.ndarray, np.ndarray]]: - """ - Returns train and validation indices respecting the temporal ordering of the data. - Dummy example: [0, 1, 2, 3] with 3 folds yields - [0] [1] - [0, 1] [2] - [0, 1, 2] [3] - - :param indices: array of indices to be split, seq_length - :param num_splits: number of cross validation splits - :return: list of tuples of training and validation indices - """ - # TODO: we use gap=n_prediction_step here, we need to consider if we want to implement n_prediction_step here or - # under DATALOADER!!! - # TODO do we need cross valriadtion for time series datasets? - test_size = kwargs['n_prediction_steps'] - cv = TimeSeriesSplit(n_splits=num_splits, test_size=1, gap=kwargs['n_prediction_steps'] - 1) - splits = list(cv.split(indices)) - return splits - - class HoldOutFuncs(): @staticmethod def holdout_validation(random_state: np.random.RandomState, @@ -244,8 +131,9 @@ def stratified_holdout_validation(random_state: np.random.RandomState, # TODO DO we move these under autoPyTorch/datasets/time_series_dataset.py? 
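The time-series holdout used here relies on sklearn's TimeSeriesSplit (scikit-learn >= 0.24 for the test_size and gap arguments): with gap = n_prediction_steps - 1, the validation index ends up exactly n_prediction_steps after the last training index. A small standalone example of that behaviour (not code from this patch):

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

n_prediction_steps = 3
indices = np.arange(10)
cv = TimeSeriesSplit(n_splits=2, test_size=1, gap=n_prediction_steps - 1)
train, val = list(cv.split(indices))[-1]
# train -> [0 1 2 3 4 5 6], val -> [9]; indices 7 and 8 fall into the gap,
# so the held-out point lies n_prediction_steps after the last training point.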
# TODO rewrite this part, as we only need holdout sets - @classmethod - def time_series_hold_out_validation(val_share: float, indices: np.ndarray, **kwargs: Any) \ + @staticmethod + def time_series_hold_out_validation(random_state: np.random.RandomState, + val_share: float, indices: np.ndarray, **kwargs: Any) \ -> Tuple[np.ndarray, np.ndarray]: """ Return holdout indices respecting hte temporal ordering of the data diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 946ae66db..3e921fbb6 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -208,7 +208,7 @@ def __init__(self, self.sequence_lengths_tests = None self.shuffle = shuffle - self.rand = np.random.RandomState(seed=seed) + self.random_state = np.random.RandomState(seed=seed) self.resampling_strategy = resampling_strategy self.resampling_strategy_args = resampling_strategy_args @@ -271,9 +271,8 @@ def __init__(self, self.numerical_features: List[int] = list(range(self.num_features)) self.categorical_features: List[int] = [] - - self.cross_validators = CrossValFuncs(CrossValTypes.time_series_cross_validation) - self.holdout_validators = HoldOutFuncs(HoldoutValTypes.time_series_hold_out_validation) + self.cross_validators = CrossValFuncs.get_cross_validators(CrossValTypes.time_series_cross_validation) + self.holdout_validators = HoldOutFuncs.get_holdout_validators(HoldoutValTypes.time_series_hold_out_validation) self.splits = self.get_splits_from_resampling_strategy() @@ -622,11 +621,13 @@ def create_holdout_val_split( idx_start = 0 for idx_seq, dataset in enumerate(self.datasets): if self.shift_input_data: - split = self.holdout_validators[holdout_val_type.name](holdout_val_type, + split = self.holdout_validators[holdout_val_type.name](self.random_state, + val_share, indices=np.arange(len(dataset)), **kwargs) else: - split = self.holdout_validators[holdout_val_type.name](holdout_val_type, + split = self.holdout_validators[holdout_val_type.name](self.random_state, + val_share, indices=np.arange( len(dataset) - self.n_prediction_steps), **kwargs) diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 980b55e32..5c7ec45d0 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -17,7 +17,7 @@ from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.metrics import MASE_LOSSES -from autoPyTorch.utils.backend import Backend +from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.utils.common import subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index a3935b6ee..6ffd023bb 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -42,7 +42,7 @@ def get_smac_object( ta_kwargs: Dict[str, Any], n_jobs: int, initial_budget: int, - max_budget: int, + max_budget: Union[int, float], dask_client: Optional[dask.distributed.Client], initial_configurations: Optional[List[Configuration]] = None, ) -> SMAC4AC: @@ -189,7 +189,7 @@ def __init__(self, max_budget states the maximum resource allocation a 
pipeline is going to be ran. For example, if the budget_type is epochs, and max_budget=50, then the pipeline training will be terminated after 50 epochs. - time_series_prediction (bool): + time_series_forecasting (bool): If we want to apply this optimizer to optimize time series prediction tasks (which has a different tae) """ @@ -237,7 +237,7 @@ def __init__(self, self.search_space_updates = search_space_updates - self.time_series_prediction = time_series_prediction + self.time_series_forecasting = time_series_forecasting if logger_port is None: self.logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT @@ -307,7 +307,7 @@ def run_smbo(self, func: Optional[Callable] = None pynisher_context=self.pynisher_context, ) - if self.time_series_prediction: + if self.time_series_forecasting: ta_kwargs["evaluator_class"] = TimeSeriesForecastingTrainEvaluator ta = ExecuteTaFuncWithQueue self.logger.info("Finish creating Target Algorithm (TA) function") @@ -356,16 +356,10 @@ def run_smbo(self, func: Optional[Callable] = None scenario_dict.update(self.smac_scenario_args) budget_type = self.pipeline_config['budget_type'] - if budget_type == 'epochs': - initial_budget = self.pipeline_config['min_epochs'] - max_budget = self.pipeline_config['epochs'] - elif budget_type == 'resolution': - initial_budget = self.pipeline_config.get('min_resolution', 0.1) - max_budget = self.pipeline_config.get('full_resolution', 1.0) - else: - raise ValueError("Illegal value for budget type, must be one of " - "('epochs', 'runtime'), but is : %s" % - budget_type) + if budget_type == 'resolution': + if self.min_budget > 1. or self.max_budget > 1.: + self.min_budget = self.min_budget / self.max_budget + self.max_budget = 1.0 if self.get_smac_object_callback is not None: smac = self.get_smac_object_callback(scenario_dict=scenario_dict, diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index 90c0f6362..70d3fa897 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -231,6 +231,7 @@ def set_hyperparameters( new_name = param.replace('%s:' % node_name, '', 1) sub_config_dict[new_name] = value + sub_configuration = Configuration(sub_configuration_space, values=sub_config_dict) diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py index 768d0eb20..eec9f18f7 100644 --- a/autoPyTorch/pipeline/components/setup/network/base_network.py +++ b/autoPyTorch/pipeline/components/setup/network/base_network.py @@ -30,7 +30,7 @@ def __init__( self.add_fit_requirements([ FitRequirement("network_head", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_backbone", (torch.nn.Module,), user_defined=False, dataset_property=False), - FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), + # FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), ]) self.network = network self.final_activation: Optional[torch.nn.Module] = None @@ -49,8 +49,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: # Make sure that input dictionary X has the required # information to fit this stage self.check_requirements(X, y) - - self.network = torch.nn.Sequential(X['network_embedding'], X['network_backbone'], X['network_head']) + if 'network_embedding' in X.keys(): + self.network = torch.nn.Sequential(X['network_embedding'], 
X['network_backbone'], X['network_head']) + else: + self.network = torch.nn.Sequential(X['network_backbone'], X['network_head']) # Properly set the network training device if self.device is None: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py index fd23999d5..382d94fdb 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py @@ -12,6 +12,7 @@ from torch import nn from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import NetworkBackboneComponent +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter class _LSTM(nn.Module): @@ -71,46 +72,38 @@ def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[ } @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, - num_layers: Tuple[Tuple, int] = ((1, 3), 1), - hidden_size: Tuple[Tuple, int] = ((64, 512), 256), - use_dropout: Tuple[Tuple, bool] = ((True, False), False), - dropout: Tuple[Tuple, float] = ((0, 0.5), 0.2), - bidirectional: Tuple[Tuple, bool] = ((True, False), True) - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict] = None, + num_layers: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='num_layers', + value_range=(1, 3), + default_value=1), + hidden_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='hidden_size', + value_range=(64, 512), + default_value=256), + use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='use_dropout', + value_range=(True, False), + default_value=False), + dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='dropout', + value_range=(0., 0.5), + default_value=0.2), + bidirectional: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bidirectional', + value_range=(True, False), + default_value=True) + ) -> ConfigurationSpace: cs = CS.ConfigurationSpace() - min_num_layers, max_num_layers = num_layers[0] - num_layers = UniformIntegerHyperparameter('num_layers', - lower=min_num_layers, - upper=max_num_layers, - default_value=num_layers[1]) - cs.add_hyperparameter(num_layers) - - min_hidden_size, max_hidden_size = hidden_size[0] - hidden_size = UniformIntegerHyperparameter('hidden_size', - lower=min_hidden_size, - upper=max_hidden_size, - default_value=hidden_size[1]) - cs.add_hyperparameter(hidden_size) - - use_dropout = CategoricalHyperparameter('use_dropout', - choices=use_dropout[0], - default_value=use_dropout[1]) - - min_dropout, max_dropout = dropout[0] - dropout = UniformFloatHyperparameter('dropout', - lower=min_dropout, - upper=max_dropout, - default_value=dropout[1]) - - cs.add_hyperparameters([use_dropout, dropout]) + # TODO consider lstm layers with different hidden size + num_layers = get_hyperparameter(num_layers, UniformIntegerHyperparameter) + use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) + dropout = get_hyperparameter(dropout, UniformFloatHyperparameter) + cs.add_hyperparameters([num_layers, use_dropout, dropout]) + + # Add plain hyperparameters + add_hyperparameter(cs, hidden_size, UniformIntegerHyperparameter) + add_hyperparameter(cs, bidirectional, CategoricalHyperparameter) + cs.add_condition(CS.AndConjunction(CS.EqualsCondition(dropout, use_dropout, 
True), CS.GreaterThanCondition(dropout, num_layers, 1))) - bidirectional = CategoricalHyperparameter('bidirectional', - choices=bidirectional[0], - default_value=bidirectional[1]) - cs.add_hyperparameter(bidirectional) return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index 2ec83286e..e93a455de 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -1,5 +1,5 @@ from abc import abstractmethod -from typing import Any, Dict, Iterable, Optional, Tuple +from typing import Any, Dict, Iterable, Optional, Tuple, List import numpy as np @@ -30,18 +30,42 @@ class NetworkBackboneComponent(autoPyTorchComponent): def __init__(self, **kwargs: Any): super().__init__() - self.add_fit_requirements([ - FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), - FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, - dataset_property=False), - FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), - FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False), - FitRequirement('network_embedding', (nn.Module,), user_defined=False, dataset_property=False) - ]) + self.add_fit_requirements( + self._required_fit_arguments + ) self.backbone: nn.Module = None self.config = kwargs self.input_shape: Optional[Iterable] = None + @property + def _required_fit_arguments(self) -> List[FitRequirement]: + if self.get_properties()['handles_tabular']: + return [ + FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), + FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, + dataset_property=False), + FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), + FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False), + FitRequirement('network_embedding', (nn.Module,), user_defined=False, dataset_property=False) + ] + elif self.get_properties()['handles_time_series']: + return [ + FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), + FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, + dataset_property=False), + FitRequirement('time_series_transformer', (BaseEstimator,), user_defined=False, dataset_property=False), + FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), + ] + elif self.get_properties()['handles_image']: + return [ + FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), + FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, + dataset_property=False), + FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), + ] + else: + raise ValueError('Unsupported task type!') + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: """ Builds the backbone component and assigns it to self.backbone @@ -64,8 +88,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: # get input shape by transforming first two elements of the training set transforms = torchvision.transforms.Compose(X['preprocess_transforms']) input_shape = transforms(X_train[:1, ...]).shape[1:] - - input_shape = 
get_output_shape(X['network_embedding'], input_shape=input_shape) + if 'network_embedding' in X.keys(): + input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) self.input_shape = input_shape self.backbone = self.build_backbone( diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 8e455c79c..e5a24d4d0 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -4,7 +4,6 @@ from autoPyTorch.pipeline.components.training.data_loader.time_series_data_loader import TimeSeriesDataLoader - from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( UniformIntegerHyperparameter, Constant @@ -18,10 +17,14 @@ import warnings - from autoPyTorch.datasets.base_dataset import TransformSubset from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence -from autoPyTorch.utils.common import custom_collate_fn +from autoPyTorch.utils.common import ( + HyperparameterSearchSpace, + custom_collate_fn, + add_hyperparameter +) + from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ TimeSeriesTransformer @@ -97,6 +100,7 @@ class ExpandTransformTimeSeries(object): a 2d Array, unlike the ExpandTransform defined under tabular dataset, the dimension is expanded along the last axis """ + def __call__(self, data: np.ndarray) -> np.ndarray: if len(data.shape) <= 1: data = np.expand_dims(data, axis=-1) @@ -120,6 +124,7 @@ class SequenceBuilder(object): window_size : int, default=1 sliding window size """ + def __init__(self, sample_interval: int = 1, window_size: int = 1, subseq_length=1, padding_value=0.): """ initialization @@ -158,13 +163,14 @@ class TimeSeriesForecastingDataLoader(FeatureDataLoader): datasets as described in: https://pytorch.org/docs/stable/data.html """ + def __init__(self, batch_size: int = 64, window_size: int = 1, - #sample_interval: int = 1, upper_sequence_length: int = np.iinfo(np.int32).max, num_batches_per_epoch: Optional[int] = 50, - n_prediction_steps: int = 1) -> None: + n_prediction_steps: int = 1, + random_state: Optional[np.random.RandomState] = None) -> None: """ initialize a dataloader Args: @@ -176,14 +182,14 @@ def __init__(self, num_batches_per_epoch: how n_prediction_steps: how many stpes to predict in advance """ - super().__init__(batch_size=batch_size) + super().__init__(batch_size=batch_size, random_state=random_state) self.window_size: int = window_size self.upper_sequence_length = upper_sequence_length self.n_prediction_steps = n_prediction_steps self.sample_interval = 1 # length of the tail, for instance if a sequence_length = 2, sample_interval =2, n_prediction = 2, # the time sequence should look like: [X, y, X, y, y] [test_data](values in tail is marked with X) - #self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 + # self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 self.subseq_length = self.window_size self.num_batches_per_epoch = num_batches_per_epoch if num_batches_per_epoch is not None else np.inf @@ -270,7 +276,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> 
torch.utils.data.DataLoader: _, sampler_indices_train, _ = np.intersect1d(train_split, valid_indices, return_indices=True) """ # test_indices not required as testsets usually lies on the trail of hte sequence - #_, sampler_indices_test, _ = np.intersect1d(test_split, valid_indices) + # _, sampler_indices_test, _ = np.intersect1d(test_split, valid_indices) sampler_indices_train = np.arange(num_instances_dataset) @@ -316,7 +322,7 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform candidate_transformations = [] # type: List[Callable] - #if 'test' in mode or not X['dataset_properties']['is_small_preprocess']: + # if 'test' in mode or not X['dataset_properties']['is_small_preprocess']: # candidate_transformations.extend(X['preprocess_transforms']) candidate_transformations.append((SequenceBuilder(sample_interval=self.sample_interval, @@ -329,7 +335,8 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform return torchvision.transforms.Compose(candidate_transformations) - def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.ndarray] = None, batch_size: int = np.inf, + def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.ndarray] = None, + batch_size: int = np.inf, ) -> torch.utils.data.DataLoader: """ Creates a data loader object from the provided data, @@ -407,39 +414,18 @@ def get_test_data_loader(self) -> torch.utils.data.DataLoader: @staticmethod def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, - batch_size: Tuple[Tuple, int] = ((32, 320), 64), - window_size: Tuple[Tuple, int] = ((20, 50), 25) + batch_size: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="batch_size", + value_range=(32, 320), + default_value=64), + window_size: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='window_size', + value_range=(20, 50), + default_value=30) ) -> ConfigurationSpace: - batch_size = UniformIntegerHyperparameter( - "batch_size", batch_size[0][0], batch_size[0][1], default_value=batch_size[1]) - if "upper_window_size" not in dataset_properties: - warnings.warn('max_sequence_length is not given in dataset property , might exists the risk of selecting ' - 'length that is greater than the maximal allowed length of the dataset') - upper_window_size = min(np.iinfo(np.int32).max, window_size[0][1]) - else: - upper_window_size = min(dataset_properties["upper_window_size"], window_size[0][1]) - if window_size[0][0] >= upper_window_size: - if upper_window_size == 1: - warnings.warn("window size is fixed as 1") - window_size = Constant("window_size", value=1) - else: - warnings.warn("the lower bound of window size is greater than the upper bound") - window_size = UniformIntegerHyperparameter("window_size", - lower=1, - upper=upper_window_size, - default_value=upper_window_size) - elif window_size[0][0] <= upper_window_size < window_size[0][1]: - window_size = UniformIntegerHyperparameter("window_size", - lower=window_size[0][0], - upper=upper_window_size, - default_value=upper_window_size) - else: - window_size = UniformIntegerHyperparameter("window_size", - lower=window_size[0][0], - upper=window_size[0][1], - default_value=window_size[1]) cs = ConfigurationSpace() - cs.add_hyperparameters([batch_size, window_size]) + add_hyperparameter(cs, batch_size, UniformIntegerHyperparameter) + add_hyperparameter(cs, window_size, UniformIntegerHyperparameter) return cs def __str__(self) -> str: diff --git 
a/autoPyTorch/pipeline/time_series_classification.py b/autoPyTorch/pipeline/time_series_classification.py index 8a3cbbc04..749aaabe1 100644 --- a/autoPyTorch/pipeline/time_series_classification.py +++ b/autoPyTorch/pipeline/time_series_classification.py @@ -1,5 +1,5 @@ import warnings -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration, ConfigurationSpace @@ -8,29 +8,36 @@ import sklearn.preprocessing from sklearn.base import ClassifierMixin + from autoPyTorch.pipeline.base_pipeline import BasePipeline +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( TimeSeriesTransformer -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ - ScalerChoice +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import ( + EncoderChoice +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing import ( + FeatureProprocessorChoice +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing -from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice import SchedulerChoice +from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone_choice import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice import NetworkHeadChoice -from autoPyTorch.pipeline.components.setup.network_initializer.base_network_init_choice import ( - NetworkInitializerChoice -) -from autoPyTorch.pipeline.components.setup.optimizer.base_optimizer_choice import OptimizerChoice +from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_embedding import NetworkEmbeddingChoice +from autoPyTorch.pipeline.components.setup.network_head import NetworkHeadChoice +from autoPyTorch.pipeline.components.setup.network_initializer import NetworkInitializerChoice +from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice from autoPyTorch.pipeline.components.training.data_loader.time_series_data_loader import TimeSeriesDataLoader -from autoPyTorch.pipeline.components.training.trainer.base_trainer_choice import ( - TrainerChoice -) +from autoPyTorch.pipeline.components.training.trainer import TrainerChoice from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + class TimeSeriesClassificationPipeline(ClassifierMixin, BasePipeline): """This class is a proof of concept to integrate AutoSklearn Components @@ -58,8 +65,8 @@ class 
TimeSeriesClassificationPipeline(ClassifierMixin, BasePipeline): def __init__( self, config: Optional[Configuration] = None, - steps: Optional[List[Tuple[str, autoPyTorchChoice]]] = None, - dataset_properties: Optional[Dict[str, Any]] = None, + steps: Optional[List[Tuple[str, Union[autoPyTorchComponent, autoPyTorchChoice]]]] = None, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, random_state: Optional[np.random.RandomState] = None, @@ -204,17 +211,28 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], default_dataset_properties.update(dataset_properties) steps.extend([ - ("scaler", ScalerChoice(default_dataset_properties)), - ("preprocessing", EarlyPreprocessing()), - ("time_series_transformer", TimeSeriesTransformer()), - ("network_backbone", NetworkBackboneChoice(default_dataset_properties)), - ("network_head", NetworkHeadChoice(default_dataset_properties)), - ("network", NetworkComponent()), - ("network_init", NetworkInitializerChoice(default_dataset_properties)), - ("optimizer", OptimizerChoice(default_dataset_properties)), - ("lr_scheduler", SchedulerChoice(default_dataset_properties)), - ("data_loader", TimeSeriesDataLoader()), - ("trainer", TrainerChoice(default_dataset_properties)), + ("imputer", SimpleImputer(random_state=self.random_state)), + ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)), + ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), + ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties, + random_state=self.random_state)), + ("tabular_transformer", TimeSeriesTransformer(random_state=self.random_state)), + ("preprocessing", EarlyPreprocessing(random_state=self.random_state)), + ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties, + random_state=self.random_state)), + ("network_backbone", NetworkBackboneChoice(default_dataset_properties, + random_state=self.random_state)), + ("network_head", NetworkHeadChoice(default_dataset_properties, + random_state=self.random_state)), + ("network", NetworkComponent(random_state=self.random_state)), + ("network_init", NetworkInitializerChoice(default_dataset_properties, + random_state=self.random_state)), + ("optimizer", OptimizerChoice(default_dataset_properties, + random_state=self.random_state)), + ("lr_scheduler", SchedulerChoice(default_dataset_properties, + random_state=self.random_state)), + ("data_loader", TimeSeriesDataLoader(random_state=self.random_state)), + ("trainer", TrainerChoice(default_dataset_properties, random_state=self.random_state)), ]) return steps diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 5cad6e957..51f9c77d7 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -1,40 +1,39 @@ +import copy import warnings -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration, ConfigurationSpace -from ConfigSpace.hyperparameters import UniformIntegerHyperparameter +from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause import numpy as np from sklearn.base import RegressorMixin -from sklearn.pipeline import Pipeline + +import torch from autoPyTorch.constants import STRING_TO_TASK_TYPES -from 
autoPyTorch.pipeline.base_pipeline import BasePipeline +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.base_pipeline import BasePipeline, PipelineStepType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( TimeSeriesTransformer -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ - ScalerChoice +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import ScalerChoice from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing -from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice import SchedulerChoice +from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone_choice import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice import NetworkHeadChoice -from autoPyTorch.pipeline.components.setup.network_initializer.base_network_init_choice import ( +from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_head import NetworkHeadChoice +from autoPyTorch.pipeline.components.setup.network_initializer import ( NetworkInitializerChoice ) -from autoPyTorch.pipeline.components.setup.optimizer.base_optimizer_choice import OptimizerChoice -from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import ( +from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice +from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import \ TimeSeriesForecastingDataLoader -) - -from autoPyTorch.pipeline.components.training.trainer.base_trainer_choice import ( - TrainerChoice -) +from autoPyTorch.pipeline.components.training.trainer import TrainerChoice from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates -from autoPyTorch.utils.common import subsampler class TimeSeriesForecastingPipeline(RegressorMixin, BasePipeline): @@ -63,15 +62,14 @@ class TimeSeriesForecastingPipeline(RegressorMixin, BasePipeline): def __init__(self, config: Optional[Configuration] = None, - steps: Optional[List[Tuple[str, autoPyTorchChoice]]] = None, - dataset_properties: Optional[Dict[str, Any]] = None, + steps: Optional[List[Tuple[str, Union[autoPyTorchComponent, autoPyTorchChoice]]]] = None, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, random_state: Optional[np.random.RandomState] = None, init_params: Optional[Dict[str, Any]] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, ): - # TODO consider multi steps prediction if 'upper_sequence_length' not in dataset_properties: 
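# A rough usage sketch (illustrative assumption, not taken from this patch): the constructor
# expects 'upper_sequence_length' to be supplied via dataset_properties so that the
# forecasting data loader does not end up sampling window/sequence lengths greater than
# the dataset allows, e.g.
#
#     dataset_properties = {'task_type': 'time_series_forecasting',
#                           'upper_sequence_length': 24}   # hypothetical value
#     pipeline = TimeSeriesForecastingPipeline(dataset_properties=dataset_properties)
#
# If the key is missing, the loader presumably falls back to its default upper bound
# (np.iinfo(np.int32).max), which is why the warning below flags the risk of selecting
# a length longer than the maximal allowed length of the dataset.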
warnings.warn('max_sequence_length is not given in dataset property , might exists the risk of selecting ' 'length that is greater than the maximal allowed length of the dataset') @@ -83,6 +81,11 @@ def __init__(self, config, steps, dataset_properties, include, exclude, random_state, init_params, search_space_updates) + # Because a pipeline is passed to a worker, we need to honor the random seed + # in this context. A time series forecasting pipeline will implement a torch + # model, so we comply with https://pytorch.org/docs/stable/notes/randomness.html + torch.manual_seed(self.random_state.get_state()[1][0]) + def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: """Scores the fitted estimator on (X, y) @@ -94,6 +97,7 @@ def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None) Returns: np.ndarray: coefficient of determination R^2 of the prediction """ + # TODO adjust to sktime's losses from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics, calculate_score metrics = get_metrics(self.dataset_properties, ['r2']) y_pred = self.predict(X, batch_size=batch_size) @@ -132,11 +136,12 @@ def _get_hyperparameter_search_space(self, dataset_properties = dict() if 'target_type' not in dataset_properties: - dataset_properties['target_type'] = 'time_series_regression' - if dataset_properties['target_type'] != 'time_series_regression': - warnings.warn('Time series regression is being used, however the target_type' - 'is not given as "time_series_regression". Overriding it.') - dataset_properties['target_type'] = 'time_series_regression' + dataset_properties['target_type'] = 'time_series_forecasting' + if dataset_properties['target_type'] != 'time_series_forecasting': + warnings.warn('Time series forecasting is being used, however the target_type ' + 'is not given as "time_series_forecasting". Overriding it.') + dataset_properties['target_type'] = 'time_series_forecasting' + # get the base search space given this # dataset properties. 
Then overwrite with custom # regression requirements @@ -146,6 +151,34 @@ def _get_hyperparameter_search_space(self, # Here we add custom code, like this with this # is not a valid configuration + # Learned Entity Embedding is only valid when encoder is one hot encoder + if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): + embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices + if 'LearnedEntityEmbedding' in embeddings: + encoders = cs.get_hyperparameter('encoder:__choice__').choices + default = cs.get_hyperparameter('network_embedding:__choice__').default_value + possible_default_embeddings = copy.copy(list(embeddings)) + del possible_default_embeddings[possible_default_embeddings.index(default)] + + for encoder in encoders: + if encoder == 'OneHotEncoder': + continue + while True: + try: + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), + ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), encoder) + )) + break + except ValueError: + # change the default and try again + try: + default = possible_default_embeddings.pop() + except IndexError: + raise ValueError("Cannot find a legal default configuration") + cs.get_hyperparameter('network_embedding:__choice__').default_value = default + self.configuration_space = cs self.dataset_properties = dataset_properties @@ -167,18 +200,24 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L default_dataset_properties.update(dataset_properties) steps.extend([ - ("scaler", ScalerChoice(default_dataset_properties)), - ("time_series_transformer", TimeSeriesTransformer()), - ("preprocessing", EarlyPreprocessing()), - ("network_backbone", NetworkBackboneChoice(default_dataset_properties)), - ("network_head", NetworkHeadChoice(default_dataset_properties)), - ("network", NetworkComponent()), - ("network_init", NetworkInitializerChoice(default_dataset_properties)), - ("optimizer", OptimizerChoice(default_dataset_properties)), - ("lr_scheduler", SchedulerChoice(default_dataset_properties)), + ("imputer", SimpleImputer(random_state=self.random_state)), + ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), + ("time_series_transformer", TimeSeriesTransformer(random_state=self.random_state)), + ("preprocessing", EarlyPreprocessing(random_state=self.random_state)), + ("network_backbone", NetworkBackboneChoice(default_dataset_properties, + random_state=self.random_state)), + ("network_head", NetworkHeadChoice(default_dataset_properties, + random_state=self.random_state)), + ("network", NetworkComponent(random_state=self.random_state)), + ("network_init", NetworkInitializerChoice(default_dataset_properties, + random_state=self.random_state)), + ("optimizer", OptimizerChoice(default_dataset_properties, + random_state=self.random_state)), + ("lr_scheduler", SchedulerChoice(default_dataset_properties, + random_state=self.random_state)), ("data_loader", TimeSeriesForecastingDataLoader(upper_sequence_length=self.upper_sequence_length, - )), - ("trainer", TrainerChoice(default_dataset_properties)), + random_state=self.random_state)), + ("trainer", TrainerChoice(default_dataset_properties, random_state=self.random_state)), ]) return steps diff --git a/autoPyTorch/pipeline/time_series_regression.py b/autoPyTorch/pipeline/time_series_regression.py index d18207256..9c43e5966 100644 --- 
a/autoPyTorch/pipeline/time_series_regression.py +++ b/autoPyTorch/pipeline/time_series_regression.py @@ -1,5 +1,5 @@ import warnings -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration, ConfigurationSpace @@ -7,27 +7,31 @@ from sklearn.base import RegressorMixin -from autoPyTorch.constants import STRING_TO_TASK_TYPES from autoPyTorch.pipeline.base_pipeline import BasePipeline +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( TimeSeriesTransformer -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ - ScalerChoice +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import ( + EncoderChoice +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing import ( + FeatureProprocessorChoice +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing -from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice import SchedulerChoice +from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone_choice import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice import NetworkHeadChoice -from autoPyTorch.pipeline.components.setup.network_initializer.base_network_init_choice import ( - NetworkInitializerChoice -) -from autoPyTorch.pipeline.components.setup.optimizer.base_optimizer_choice import OptimizerChoice +from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_embedding import NetworkEmbeddingChoice +from autoPyTorch.pipeline.components.setup.network_head import NetworkHeadChoice +from autoPyTorch.pipeline.components.setup.network_initializer import NetworkInitializerChoice +from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice from autoPyTorch.pipeline.components.training.data_loader.time_series_data_loader import TimeSeriesDataLoader -from autoPyTorch.pipeline.components.training.trainer.base_trainer_choice import ( - TrainerChoice -) +from autoPyTorch.pipeline.components.training.trainer import TrainerChoice from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -57,8 +61,8 @@ class TimeSeriesRegressionPipeline(RegressorMixin, BasePipeline): def __init__(self, config: Optional[Configuration] = None, - steps: Optional[List[Tuple[str, autoPyTorchChoice]]] = None, - dataset_properties: Optional[Dict[str, Any]] = None, + steps: Optional[List[Tuple[str, Union[autoPyTorchComponent, 
autoPyTorchChoice]]]] = None, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, random_state: Optional[np.random.RandomState] = None, @@ -153,17 +157,27 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L default_dataset_properties.update(dataset_properties) steps.extend([ - ("scaler", ScalerChoice(default_dataset_properties)), - ("preprocessing", EarlyPreprocessing()), - ("time_series_transformer", TimeSeriesTransformer()), - ("network_backbone", NetworkBackboneChoice(default_dataset_properties)), - ("network_head", NetworkHeadChoice(default_dataset_properties)), - ("network", NetworkComponent()), - ("network_init", NetworkInitializerChoice(default_dataset_properties)), - ("optimizer", OptimizerChoice(default_dataset_properties)), - ("lr_scheduler", SchedulerChoice(default_dataset_properties)), - ("data_loader", TimeSeriesDataLoader()), - ("trainer", TrainerChoice(default_dataset_properties)), + ("imputer", SimpleImputer(random_state=self.random_state)), + ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), + ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties, + random_state=self.random_state)), + ("tabular_transformer", TimeSeriesTransformer(random_state=self.random_state)), + ("preprocessing", EarlyPreprocessing(random_state=self.random_state)), + ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties, + random_state=self.random_state)), + ("network_backbone", NetworkBackboneChoice(default_dataset_properties, + random_state=self.random_state)), + ("network_head", NetworkHeadChoice(default_dataset_properties, + random_state=self.random_state)), + ("network", NetworkComponent(random_state=self.random_state)), + ("network_init", NetworkInitializerChoice(default_dataset_properties, + random_state=self.random_state)), + ("optimizer", OptimizerChoice(default_dataset_properties, + random_state=self.random_state)), + ("lr_scheduler", SchedulerChoice(default_dataset_properties, + random_state=self.random_state)), + ("data_loader", TimeSeriesDataLoader(random_state=self.random_state)), + ("trainer", TrainerChoice(default_dataset_properties, random_state=self.random_state)), ]) return steps diff --git a/autoPyTorch/utils/backend.py b/autoPyTorch/utils/backend.py deleted file mode 100644 index 78f2a42cf..000000000 --- a/autoPyTorch/utils/backend.py +++ /dev/null @@ -1,512 +0,0 @@ -import glob -import os -import pickle -import shutil -import tempfile -import time -import uuid -import warnings -from typing import Dict, List, Optional, Tuple, Union - -import lockfile - -import numpy as np - -from autoPyTorch.datasets.base_dataset import BaseDataset -from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble -from autoPyTorch.pipeline.base_pipeline import BasePipeline -from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger - -__all__ = [ - 'Backend' -] - - -def create( - temporary_directory: Optional[str], - output_directory: Optional[str], - delete_tmp_folder_after_terminate: bool = True, - delete_output_folder_after_terminate: bool = True, -) -> 'Backend': - """ - Creates a backend object that manages disk related transactions - - Args: - temporary_directory (str): where all temporal data is to be dumped - output_directory (str): where all predictions are to be output - delete_tmp_folder_after_terminate (bool): whether to delete the - temporal 
directory when then run completes - delete_output_folder_after_terminate (bool): whether to delete - the output directory when the run completes - - Returns: - Backend object - """ - context = BackendContext(temporary_directory, output_directory, - delete_tmp_folder_after_terminate, - delete_output_folder_after_terminate, - ) - backend = Backend(context) - - return backend - - -def get_randomized_directory_name(temporary_directory: Optional[str] = None) -> str: - uuid_str = str(uuid.uuid1(clock_seq=os.getpid())) - - temporary_directory = ( - temporary_directory - if temporary_directory - else os.path.join( - tempfile.gettempdir(), - "autoPyTorch_tmp_{}".format( - uuid_str, - ), - ) - ) - - return temporary_directory - - -class BackendContext(object): - - def __init__(self, - temporary_directory: Optional[str], - output_directory: Optional[str], - delete_tmp_folder_after_terminate: bool, - delete_output_folder_after_terminate: bool, - ): - - # Check that the names of tmp_dir and output_dir is not the same. - if temporary_directory == output_directory and temporary_directory is not None: - raise ValueError("The temporary and the output directory " - "must be different.") - - self.delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate - self.delete_output_folder_after_terminate = delete_output_folder_after_terminate - # attributes to check that directories were created by autoPyTorch - self._tmp_dir_created = False - self._output_dir_created = False - - self._temporary_directory = ( - get_randomized_directory_name( - temporary_directory=temporary_directory, - ) - ) - self._output_directory = output_directory - self.create_directories() - self._logger = None # type: Optional[PicklableClientLogger] - - @property - def output_directory(self) -> Optional[str]: - if self._output_directory is not None: - # make sure that tilde does not appear on the path. - return os.path.expanduser(os.path.expandvars(self._output_directory)) - else: - return None - - @property - def temporary_directory(self) -> str: - # make sure that tilde does not appear on the path. - return os.path.expanduser(os.path.expandvars(self._temporary_directory)) - - def create_directories(self) -> None: - # Exception is raised if self.temporary_directory already exists. - os.makedirs(self.temporary_directory) - self._tmp_dir_created = True - - # Exception is raised if self.output_directory already exists. - if self.output_directory is not None: - os.makedirs(self.output_directory) - self._output_dir_created = True - - def delete_directories(self, force: bool = True) -> None: - if self.output_directory and (self.delete_output_folder_after_terminate or force): - if self._output_dir_created is False: - raise ValueError("Failed to delete output dir: %s because autoPyTorch did not " - "create it. Please make sure that the specified output dir does " - "not exist when instantiating autoPyTorch." - % self.output_directory) - try: - shutil.rmtree(self.output_directory) - except Exception: - try: - if self._logger is not None: - self._logger.warning("Could not delete output dir: %s" % - self.output_directory) - else: - warnings.warn("Could not delete output dir: %s" % self.output_directory) - except Exception: - warnings.warn("Could not delete output dir: %s" % self.output_directory) - - if self.delete_tmp_folder_after_terminate or force: - if self._tmp_dir_created is False: - raise ValueError("Failed to delete tmp dir: % s because autoPyTorch did not " - "create it. 
Please make sure that the specified tmp dir does not " - "exist when instantiating autoPyTorch." - % self.temporary_directory) - try: - shutil.rmtree(self.temporary_directory) - except Exception: - try: - if self._logger is not None: - self._logger.warning( - "Could not delete tmp dir: %s" % self.temporary_directory) - else: - warnings.warn("Could not delete tmp dir: %s" % self.temporary_directory) - except Exception: - warnings.warn("Could not delete tmp dir: %s" % self.temporary_directory) - - -class Backend(object): - """Utility class to load and save all objects to be persisted. - These are: - * start time of auto-pytorch - * true targets of the ensemble - """ - - def __init__(self, context: BackendContext): - self._logger = None # type: Optional[PicklableClientLogger] - self.context = context - - # Create the temporary directory if it does not yet exist - try: - os.makedirs(self.temporary_directory) - except Exception: - pass - # This does not have to exist or be specified - if self.output_directory is not None: - if not os.path.exists(self.output_directory): - raise ValueError("Output directory %s does not exist." % self.output_directory) - - self.internals_directory = os.path.join(self.temporary_directory, ".autoPyTorch") - self._make_internals_directory() - - def setup_logger(self, name: str, port: int) -> None: - self._logger = get_named_client_logger( - name=name, - port=port, - ) - self.context._logger = self._logger - return - - @property - def output_directory(self) -> Optional[str]: - return self.context.output_directory - - @property - def temporary_directory(self) -> str: - return self.context.temporary_directory - - def _make_internals_directory(self) -> None: - try: - os.makedirs(self.internals_directory) - except Exception as e: - if self._logger is not None: - self._logger.debug("_make_internals_directory: %s" % e) - try: - os.makedirs(self.get_runs_directory()) - except Exception as e: - if self._logger is not None: - self._logger.debug("_make_internals_directory: %s" % e) - - def _get_start_time_filename(self, seed: Union[str, int]) -> str: - if isinstance(seed, str): - seed = int(seed) - return os.path.join(self.internals_directory, "start_time_%d" % seed) - - def save_start_time(self, seed: str) -> str: - self._make_internals_directory() - start_time = time.time() - - filepath = self._get_start_time_filename(seed) - - if not isinstance(start_time, float): - raise ValueError("Start time must be a float, but is %s." % type(start_time)) - - if os.path.exists(filepath): - raise ValueError( - "{filepath} already exist. Different seeds should be provided for different jobs." 
- ) - - with tempfile.NamedTemporaryFile('w', dir=os.path.dirname(filepath), delete=False) as fh: - fh.write(str(start_time)) - tempname = fh.name - os.rename(tempname, filepath) - - return filepath - - def load_start_time(self, seed: int) -> float: - with open(self._get_start_time_filename(seed), 'r') as fh: - start_time = float(fh.read()) - return start_time - - def get_smac_output_directory(self) -> str: - return os.path.join(self.temporary_directory, 'smac3-output') - - def get_smac_output_directory_for_run(self, seed: int) -> str: - return os.path.join( - self.temporary_directory, - 'smac3-output', - 'run_%d' % seed - ) - - def _get_targets_ensemble_filename(self) -> str: - return os.path.join(self.internals_directory, - "true_targets_ensemble.npy") - - def save_targets_ensemble(self, targets: np.ndarray) -> str: - self._make_internals_directory() - if not isinstance(targets, np.ndarray): - raise ValueError('Targets must be of type np.ndarray, but is %s' % - type(targets)) - - filepath = self._get_targets_ensemble_filename() - - # Try to open the file without locking it, this will reduce the - # number of times where we erroneously keep a lock on the ensemble - # targets file although the process already was killed - try: - existing_targets = np.load(filepath, allow_pickle=True) - if existing_targets.shape[0] > targets.shape[0] or ( - existing_targets.shape == targets.shape and np.allclose(existing_targets, targets)): - return filepath - except Exception: - pass - - with lockfile.LockFile(filepath): - if os.path.exists(filepath): - with open(filepath, 'rb') as fh: - existing_targets = np.load(fh, allow_pickle=True) - if existing_targets.shape[0] > targets.shape[0] or ( - existing_targets.shape == targets.shape and np.allclose(existing_targets, targets)): - return filepath - - with tempfile.NamedTemporaryFile('wb', dir=os.path.dirname( - filepath), delete=False) as fh_w: - np.save(fh_w, targets.astype(np.float32)) - tempname = fh_w.name - - os.rename(tempname, filepath) - - return filepath - - def load_targets_ensemble(self) -> np.ndarray: - filepath = self._get_targets_ensemble_filename() - - with lockfile.LockFile(filepath): - with open(filepath, 'rb') as fh: - targets = np.load(fh, allow_pickle=True) - - return targets - - def _get_datamanager_pickle_filename(self) -> str: - return os.path.join(self.internals_directory, 'datamanager.pkl') - - def save_datamanager(self, datamanager: BaseDataset, overwrite=False) -> str: - self._make_internals_directory() - filepath = self._get_datamanager_pickle_filename() - - with lockfile.LockFile(filepath): - if not os.path.exists(filepath) or overwrite: - with tempfile.NamedTemporaryFile('wb', dir=os.path.dirname( - filepath), delete=False) as fh: - pickle.dump(datamanager, fh, -1) - tempname = fh.name - os.rename(tempname, filepath) - - return filepath - - def load_datamanager(self) -> BaseDataset: - filepath = self._get_datamanager_pickle_filename() - with lockfile.LockFile(filepath): - with open(filepath, 'rb') as fh: - return pickle.load(fh) - - def get_runs_directory(self) -> str: - return os.path.join(self.internals_directory, 'runs') - - def get_numrun_directory(self, seed: int, num_run: int, budget: float) -> str: - return os.path.join(self.internals_directory, 'runs', '%d_%d_%s' % (seed, num_run, budget)) - - def get_model_filename(self, seed: int, idx: int, budget: float) -> str: - return '%s.%s.%s.model' % (seed, idx, budget) - - def get_cv_model_filename(self, seed: int, idx: int, budget: float) -> str: - return '%s.%s.%s.cv_model' % 
(seed, idx, budget) - - def list_all_models(self, seed: int) -> List[str]: - runs_directory = self.get_runs_directory() - model_files = glob.glob( - os.path.join(glob.escape(runs_directory), '%d_*' % seed, '%s.*.*.model' % seed) - ) - return model_files - - def load_models_by_identifiers(self, identifiers: List[Tuple[int, int, float]] - ) -> Dict: - models = dict() - - for identifier in identifiers: - seed, idx, budget = identifier - models[identifier] = self.load_model_by_seed_and_id_and_budget( - seed, idx, budget) - - return models - - def load_model_by_seed_and_id_and_budget(self, seed: int, - idx: int, - budget: float - ) -> BasePipeline: - model_directory = self.get_numrun_directory(seed, idx, budget) - - model_file_name = '%s.%s.%s.model' % (seed, idx, budget) - model_file_path = os.path.join(model_directory, model_file_name) - with open(model_file_path, 'rb') as fh: - return pickle.load(fh) - - def load_cv_models_by_identifiers(self, identifiers: List[Tuple[int, int, float]] - ) -> Dict: - models = dict() - - for identifier in identifiers: - seed, idx, budget = identifier - models[identifier] = self.load_cv_model_by_seed_and_id_and_budget( - seed, idx, budget) - - return models - - def load_cv_model_by_seed_and_id_and_budget(self, - seed: int, - idx: int, - budget: float - ) -> BasePipeline: - model_directory = self.get_numrun_directory(seed, idx, budget) - - model_file_name = '%s.%s.%s.cv_model' % (seed, idx, budget) - model_file_path = os.path.join(model_directory, model_file_name) - with open(model_file_path, 'rb') as fh: - return pickle.load(fh) - - def save_numrun_to_dir( - self, seed: int, idx: int, budget: float, model: Optional[BasePipeline], - cv_model: Optional[BasePipeline], ensemble_predictions: Optional[np.ndarray], - valid_predictions: Optional[np.ndarray], test_predictions: Optional[np.ndarray], - ) -> None: - runs_directory = self.get_runs_directory() - tmpdir = tempfile.mkdtemp(dir=runs_directory) - if model is not None: - file_path = os.path.join(tmpdir, self.get_model_filename(seed, idx, budget)) - with open(file_path, 'wb') as fh: - pickle.dump(model, fh, -1) - - if cv_model is not None: - file_path = os.path.join(tmpdir, self.get_cv_model_filename(seed, idx, budget)) - with open(file_path, 'wb') as fh: - pickle.dump(cv_model, fh, -1) - - for preds, subset in ( - (ensemble_predictions, 'ensemble'), - (valid_predictions, 'valid'), - (test_predictions, 'test') - ): - if preds is not None: - file_path = os.path.join( - tmpdir, - self.get_prediction_filename(subset, seed, idx, budget) - ) - with open(file_path, 'wb') as fh: - pickle.dump(preds.astype(np.float32), fh, -1) - try: - os.rename(tmpdir, self.get_numrun_directory(seed, idx, budget)) - except OSError: - if os.path.exists(self.get_numrun_directory(seed, idx, budget)): - os.rename(self.get_numrun_directory(seed, idx, budget), - os.path.join(runs_directory, tmpdir + '.old')) - os.rename(tmpdir, self.get_numrun_directory(seed, idx, budget)) - shutil.rmtree(os.path.join(runs_directory, tmpdir + '.old')) - - def get_ensemble_dir(self) -> str: - return os.path.join(self.internals_directory, 'ensembles') - - def load_ensemble(self, seed: int) -> Optional[AbstractEnsemble]: - ensemble_dir = self.get_ensemble_dir() - - if not os.path.exists(ensemble_dir): - if self._logger is not None: - self._logger.warning('Directory %s does not exist' % ensemble_dir) - else: - warnings.warn('Directory %s does not exist' % ensemble_dir) - return None - - if seed >= 0: - indices_files = glob.glob( - 
os.path.join(glob.escape(ensemble_dir), '%s.*.ensemble' % seed) - ) - indices_files.sort() - else: - indices_files = os.listdir(ensemble_dir) - indices_files = [os.path.join(ensemble_dir, f) for f in indices_files] - indices_files.sort(key=lambda f: time.ctime(os.path.getmtime(f))) - - with open(indices_files[-1], 'rb') as fh: - ensemble_members_run_numbers = pickle.load(fh) - - return ensemble_members_run_numbers - - def save_ensemble(self, ensemble: AbstractEnsemble, idx: int, seed: int) -> None: - try: - os.makedirs(self.get_ensemble_dir()) - except Exception: - pass - - filepath = os.path.join( - self.get_ensemble_dir(), - '%s.%s.ensemble' % (str(seed), str(idx).zfill(10)) - ) - with tempfile.NamedTemporaryFile('wb', dir=os.path.dirname( - filepath), delete=False) as fh: - pickle.dump(ensemble, fh) - tempname = fh.name - os.rename(tempname, filepath) - - def get_prediction_filename(self, subset: str, - automl_seed: Union[str, int], - idx: int, - budget: float - ) -> str: - return 'predictions_%s_%s_%s_%s.npy' % (subset, automl_seed, idx, budget) - - def save_predictions_as_txt(self, - predictions: np.ndarray, - subset: str, - idx: int, precision: int, - prefix: Optional[str] = None) -> None: - if not self.output_directory: - return - # Write prediction scores in prescribed format - filepath = os.path.join( - self.output_directory, - ('%s_' % prefix if prefix else '') + '%s_%s.predict' % (subset, str(idx)), - ) - - format_string = '{:.%dg} ' % precision - with tempfile.NamedTemporaryFile('w', dir=os.path.dirname( - filepath), delete=False) as output_file: - for row in predictions: - if not isinstance(row, np.ndarray) and not isinstance(row, list): - row = [row] - for val in row: - output_file.write(format_string.format(float(val))) - output_file.write('\n') - tempname = output_file.name - os.rename(tempname, filepath) - - def write_txt_file(self, filepath: str, data: str, name: str) -> None: - with lockfile.LockFile(filepath): - with tempfile.NamedTemporaryFile('w', dir=os.path.dirname( - filepath), delete=False) as fh: - fh.write(data) - tempname = fh.name - os.rename(tempname, filepath) - if self._logger is not None: - self._logger.debug('Created %s file %s' % (name, filepath)) diff --git a/autoPyTorch/utils/pipeline.py b/autoPyTorch/utils/pipeline.py index f3aa3c607..9a38af209 100644 --- a/autoPyTorch/utils/pipeline.py +++ b/autoPyTorch/utils/pipeline.py @@ -6,10 +6,11 @@ from autoPyTorch.constants import ( IMAGE_TASKS, REGRESSION_TASKS, + CLASSIFICATION_TASKS, + FORECASTING_TASKS, STRING_TO_TASK_TYPES, TABULAR_TASKS, TIMESERIES_TASKS, - FORECASTING_TASKS, ) from autoPyTorch.pipeline.image_classification import ImageClassificationPipeline from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline @@ -63,28 +64,23 @@ def get_dataset_requirements(info: Dict[str, Any], """ task_type: int = STRING_TO_TASK_TYPES[info['task_type']] if task_type in REGRESSION_TASKS: -<<<<<<< HEAD - return _get_regression_dataset_requirements(info, include, exclude) - elif task_type in CLASSIFICATION_TASKS: - return _get_classification_dataset_requirements(info, include, exclude) - else: - return _get_forecasting_dataset_requirements(info, include, exclude) - - -def _get_regression_dataset_requirements(info: Dict[str, Any], include: Dict[str, List[str]], - exclude: Dict[str, List[str]]) -> List[FitRequirement]: -======= return _get_regression_dataset_requirements(info, include if include is not None else {}, exclude if exclude is not None else {}, 
search_space_updates=search_space_updates ) - else: + elif task_type in CLASSIFICATION_TASKS: return _get_classification_dataset_requirements(info, include if include is not None else {}, exclude if exclude is not None else {}, search_space_updates=search_space_updates ) + else: + return _get_forecasting_dataset_requirements(info, + include if include is not None else {}, + exclude if exclude is not None else {}, + search_space_updates=search_space_updates + ) def _get_regression_dataset_requirements(info: Dict[str, Any], @@ -92,7 +88,6 @@ def _get_regression_dataset_requirements(info: Dict[str, Any], exclude: Optional[Dict] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ) -> List[FitRequirement]: ->>>>>>> upstream/master task_type = STRING_TO_TASK_TYPES[info['task_type']] if task_type in TABULAR_TASKS: return TabularRegressionPipeline( @@ -113,65 +108,42 @@ def _get_regression_dataset_requirements(info: Dict[str, Any], def _get_classification_dataset_requirements(info: Dict[str, Any], -<<<<<<< HEAD - include: Dict[str, List[str]], - exclude: Dict[str, List[str]]) -> List[FitRequirement]: -======= include: Optional[Dict] = None, exclude: Optional[Dict] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ) -> List[FitRequirement]: ->>>>>>> upstream/master task_type = STRING_TO_TASK_TYPES[info['task_type']] if task_type in TABULAR_TASKS: return TabularClassificationPipeline( dataset_properties=info, -<<<<<<< HEAD - include=include, - exclude=exclude - ).get_dataset_requirements() - - elif task_type in TIMESERIES_TASKS: - return TimeSeriesClassificationPipeline( - dataset_properties=info, - include=include, - exclude=exclude, + include=include, exclude=exclude, + search_space_updates=search_space_updates ).get_dataset_requirements() - elif task_type in IMAGE_TASKS: return ImageClassificationPipeline( dataset_properties=info, - include=include, - exclude=exclude + include=include, exclude=exclude, + search_space_updates=search_space_updates ).get_dataset_requirements() - else: raise ValueError("Task_type not supported") -def _get_forecasting_dataset_requirements(info: Dict[str, Any], include: Dict[str, List[str]], - exclude: Dict[str, List[str]]) -> List[FitRequirement]: +def _get_forecasting_dataset_requirements(info: Dict[str, Any], + include: Optional[Dict] = None, + exclude: Optional[Dict] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> List[FitRequirement]: task_type = STRING_TO_TASK_TYPES[info['task_type']] if task_type in FORECASTING_TASKS: return TimeSeriesForecastingPipeline( dataset_properties=info, include=include, - exclude=exclude + exclude=exclude, + search_space_updates=search_space_updates ).get_dataset_requirements() - -======= - include=include, exclude=exclude, - search_space_updates=search_space_updates). \ - get_dataset_requirements() - elif task_type in IMAGE_TASKS: - return ImageClassificationPipeline( - dataset_properties=info, - include=include, exclude=exclude, - search_space_updates=search_space_updates). 
\ - get_dataset_requirements() ->>>>>>> upstream/master else: raise ValueError("Task_type not supported") diff --git a/examples/example_time_series_classification.py b/examples/example_time_series_classification.py deleted file mode 100644 index 1294c7ab8..000000000 --- a/examples/example_time_series_classification.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -====================== -Time Series Classification -====================== - -The following example shows how to fit a sample classification model -with AutoPyTorch -""" -import os -import tempfile as tmp -import warnings - -os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() -os.environ['OMP_NUM_THREADS'] = '1' -os.environ['OPENBLAS_NUM_THREADS'] = '1' -os.environ['MKL_NUM_THREADS'] = '1' - -warnings.simplefilter(action='ignore', category=UserWarning) -warnings.simplefilter(action='ignore', category=FutureWarning) - -import numpy as np - -import sklearn.model_selection - -from sktime.datasets import load_gunpoint - -from autoPyTorch.api.time_series_classification import TimeSeriesClassificationTask -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates - - -def get_search_space_updates(): - """ - Search space updates to the task can be added using HyperparameterSearchSpaceUpdates - Returns: - HyperparameterSearchSpaceUpdates - """ - updates = HyperparameterSearchSpaceUpdates() - updates.append(node_name="data_loader", - hyperparameter="batch_size", - value_range=[16, 512], - default_value=32) - updates.append(node_name="lr_scheduler", - hyperparameter="CosineAnnealingLR:T_max", - value_range=[50, 60], - default_value=55) - updates.append(node_name='optimizer', - hyperparameter='AdamOptimizer:lr', - value_range=[0.0001, 0.001], - default_value=0.0005) - return updates - - -if __name__ == '__main__': - ############################################################################ - # Data Loading - # ============ - X, y = load_gunpoint(return_X_y=True) - - # Convert the pandas dataframes returned from load_gunpoint to 3D numpy array since that is - # the format AutoPyTorch expects for now - X = [X.iloc[i][0].values for i in range(len(X))] - y = [int(y.iloc[i]) for i in range(len(y))] - X = np.vstack(X) - - # Expand the last dimension because time series data has to be of shape [B, T, F] - # where B is the batch size, T is the time dimension and F are the number of features per time step - X = X[..., np.newaxis] - - # Subtract one from the labels because they are initially in {1, 2}, but are expected to be in {0, 1} - y = np.array(y) - 1 - - X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X, - y, - random_state=1, - stratify=y - ) - - ############################################################################ - # Build and fit a classifier - # ========================== - api = TimeSeriesClassificationTask( - delete_tmp_folder_after_terminate=False, - search_space_updates=get_search_space_updates() - ) - api.search( - X_train=X_train, - y_train=y_train, - X_test=X_test.copy(), - y_test=y_test.copy(), - optimize_metric='accuracy', - total_walltime_limit=500, - func_eval_time_limit=50 - ) - - ############################################################################ - # Print the final ensemble performance - # ==================================== - print(api.run_history, api.trajectory) - y_pred = api.predict(X_test) - score = api.score(y_pred, y_test) - print(score) - print(api.show_models()) diff --git a/examples/example_time_series_classification_pipeline.py 
b/examples/example_time_series_classification_pipeline.py deleted file mode 100644 index c674e1098..000000000 --- a/examples/example_time_series_classification_pipeline.py +++ /dev/null @@ -1,153 +0,0 @@ -""" -====================== -Example for the time series classification pipeline ---------------------------- - -This is a temporal example to make sure that ensemble works. -It also sets how SMAC should create the output information, -so that the ensemble builder works. - -We will remove this file, once SMAC + ensemble builder work -====================== -""" -import typing - -import numpy as np -import sklearn -from sklearn.metrics import accuracy_score - -from autoPyTorch.data.time_series_validator import TimeSeriesInputValidator -from autoPyTorch.datasets.resampling_strategy import CrossValTypes -from autoPyTorch.datasets.time_series_dataset import TimeSeriesDataset -from autoPyTorch.pipeline.time_series_classification import TimeSeriesClassificationPipeline -from autoPyTorch.utils.backend import Backend, create -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates -from autoPyTorch.utils.pipeline import get_dataset_requirements - - -def get_data_to_train(backend: Backend): - """ - This function returns a fit dictionary that within itself, contains all - the information to fit a pipeline - """ - - from sktime.datasets import load_gunpoint - - data, labels = load_gunpoint(return_X_y=True) - - data = [data.iloc[i][0].values for i in range(len(data))] - labels = [int(labels.iloc[i]) for i in range(len(labels))] - - data = np.vstack(data) - X = data[..., np.newaxis] - y = np.array(labels) - 1 # minus one because labels are initially in {1, 2} - - X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X, - y, - random_state=1, - stratify=y - ) - - validator = TimeSeriesInputValidator(is_classification=True) - validator.fit(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test) - - # Create a datamanager for this toy problem - datamanager = TimeSeriesDataset( - X=X_train, Y=y_train, - X_test=X_test, Y_test=y_test, - validator=validator, - resampling_strategy=CrossValTypes.stratified_k_fold_cross_validation - ) - backend.save_datamanager(datamanager) - - info = {'task_type': datamanager.task_type, - 'numerical_features': datamanager.numerical_features, - 'categorical_features': datamanager.categorical_features, - 'output_type': datamanager.output_type, - 'issparse': datamanager.issparse} - - dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info)) - - # Fit the pipeline - fit_dictionary = { - 'X_train': X_train, - 'y_train': y_train, - 'train_indices': np.arange(X_train.shape[0]), - 'dataset_properties': dataset_properties, - # Training configuration - 'num_run': 5, - 'working_dir': './tmp/example_ensemble_1', # Hopefully generated by backend - 'device': 'cuda', - 'runtime': 50, - 'torch_num_threads': 1, - 'early_stopping': 20, - 'use_tensorboard_logger': True, - 'use_pynisher': False, - 'memory_limit': 4096, - 'metrics_during_training': True, - 'seed': 0, - 'budget_type': 'epochs', - 'epochs': 100.0, - 'split_id': 0, - 'backend': backend, - 'job_id': 1 - } - - return fit_dictionary, X_train, y_train, X_test, y_test - - -if __name__ == "__main__": - # Build a repository with random fitted models - backend = create(temporary_directory=None, output_directory=None, - delete_tmp_folder_after_terminate=False) - - # Create the directory structure - backend._make_internals_directory() - - 
updates = HyperparameterSearchSpaceUpdates() - updates.append(node_name="optimizer", - hyperparameter="AdamOptimizer:lr", - value_range=[0.0001, 0.001], - default_value=0.0005) - - # Get data to train - fit_dictionary, X_train, y_train, X_test, y_test = get_data_to_train(backend) - pipeline = TimeSeriesClassificationPipeline( - dataset_properties=fit_dictionary['dataset_properties'], - search_space_updates=updates, - include={ - 'network_backbone': ['InceptionTimeBackbone'] - } - ) - - # Goal: Able to indicate a network type and train it successfully on dummy data - # Step1: Be able to select and MLP with desired hyperparameters - pipeline_cs = pipeline.get_hyperparameter_search_space() - print(pipeline_cs) - config = pipeline_cs.get_default_configuration() - pipeline.set_hyperparameters(config) - print(config) - - ## Step2: train it on dummy data - - ## Fit the pipeline - print("Fitting the pipeline...") - something = pipeline.fit(fit_dictionary) - - ## Showcase some components of the pipeline - # print(pipeline) - - from sktime.classification import compose - - tsf = compose.TimeSeriesForestClassifier() - tsf.fit(np.moveaxis(X_train, 1, 2), y_train) - tsf_predictions = tsf.predict(np.moveaxis(X_test, 1, 2)) - - ## Showcase performance of pipeline - # print(pipeline.named_steps['trainer'].run_summary.performance_tracker) - - predictions = pipeline.predict_proba(X_test) - predictions = np.array(predictions).argmax(axis=1) - print(f"accuracy={accuracy_score(y_test, predictions)}") - print(f"tsf accuracy={accuracy_score(y_test, tsf_predictions)}") diff --git a/examples/example_time_series_classification_sequential_mnist.py b/examples/example_time_series_classification_sequential_mnist.py deleted file mode 100644 index f03bb425f..000000000 --- a/examples/example_time_series_classification_sequential_mnist.py +++ /dev/null @@ -1,104 +0,0 @@ -""" -====================== -Time Series Classification on Sequential MNIST -====================== - -The following example shows how to fit a sample classification model -with AutoPyTorch -""" -import os -import tempfile as tmp -import warnings - -import torch -from torch.utils.data import Subset - -import torchvision - -os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() -os.environ['OMP_NUM_THREADS'] = '1' -os.environ['OPENBLAS_NUM_THREADS'] = '1' -os.environ['MKL_NUM_THREADS'] = '1' - -warnings.simplefilter(action='ignore', category=UserWarning) -warnings.simplefilter(action='ignore', category=FutureWarning) - -import numpy as np - -from autoPyTorch.api.time_series_classification import TimeSeriesClassificationTask -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates - - -def get_search_space_updates(): - """ - Search space updates to the task can be added using HyperparameterSearchSpaceUpdates - Returns: - HyperparameterSearchSpaceUpdates - """ - updates = HyperparameterSearchSpaceUpdates() - updates.append(node_name="data_loader", - hyperparameter="batch_size", - value_range=[16, 512], - default_value=32) - updates.append(node_name="lr_scheduler", - hyperparameter="CosineAnnealingLR:T_max", - value_range=[50, 60], - default_value=55) - updates.append(node_name='optimizer', - hyperparameter='AdamOptimizer:lr', - value_range=[0.0001, 0.001], - default_value=0.0005) - return updates - - -if __name__ == '__main__': - ############################################################################ - # Data Loading - # ============ - train_dataset = torchvision.datasets.MNIST(root=".", train=True, download=True) - 
test_dataset = torchvision.datasets.MNIST(root=".", train=False) - - train_dataset = Subset(train_dataset, indices=torch.randperm(len(train_dataset))[:10000]) - test_dataset = Subset(train_dataset, indices=torch.randperm(len(test_dataset))[:100]) - - X_train = np.empty((len(train_dataset), 28 * 28, 1), dtype=np.float32) - y_train = np.empty(len(train_dataset), dtype=np.int32) - X_test = np.empty((len(test_dataset), 28 * 28, 1), dtype=np.float32) - y_test = np.empty(len(test_dataset), dtype=np.int32) - - for i, (image, label) in enumerate(train_dataset): - X_train[i] = np.asarray(image).reshape(28 * 28, 1) - y_train[i] = label - - for i, (image, label) in enumerate(test_dataset): - X_test[i] = np.asarray(image).reshape(28 * 28, 1) - y_test[i] = label - - ############################################################################ - # Build and fit a classifier - # ========================== - api = TimeSeriesClassificationTask( - n_jobs=6, - delete_tmp_folder_after_terminate=False, - search_space_updates=get_search_space_updates(), - exclude_components={"network_backbone": ["LSTMBackbone"]} - ) - api.set_pipeline_config(device="cuda") - api.search( - X_train=X_train, - y_train=y_train, - X_test=X_test.copy(), - y_test=y_test.copy(), - optimize_metric='accuracy', - total_walltime_limit=1200, - func_eval_time_limit=1200 - ) - - ############################################################################ - # Print the final ensemble performance - # ==================================== - print(api.run_history, api.trajectory) - y_pred = api.predict(X_test) - score = api.score(y_pred, y_test) - print(score) - print(api.show_models()) diff --git a/examples/example_time_series_regression.py b/examples/example_time_series_regression.py deleted file mode 100644 index be77209a8..000000000 --- a/examples/example_time_series_regression.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -====================== -Time Series Regression -====================== - -The following example shows how to fit a sample classification model -with AutoPyTorch -""" -import os -import tempfile as tmp -import warnings - -os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() -os.environ['OMP_NUM_THREADS'] = '1' -os.environ['OPENBLAS_NUM_THREADS'] = '1' -os.environ['MKL_NUM_THREADS'] = '1' - -warnings.simplefilter(action='ignore', category=UserWarning) -warnings.simplefilter(action='ignore', category=FutureWarning) - -import numpy as np - -from sktime.datasets import load_italy_power_demand - -from autoPyTorch.api.time_series_regression import TimeSeriesRegressionTask -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates - - -def get_search_space_updates(): - """ - Search space updates to the task can be added using HyperparameterSearchSpaceUpdates - Returns: - HyperparameterSearchSpaceUpdates - """ - updates = HyperparameterSearchSpaceUpdates() - updates.append(node_name="data_loader", - hyperparameter="batch_size", - value_range=[32, 64], - default_value=32) - updates.append(node_name="lr_scheduler", - hyperparameter="CosineAnnealingLR:T_max", - value_range=[50, 60], - default_value=55) - updates.append(node_name='optimizer', - hyperparameter='AdamOptimizer:lr', - value_range=[0.0001, 0.001], - default_value=0.0005) - return updates - - -if __name__ == '__main__': - ############################################################################ - # Data Loading - # (Mostly copied from - # 
https://github.com/sktime/sktime-dl/blob/master/examples/univariate_time_series_regression_and_forecasting.ipynb) - # ============ - X_train_pd, _ = load_italy_power_demand(split='train', return_X_y=True) - X_test_pd, _ = load_italy_power_demand(split='test', return_X_y=True) - - # Create some regression values. - # Make the value y equal to the sum of the X values at time-steps 1 and 10. - X_train = np.zeros((len(X_train_pd), 24, 1), dtype=float) - y_train = np.zeros(len(X_train_pd), dtype=float) - for i in range(len(X_train_pd)): - y_train[i] = X_train_pd.iloc[i].iloc[0].iloc[1] - y_train[i] = y_train[i] + X_train_pd.iloc[i].iloc[0].iloc[10] - X_train[i] = X_train_pd.iloc[i].iloc[0][:, np.newaxis] - - X_test = np.zeros((len(X_test_pd), 24, 1), dtype=float) - y_test = np.zeros(len(X_test_pd)) - for i in range(len(X_test_pd)): - y_test[i] = X_test_pd.iloc[i].iloc[0].iloc[1] - y_test[i] = y_test[i] + X_test_pd.iloc[i].iloc[0].iloc[10] - X_test[i] = X_test_pd.iloc[i].iloc[0][:, np.newaxis] - - ############################################################################ - # Build and fit a regressor - # ========================== - api = TimeSeriesRegressionTask( - delete_tmp_folder_after_terminate=False, - search_space_updates=get_search_space_updates(), - include_components={"network_backbone": ["InceptionTimeBackbone"]} - ) - api.search( - X_train=X_train, - y_train=y_train, - X_test=X_test.copy(), - y_test=y_test.copy(), - optimize_metric='r2', - budget_type="runtime", - budget=50, - total_walltime_limit=500, - func_eval_time_limit=50 - ) - - ############################################################################ - # Print the final ensemble performance - # ==================================== - print(api.run_history, api.trajectory) - y_pred = api.predict(X_test) - score = api.score(y_pred, y_test) - print(score) - print(api.show_models()) diff --git a/examples/example_time_series_regression_pipeline.py b/examples/example_time_series_regression_pipeline.py deleted file mode 100644 index 276260bc0..000000000 --- a/examples/example_time_series_regression_pipeline.py +++ /dev/null @@ -1,149 +0,0 @@ -""" -====================== -Ensemble from random search ---------------------------- - -This is a temporal example to make sure that ensemble works. -It also sets how SMAC should create the output information, -so that the ensemble builder works. - -We will remove this file, once SMAC + ensemble builder work -====================== -""" - -import numpy as np -from sklearn.metrics import r2_score -from sklearn.model_selection import train_test_split -from sktime.datasets import load_italy_power_demand - -from autoPyTorch.data.time_series_validator import TimeSeriesInputValidator -from autoPyTorch.datasets.resampling_strategy import CrossValTypes -from autoPyTorch.datasets.time_series_dataset import TimeSeriesDataset -from autoPyTorch.pipeline.time_series_classification import TimeSeriesClassificationPipeline -from autoPyTorch.utils.backend import Backend, create -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates -from autoPyTorch.utils.pipeline import get_dataset_requirements - - -def get_data_to_train(backend: Backend): - """ - This function returns a fit dictionary that within itself, contains all - the information to fit a pipeline - """ - - X_train_pd, y_train = load_italy_power_demand(split='train', return_X_y=True) - X_test_pd, y_test = load_italy_power_demand(split='test', return_X_y=True) - - # Create some regression values. 
- # Make the value y equal to the sum of the X values at time-steps 1 and 10. - X_train = np.zeros((len(X_train_pd), 24, 1), dtype=float) - y_train = np.zeros(len(y_train), dtype=float) - for i in range(len(X_train_pd)): - y_train[i] = X_train_pd.iloc[i].iloc[0].iloc[1] - y_train[i] = y_train[i] + X_train_pd.iloc[i].iloc[0].iloc[10] - X_train[i] = X_train_pd.iloc[i].iloc[0][:, np.newaxis] - - X_test = np.zeros((len(X_test_pd), 24, 1), dtype=float) - y_test = np.zeros(len(y_test)) - for i in range(len(X_test_pd)): - y_test[i] = X_test_pd.iloc[i].iloc[0].iloc[1] - y_test[i] = y_test[i] + X_test_pd.iloc[i].iloc[0].iloc[10] - X_test[i] = X_test_pd.iloc[i].iloc[0][:, np.newaxis] - - validator = TimeSeriesInputValidator(is_classification=False) - validator.fit(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test) - - # Create a datamanager for this toy problem - datamanager = TimeSeriesDataset( - X=X_train, Y=y_train, - X_test=X_test, Y_test=y_test, - validator=validator - ) - backend.save_datamanager(datamanager) - - info = {'task_type': datamanager.task_type, - 'numerical_features': datamanager.numerical_features, - 'categorical_features': datamanager.categorical_features, - 'output_type': datamanager.output_type, - 'issparse': datamanager.issparse} - - dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info)) - - # Fit the pipeline - fit_dictionary = { - 'X_train': X_train, - 'y_train': y_train, - 'train_indices': np.arange(X_train.shape[0]), - 'dataset_properties': dataset_properties, - # Training configuration - 'num_run': 5, - 'working_dir': './tmp/example_ensemble_1', # Hopefully generated by backend - 'device': 'cpu', - 'runtime': 100, - 'torch_num_threads': 1, - 'early_stopping': 20, - 'use_tensorboard_logger': True, - 'use_pynisher': False, - 'memory_limit': 4096, - 'metrics_during_training': True, - 'seed': 0, - 'budget_type': 'epochs', - 'epochs': 100.0, - 'split_id': 0, - 'backend': backend, - 'job_id': 1 - } - - return fit_dictionary, X_train, y_train, X_test, y_test - - -if __name__ == "__main__": - # Build a repository with random fitted models - backend = create(temporary_directory=None, output_directory=None, - delete_tmp_folder_after_terminate=False) - - # Create the directory structure - backend._make_internals_directory() - - updates = HyperparameterSearchSpaceUpdates() - updates.append(node_name="optimizer", - hyperparameter="AdamOptimizer:lr", - value_range=[0.0001, 0.001], - default_value=0.0005) - - # Get data to train - fit_dictionary, X_train, y_train, X_test, y_test = get_data_to_train(backend) - pipeline = TimeSeriesClassificationPipeline( - dataset_properties=fit_dictionary['dataset_properties'], - search_space_updates=updates, - include={ - 'network_backbone': ['InceptionTimeBackbone'] - } - ) - - # Goal: Able to indicate a network type and train it successfully on dummy data - # Step1: Be able to select and MLP with desired hyperparameters - pipeline_cs = pipeline.get_hyperparameter_search_space() - print(pipeline_cs) - config = pipeline_cs.get_default_configuration() - pipeline.set_hyperparameters(config) - print(config) - - ## Step2: train it on dummy data - - ## Fit the pipeline - print("Fitting the pipeline...") - something = pipeline.fit(fit_dictionary) - - ## Showcase some components of the pipeline - # print(pipeline) - - from sktime.regression import compose - - tsf = compose.TimeSeriesForestRegressor() - tsf.fit(np.moveaxis(X_train, 1, 2), y_train) - tsf_predictions = tsf.predict(np.moveaxis(X_test, 1, 2)) - - 
predictions = pipeline.predict(X_test) - print(f"r2={r2_score(y_test, predictions)}") - print(f"tsf r2={r2_score(y_test, tsf_predictions)}") diff --git a/test/conftest.py b/test/conftest.py index 382e2dcdb..0284ffc7a 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -23,11 +23,7 @@ from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.time_series_validator import TimeSeriesInputValidator from autoPyTorch.datasets.tabular_dataset import TabularDataset -<<<<<<< HEAD from autoPyTorch.datasets.time_series_dataset import TimeSeriesDataset -from autoPyTorch.utils.backend import create -======= ->>>>>>> upstream/master from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.pipeline import get_dataset_requirements From b27e6e9c153261fd541bcccf0c25cc00b6355930 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 25 Nov 2021 19:27:33 +0100 Subject: [PATCH 067/347] maint --- autoPyTorch/datasets/time_series_dataset.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 3e921fbb6..f25c90af3 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -1,5 +1,6 @@ +import os from typing import Any, Dict, List, Optional, Tuple, Union, cast -import warnings +import uuid import bisect import numpy as np @@ -156,6 +157,7 @@ def __init__(self, val_transforms: Optional[torchvision.transforms.Compose] = None, validator: Optional[TimeSeriesForecastingInputValidator] = None, n_prediction_steps: int = 1, + dataset_name: Optional[str] = None, shift_input_data: bool = True, normalize_y: bool = True, train_with_log_prob: bool = True, @@ -176,6 +178,12 @@ def __init__(self, header's configspace can be built beforehand. """ assert X is not Y, "Training and Test data needs to belong two different object!!!" 
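The dataset_name handling added just below falls back to a time-based UUID. As a small illustrative sketch (not part of the patch; the variable name is invented), this is roughly what that default produces:

import os
import uuid

# uuid1 is time-based; passing the process id as the clock sequence keeps the default
# names distinct across worker processes that are created at the same moment.
default_dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))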
+ + self.dataset_name = dataset_name + + if self.dataset_name is None: + self.dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) + self.n_prediction_steps = n_prediction_steps if validator is None: validator = TimeSeriesForecastingInputValidator(is_classification=False) From 39e7e417457fbe3232d6324a6949437ca2013b65 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 29 Nov 2021 18:21:18 +0100 Subject: [PATCH 068/347] distribution output --- .../network_head/distributed_network_head.py | 102 +++++++++++++ .../setup/network_head/distribution.py | 139 +++++++++++++++++- .../setup/network_head/fully_connected.py | 15 +- 3 files changed, 243 insertions(+), 13 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head.py b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head.py index e69de29bb..052bf02a3 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head.py @@ -0,0 +1,102 @@ +from abc import abstractmethod +from typing import Any, Dict, Iterable, Tuple, Optional + +import torch +from torch import nn + +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.pipeline.components.setup.network_head.base_network_head import NetworkHeadComponent +from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape +from autoPyTorch.utils.common import FitRequirement + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.network_head.distribution import ALL_DISTRIBUTIONS,NormalOutput, \ + StudentTOutput, BetaOutput, GammaOutput, PoissonOutput + +from autoPyTorch.pipeline.components.setup.network_head.fully_connected import FullyConnectedHead + + + +class DistributedNetworkComponents(NetworkHeadComponent): + """ + Base class for network heads. Holds the head module and the config which was used to create it. + """ + _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series", + "n_prediction_steps", "train_with_log_prob"] + + def __init__(self, + **kwargs: Any): + super().__init__() + self.add_fit_requirements([ + FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), + FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), + FitRequirement('train_with_log_prob', (str, ), user_defined=True, dataset_property=True), + FitRequirement('n_prediction_steps', (int,), user_defined=True, dataset_property=True), + FitRequirement('output_shape', (Iterable, int), user_defined=True, dataset_property=True), + ]) + self.head: nn.Module = None + self.config = kwargs + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + """ + Builds the head component and assigns it to self.head + + Args: + X (X: Dict[str, Any]): Dependencies needed by current component to perform fit + y (Any): not used. 
To comply with sklearn API + Returns: + Self + """ + input_shape = X['dataset_properties']['input_shape'] + output_shape = X['dataset_properties']['output_shape'] + n_prediction_steps = X['dataset_properties']['n_prediction_steps'] + + self.head = self.build_head( + input_shape=get_output_shape(X['network_backbone'], input_shape=input_shape), + output_shape=output_shape, + n_prediction_steps=n_prediction_steps, + ) + return self + + @abstractmethod + def build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], + n_prediction_steps: int =1) -> nn.Module: + """ + Builds the head module and returns it + + Args: + input_shape (Tuple[int, ...]): shape of the input to the head (usually the shape of the backbone output) + output_shape (Tuple[int, ...]): shape of the output of the head + n_prediction_steps (int): how many steps need to be predicted in advance + + Returns: + nn.Module: head module + """ + raise NotImplementedError() + + def _build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], + n_prediction_steps: int =1) -> nn.Module: + """ + Builds the head module and returns it + + Args: + input_shape (Tuple[int, ...]): shape of the input to the head (usually the shape of the backbone output) + output_shape (Tuple[int, ...]): shape of the output of the head + n_prediction_steps (int): how many steps need to be predicted in advance + + Returns: + nn.Module: head module + """ + raise NotImplementedError() + + def build_proj_layer(self, dist_cls: str, head_base_output_features: int, n_prediction_steps: int) ->\ + torch.distributions.Distribution: + """ + Builds a layer that maps the head output features to a torch distribution + """ + if dist_cls not in ALL_DISTRIBUTIONS.keys(): + raise ValueError(f'Unsupported distribution class type: {dist_cls}') + proj_layer = ALL_DISTRIBUTIONS[dist_cls](in_features=head_base_output_features, + n_prediction_steps=n_prediction_steps) + return proj_layer + diff --git a/autoPyTorch/pipeline/components/setup/network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/distribution.py index b836e61eb..37eed9c33 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/distribution.py @@ -1,16 +1,29 @@ -# This part mainly follows the implementation in gluonts: +# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# or in the "license" file accompanying this file. This file is distributed +# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. + +# This part of codes mainly follow the implementation in gluonts: # https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/torch/modules/distribution_output.py # However, we don't simply follow their implementation mainly due to the different network backbone. 
-# Additionally, we rescale the output in the later phases to avoid +# Additionally, scale information is not presented here to avoid + -from typing import Callable, Dict, Optional, Tuple +from typing import Dict, Tuple -import numpy as np +from abc import abstractmethod import torch import torch.nn as nn import torch.nn.functional as F from torch.distributions import ( - AffineTransform, Beta, Distribution, Gamma, @@ -18,7 +31,6 @@ Normal, Poisson, StudentT, - TransformedDistribution, ) @@ -29,8 +41,119 @@ class ProjectionLayer(nn.Module): def __init__(self, in_features: int, n_prediction_steps: int, - args_dims: [int], - domain_map: Callable[..., Tuple[torch.Tensor]], **kwargs,): super().__init__(**kwargs) + # we consider all the prediction steps holistically. thus, the output of the poj layer is + # n_prediction_steps * dim + self.proj = nn.ModuleList( + [nn.Linear(in_features, n_prediction_steps * dim) for dim in self.args_dim.values()] + ) + + def forward(self, x: torch.Tensor) -> torch.distributions: + params_unbounded = [proj(x) for proj in self.proj] + return self.dist_cls(self.domain_map(*params_unbounded)) + + @property + @abstractmethod + def arg_dims(self) -> Dict[str, int]: + raise NotImplementedError + + @abstractmethod + def domain_map(self, *args: torch.Tensor) -> Tuple[torch.Tensor]: + raise NotImplementedError + + @property + @abstractmethod + def dist_cls(self) -> type(Distribution): + raise NotImplementedError + + +class NormalOutput(ProjectionLayer): + @property + def arg_dims(self) -> Dict[str, int]: + return {"loc": 1, "scale": 1} + + def domain_map(self, loc: torch.Tensor, scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + scale = F.softplus(scale) + return loc.squeeze(-1), scale.squeeze(-1) + + @property + def dist_cls(self) -> type(Distribution): + return Normal + + +class StudentTOutput(ProjectionLayer): + @property + def arg_dims(self) -> Dict[str, int]: + return {"df": 1, "loc": 1, "scale": 1} + + def domain_map(self, cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor)\ + -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + scale = F.softplus(scale) + df = 2.0 + F.softplus(df) + return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1) + + @property + def dist_cls(self) -> type(Distribution): + return StudentT + + +class BetaOutput(ProjectionLayer): + @property + def arg_dims(self) -> Dict[str, int]: + return {"concentration1": 1, "concentration0": 1} + + def domain_map(self, concentration1: torch.Tensor, concentration0: torch.Tensor)\ + -> Tuple[torch.Tensor, torch.Tensor]: + # TODO we need to adapt epsilon value given the datatype of this module + epsilon = 1e-10 + concentration1 = F.softplus(concentration1) + epsilon + concentration0 = F.softplus(concentration0) + epsilon + return concentration1.squeeze(dim=-1), concentration0.squeeze(dim=-1) + + @property + def dist_cls(self) -> type(Distribution): + return Beta + + +class GammaOutput(ProjectionLayer): + @property + def arg_dims(self) -> Dict[str, int]: + return {"concentration": 1, "rate": 1} + + def domain_map(self, concentration: torch.Tensor, rate: torch.Tensor)\ + -> Tuple[torch.Tensor, torch.Tensor]: + # TODO we need to adapt epsilon value given the datatype of this module + epsilon = 1e-10 + concentration = F.softplus(concentration) + epsilon + rate = F.softplus(rate) + epsilon + return concentration.squeeze(dim=-1), rate.squeeze(dim=-1) + + @property + def dist_cls(self) -> type(Distribution): + return Gamma + + +class PoissonOutput(ProjectionLayer): + @property + def 
arg_dims(self) -> Dict[str, int]: + return {"rate": 1} + + def domain_map(self, rate: torch.Tensor) -> Tuple[torch.Tensor,]: + rate_pos = F.softplus(rate).clone() + return rate_pos.squeeze(-1), + + @property + def dist_cls(self) -> type(Distribution): + return Poisson + + +ALL_DISTRIBUTIONS = {'normal': NormalOutput, + 'studentT': StudentTOutput, + 'beta': BetaOutput, + 'gamma': GammaOutput, + 'poisson': PoissonOutput} # type: Dict[str, type(ProjectionLayer)] + +#TODO consider how to implement NegativeBinomialOutput without scale information +# class NegativeBinomialOutput(ProjectionLayer): diff --git a/autoPyTorch/pipeline/components/setup/network_head/fully_connected.py b/autoPyTorch/pipeline/components/setup/network_head/fully_connected.py index 99762bbcf..b16ed01d5 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/fully_connected.py +++ b/autoPyTorch/pipeline/components/setup/network_head/fully_connected.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional, Tuple, Union, List import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace @@ -21,6 +21,13 @@ class FullyConnectedHead(NetworkHeadComponent): """ def build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...]) -> nn.Module: + layers, head_base_output_features = self._build_head(input_shape) + out_features = np.prod(output_shape).item() + layers.append(nn.Linear(in_features=head_base_output_features, + out_features=out_features)) + return nn.Sequential(*layers) + + def _build_head(self, input_shape: Tuple[int, ...]) -> Tuple[List[nn.Module], int]: layers = [nn.Flatten()] in_features = np.prod(input_shape).item() for i in range(1, self.config["num_layers"]): @@ -28,10 +35,8 @@ def build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...] 
out_features=self.config[f"units_layer_{i}"])) layers.append(_activations[self.config["activation"]]()) in_features = self.config[f"units_layer_{i}"] - out_features = np.prod(output_shape).item() - layers.append(nn.Linear(in_features=in_features, - out_features=out_features)) - return nn.Sequential(*layers) + head_base_output_features = in_features + return layers, head_base_output_features @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None From 89a744cf9dc256b4dbf0817c45c2d612c22f65d3 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 30 Nov 2021 23:12:42 +0100 Subject: [PATCH 069/347] distributed head --- .../setup/network_backbone/LSTMBackbone.py | 7 ++- .../setup/network_backbone/MLPBackbone.py | 4 +- .../setup/network_backbone/MLPForecasting.py | 9 --- .../setup/network_backbone/TCNBackbone.py | 36 ++++++----- .../network_backbone/TimeSeriesMLPBackbone.py | 56 +++++++++++++++++ .../network_backbone/base_network_backbone.py | 2 + .../components/setup/network_head/__init__.py | 47 +++++++++++--- .../distributed_network_head/__init__.py | 17 +++++ .../distributed_fully_connected.py | 59 +++++++++++++++++ .../distributed_network_head.py | 53 +++++++++------- .../distribution.py | 63 +++++++++++++------ .../time_series_forecasting_data_loader.py | 1 + .../pipeline/time_series_forecasting.py | 6 +- requirements.txt | 27 +------- 14 files changed, 278 insertions(+), 109 deletions(-) delete mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/MLPForecasting.py create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/TimeSeriesMLPBackbone.py create mode 100644 autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_fully_connected.py rename autoPyTorch/pipeline/components/setup/network_head/{ => distributed_network_head}/distributed_network_head.py (64%) rename autoPyTorch/pipeline/components/setup/network_head/{ => distributed_network_head}/distribution.py (68%) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py index 382d94fdb..b2fe70c89 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py @@ -11,7 +11,8 @@ import torch from torch import nn -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import NetworkBackboneComponent +from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import\ + NetworkBackboneComponent from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter @@ -51,8 +52,9 @@ class LSTMBackbone(NetworkBackboneComponent): """ Standard searchable LSTM backbone for time series data """ + _fixed_seq_length = False - def __init__(self, **kwargs: Any): + def __init__(self, **kwargs: Dict): super().__init__(**kwargs) def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: @@ -105,5 +107,4 @@ def get_hyperparameter_search_space( cs.add_condition(CS.AndConjunction(CS.EqualsCondition(dropout, use_dropout, True), CS.GreaterThanCondition(dropout, num_layers, 1))) - return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py index 
f3fb4d7a2..46f3f913d 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py @@ -28,9 +28,11 @@ class MLPBackbone(NetworkBackboneComponent): """ def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: - layers: List[nn.Module] = list() in_features = input_shape[0] + return self._build_backbone(in_features) + def _build_backbone(self, in_features: int, ): + layers: List[nn.Module] = list() self._add_layer(layers, in_features, self.config['num_units_1'], 1) for i in range(2, self.config['num_groups'] + 1): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/MLPForecasting.py b/autoPyTorch/pipeline/components/setup/network_backbone/MLPForecasting.py deleted file mode 100644 index 57a07fff0..000000000 --- a/autoPyTorch/pipeline/components/setup/network_backbone/MLPForecasting.py +++ /dev/null @@ -1,9 +0,0 @@ -from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone -import torch - - -def seq2tab(x: torch.Tensor): - # https://discuss.pytorch.org/t/how-could-i-flatten-two-dimensions-of-a-tensor/44570/4 - return x.view(-1, *x.shape[2:]) - - diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/TCNBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/TCNBackbone.py index 6ea5a179e..be9683ffe 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/TCNBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/TCNBackbone.py @@ -13,7 +13,8 @@ from torch.nn.utils import weight_norm from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import NetworkBackboneComponent +from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import \ + NetworkBackboneComponent from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter @@ -100,6 +101,7 @@ class TCNBackbone(NetworkBackboneComponent): """ Temporal Convolutional Network backbone for time series data (see https://arxiv.org/pdf/1803.01271.pdf). 
""" + _fixed_seq_length = False def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: num_channels = [self.config["num_filters_0"]] @@ -126,22 +128,22 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT @staticmethod def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", - value_range=(1, 10), - default_value=5), - num_filters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_filters", - value_range=(4, 64), - default_value=32), - kernel_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="kernel_size", - value_range=(4, 64), - default_value=32), - use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="use_dropout", - value_range=(True, False), - default_value=False), - dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dropout", - value_range=(0, 0.5), - default_value=0.1), + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", + value_range=(1, 10), + default_value=5), + num_filters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_filters", + value_range=(4, 64), + default_value=32), + kernel_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="kernel_size", + value_range=(4, 64), + default_value=32), + use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="use_dropout", + value_range=(True, False), + default_value=False), + dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dropout", + value_range=(0, 0.5), + default_value=0.1), ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/TimeSeriesMLPBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/TimeSeriesMLPBackbone.py new file mode 100644 index 000000000..c9673f3d2 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/TimeSeriesMLPBackbone.py @@ -0,0 +1,56 @@ +from typing import Any, Dict, List, Optional, Union + + +from typing import Tuple +from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone + +import torch +from torch import nn + + +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.utils.common import FitRequirement + + +class _TimeSeriesMLP(nn.Module): + def __init__(self, + module_layers: nn.Module, + ): + self.module_layers = module_layers + super().__init__() + + def forward(self, x: torch.Tensor): + # https://discuss.pytorch.org/t/how-could-i-flatten-two-dimensions-of-a-tensor/44570/4 + x = x.view(-1, *x.shape[2:]) + return self.module_layers(x) + + +class TimeSeriesMLPBackbone(MLPBackbone): + _fixed_seq_length = True + window_size = 1 + + @property + def _required_fit_arguments(self) -> List[FitRequirement]: + requirements_list = super()._required_fit_arguments + requirements_list.append(FitRequirement('window_size', (str,), user_defined=False, dataset_property=False)) + return requirements_list + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + self.window_size = X["window_size"] + return super().fit(X, y) + + def 
build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: + in_features = input_shape[0] * self.window_size + return self._build_backbone(in_features) + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'TSMLPBackbone', + 'name': 'TimeSeriesMLPBackbone', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index e93a455de..eb3f75336 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -26,6 +26,7 @@ class NetworkBackboneComponent(autoPyTorchComponent): Base class for network backbones. Holds the backbone module and the config which was used to create it. """ _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"] + _fixed_seq_length = False # only used for time series tasks, if the input seq_length needs to be fixed def __init__(self, **kwargs: Any): @@ -153,3 +154,4 @@ def get_name(cls) -> str: str: Name of the backbone """ return str(cls.get_properties()["shortname"]) + diff --git a/autoPyTorch/pipeline/components/setup/network_head/__init__.py b/autoPyTorch/pipeline/components/setup/network_head/__init__.py index 34163b986..1e00d02d1 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_head/__init__.py @@ -18,12 +18,20 @@ NetworkHeadComponent, ) + directory = os.path.split(__file__)[0] _heads = find_components(__package__, directory, NetworkHeadComponent) _addons = ThirdPartyComponents(NetworkHeadComponent) +# avoid path pollution, (otherwise FC layer will not be correctly detected) +from autoPyTorch.pipeline.components.setup.network_head.distributed_network_head import _distributed_heads, \ + _distributed_addons + +from autoPyTorch.pipeline.components.setup.network_head.distributed_network_head.distributed_network_head import \ + DistributionNetworkHeadComponents + def add_head(head: NetworkHeadComponent) -> None: _addons.add_component(head) @@ -44,6 +52,10 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: components = OrderedDict() components.update(_heads) components.update(_addons.components) + + components.update(_distributed_heads) + components.update(_distributed_addons.components) + return components def get_available_components( @@ -104,6 +116,10 @@ def get_available_components( continue elif 'time_series' in task_type and not bool(properties['handles_time_series']): continue + train_with_log_prob = dataset_properties.get("train_with_log_prob", False) + if train_with_log_prob: + if not issubclass(entry, DistributionNetworkHeadComponents): + continue # target_type = dataset_properties['target_type'] # Apply some automatic filtering here for @@ -112,7 +128,6 @@ def get_available_components( # is not recommended for a certain dataset components_dict[name] = entry - return components_dict def get_hyperparameter_search_space( @@ -147,16 +162,27 @@ def get_hyperparameter_search_space( if len(available_heads) == 0: raise ValueError("No head found") + train_with_log_prob = dataset_properties.get("train_with_log_prob", False) + if train_with_log_prob: + if default is None: + defaults = [ + 
'DistributionFullyConnectedHead', + ] + for default_ in defaults: + if default_ in available_heads: + default = default_ + break + else: + if default is None: + defaults = [ + 'FullyConnectedHead', + 'FullyConvolutional2DHead', + ] + for default_ in defaults: + if default_ in available_heads: + default = default_ + break - if default is None: - defaults = [ - 'FullyConnectedHead', - 'FullyConvolutional2DHead', - ] - for default_ in defaults: - if default_ in available_heads: - default = default_ - break updates = self._get_search_space_updates() if '__choice__' in updates.keys(): choice_hyperparameter = updates['__choice__'] @@ -185,6 +211,7 @@ def get_hyperparameter_search_space( parent_hyperparameter=parent_hyperparameter ) + self.configuration_space_ = cs self.dataset_properties_ = dataset_properties return cs diff --git a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/__init__.py b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/__init__.py new file mode 100644 index 000000000..4b83096d7 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/__init__.py @@ -0,0 +1,17 @@ +import os + +from autoPyTorch.pipeline.components.setup.network_head.distributed_network_head.distributed_network_head import ( + DistributionNetworkHeadComponents, +) + +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + find_components, +) + +directory = os.path.split(__file__)[0] +_distributed_heads = find_components(__package__, + directory, + DistributionNetworkHeadComponents) + +_distributed_addons = ThirdPartyComponents(DistributionNetworkHeadComponents) diff --git a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_fully_connected.py b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_fully_connected.py new file mode 100644 index 000000000..e0ff4f925 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_fully_connected.py @@ -0,0 +1,59 @@ +from typing import Dict, Optional, Tuple, Union, List + +from torch import nn + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.network_head.utils import _activations +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter + +from autoPyTorch.pipeline.components.setup.network_head.distributed_network_head.distribution import ALL_DISTRIBUTIONS +from autoPyTorch.pipeline.components.setup.network_head.distributed_network_head.distributed_network_head import \ + DistributionNetworkHeadComponents +from autoPyTorch.pipeline.components.setup.network_head.fully_connected import FullyConnectedHead + + +class DistributionFullyConnectedHead(DistributionNetworkHeadComponents, FullyConnectedHead): + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'DistributionFullyConnectedHead', + 'name': 'DistributionFullyConnectedHead', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + def _build_head(self, input_shape: Tuple[int, ...]) -> Tuple[List[nn.Module], int]: + return FullyConnectedHead._build_head(self, input_shape) + + @staticmethod + def 
get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + num_layers: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_layers", + value_range=(1, 4), + default_value=2), + units_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="units_layer", + value_range=(64, 512), + default_value=128), + activation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="activation", + value_range=tuple(_activations.keys()), + default_value=list(_activations.keys())[0]), + dist_cls: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dist_cls", + value_range=tuple(ALL_DISTRIBUTIONS.keys()), + default_value=list(ALL_DISTRIBUTIONS.keys())[0]), + auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="auto_regressive", + value_range=(True, False), + default_value=False) + ) -> ConfigurationSpace: + cs = FullyConnectedHead.get_hyperparameter_search_space(dataset_properties=dataset_properties, + num_layers=num_layers, + units_layer=units_layer, + activation=activation) + + add_hyperparameter(cs, dist_cls, CategoricalHyperparameter) + add_hyperparameter(cs, auto_regressive, CategoricalHyperparameter) + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head.py b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_network_head.py similarity index 64% rename from autoPyTorch/pipeline/components/setup/network_head/distributed_network_head.py rename to autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_network_head.py index 052bf02a3..3ff766a23 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_network_head.py @@ -1,6 +1,7 @@ from abc import abstractmethod -from typing import Any, Dict, Iterable, Tuple, Optional +from typing import Any, Dict, Iterable, Tuple, List, Optional +import numpy as np import torch from torch import nn @@ -9,17 +10,13 @@ from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_head.distribution import ALL_DISTRIBUTIONS,NormalOutput, \ - StudentTOutput, BetaOutput, GammaOutput, PoissonOutput +from autoPyTorch.pipeline.components.setup.network_head.distributed_network_head.distribution import ALL_DISTRIBUTIONS -from autoPyTorch.pipeline.components.setup.network_head.fully_connected import FullyConnectedHead - - -class DistributedNetworkComponents(NetworkHeadComponent): +class DistributionNetworkHeadComponents(NetworkHeadComponent): """ - Base class for network heads. Holds the head module and the config which was used to create it. + Base class for network heads used for distribution output. + Holds the head module and the config which was used to create it. 
""" _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series", "n_prediction_steps", "train_with_log_prob"] @@ -30,11 +27,11 @@ def __init__(self, self.add_fit_requirements([ FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), - FitRequirement('train_with_log_prob', (str, ), user_defined=True, dataset_property=True), + FitRequirement('train_with_log_prob', (str,), user_defined=True, dataset_property=True), FitRequirement('n_prediction_steps', (int,), user_defined=True, dataset_property=True), FitRequirement('output_shape', (Iterable, int), user_defined=True, dataset_property=True), ]) - self.head: nn.Module = None + self.head: Optional[nn.Module] = None self.config = kwargs def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: @@ -49,33 +46,39 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: """ input_shape = X['dataset_properties']['input_shape'] output_shape = X['dataset_properties']['output_shape'] - n_prediction_steps = X['dataset_properties']['n_prediction_steps'] + auto_regressive = self.config.get("auto_regressive", False) + X.update({"auto_regressive": auto_regressive}) + # TODO consider Auto-regressive model on vanilla network head + if auto_regressive: + output_shape[0] = 1 self.head = self.build_head( input_shape=get_output_shape(X['network_backbone'], input_shape=input_shape), output_shape=output_shape, - n_prediction_steps=n_prediction_steps, ) return self - @abstractmethod - def build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], - n_prediction_steps: int =1) -> nn.Module: + def build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...]) -> nn.Module: """ Builds the head module and returns it Args: input_shape (Tuple[int, ...]): shape of the input to the head (usually the shape of the backbone output) output_shape (Tuple[int, ...]): shape of the output of the head - n_prediction_steps (int): how many steps need to be predicted in advance Returns: nn.Module: head module """ - raise NotImplementedError() + base_header_layer, num_head_base_output_features = self._build_head(input_shape) + # TODO consider other form of proj layers + proj_layer = self.build_proj_layer(dist_cls=self.config["dist_cls"], + num_head_base_output_features=num_head_base_output_features, + output_shape=output_shape, + ) + return nn.Sequential(*base_header_layer, proj_layer) - def _build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], - n_prediction_steps: int =1) -> nn.Module: + @abstractmethod + def _build_head(self, input_shape: Tuple[int, ...]) -> Tuple[List[nn.Module], int]: """ Builds the head module and returns it @@ -89,14 +92,16 @@ def _build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ... 
""" raise NotImplementedError() - def build_proj_layer(self, dist_cls: str, head_base_output_features: int, n_prediction_steps: int) ->\ + @staticmethod + def build_proj_layer(dist_cls: str, + num_head_base_output_features: int, + output_shape: Tuple[int, ...],) -> \ torch.distributions.Distribution: """ Builds a layer that maps the head output features to a torch distribution """ if dist_cls not in ALL_DISTRIBUTIONS.keys(): raise ValueError(f'Unsupported distribution class type: {dist_cls}') - proj_layer = ALL_DISTRIBUTIONS[dist_cls](in_features=head_base_output_features, - n_prediction_steps=n_prediction_steps) + proj_layer = ALL_DISTRIBUTIONS[dist_cls](num_in_features=num_head_base_output_features, + output_shape=output_shape,) return proj_layer - diff --git a/autoPyTorch/pipeline/components/setup/network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distribution.py similarity index 68% rename from autoPyTorch/pipeline/components/setup/network_head/distribution.py rename to autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distribution.py index 37eed9c33..4fe8715ff 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distribution.py @@ -20,6 +20,8 @@ from typing import Dict, Tuple from abc import abstractmethod + +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -38,20 +40,46 @@ class ProjectionLayer(nn.Module): """ A projection layer that """ + def __init__(self, - in_features: int, - n_prediction_steps: int, - **kwargs,): + num_in_features: int, + output_shape: Tuple[int, ...], + **kwargs, ): super().__init__(**kwargs) + # we consider all the prediction steps holistically. thus, the output of the poj layer is - # n_prediction_steps * dim + # n_prediction_steps * dim *output_shape + + def build_single_proj_layer(arg_dim): + """ + build a single proj layer given the input dims, the output is unflattened to fit the required output_shape + and n_prediction_steps. 
+ Args: + arg_dim: dimension of the target distribution + + Returns: + + """ + return nn.Sequential(nn.Linear(num_in_features, np.prod(output_shape).item() * arg_dim), + nn.Unflatten(-1, (*output_shape, arg_dim))) + self.proj = nn.ModuleList( - [nn.Linear(in_features, n_prediction_steps * dim) for dim in self.args_dim.values()] + [build_single_proj_layer(dim) for dim in self.arg_dims.values()] ) def forward(self, x: torch.Tensor) -> torch.distributions: + """ + get a target distribution + Args: + x: input tensor ([batch_size, in_features]): input tensor, acquired by the base header, have the shape + [batch_size, in_features] + + Returns: + dist: torch.distributions ([batch_size, n_prediction_steps, output_shape]): an output torch distribution + with shape (batch_size, n_prediction_steps, output_shape) + """ params_unbounded = [proj(x) for proj in self.proj] - return self.dist_cls(self.domain_map(*params_unbounded)) + return self.dist_cls(*self.domain_map(*params_unbounded)) @property @abstractmethod @@ -75,7 +103,7 @@ def arg_dims(self) -> Dict[str, int]: def domain_map(self, loc: torch.Tensor, scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: scale = F.softplus(scale) - return loc.squeeze(-1), scale.squeeze(-1) + return loc.squeeze(-1).squeeze(-1), scale.squeeze(-1).squeeze(-1) @property def dist_cls(self) -> type(Distribution): @@ -87,11 +115,11 @@ class StudentTOutput(ProjectionLayer): def arg_dims(self) -> Dict[str, int]: return {"df": 1, "loc": 1, "scale": 1} - def domain_map(self, cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor)\ + def domain_map(self, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor) \ -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: scale = F.softplus(scale) df = 2.0 + F.softplus(df) - return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1) + return df.squeeze(-1).squeeze(-1), loc.squeeze(-1).squeeze(-1), scale.squeeze(-1).squeeze(-1) @property def dist_cls(self) -> type(Distribution): @@ -103,13 +131,13 @@ class BetaOutput(ProjectionLayer): def arg_dims(self) -> Dict[str, int]: return {"concentration1": 1, "concentration0": 1} - def domain_map(self, concentration1: torch.Tensor, concentration0: torch.Tensor)\ + def domain_map(self, concentration1: torch.Tensor, concentration0: torch.Tensor) \ -> Tuple[torch.Tensor, torch.Tensor]: # TODO we need to adapt epsilon value given the datatype of this module epsilon = 1e-10 concentration1 = F.softplus(concentration1) + epsilon concentration0 = F.softplus(concentration0) + epsilon - return concentration1.squeeze(dim=-1), concentration0.squeeze(dim=-1) + return concentration1.squeeze(-1).squeeze(-1), concentration0.squeeze(-1).squeeze(-1) @property def dist_cls(self) -> type(Distribution): @@ -121,13 +149,13 @@ class GammaOutput(ProjectionLayer): def arg_dims(self) -> Dict[str, int]: return {"concentration": 1, "rate": 1} - def domain_map(self, concentration: torch.Tensor, rate: torch.Tensor)\ + def domain_map(self, concentration: torch.Tensor, rate: torch.Tensor) \ -> Tuple[torch.Tensor, torch.Tensor]: # TODO we need to adapt epsilon value given the datatype of this module epsilon = 1e-10 concentration = F.softplus(concentration) + epsilon rate = F.softplus(rate) + epsilon - return concentration.squeeze(dim=-1), rate.squeeze(dim=-1) + return concentration.squeeze(-1).squeeze(-1), rate.squeeze(-1).squeeze(-1) @property def dist_cls(self) -> type(Distribution): @@ -141,19 +169,18 @@ def arg_dims(self) -> Dict[str, int]: def domain_map(self, rate: torch.Tensor) -> Tuple[torch.Tensor,]: 
rate_pos = F.softplus(rate).clone() - return rate_pos.squeeze(-1), + return rate_pos.squeeze(-1).squeeze(-1), @property def dist_cls(self) -> type(Distribution): return Poisson -ALL_DISTRIBUTIONS = {'normal': NormalOutput, - 'studentT': StudentTOutput, +ALL_DISTRIBUTIONS = {'studentT': StudentTOutput, + 'normal': NormalOutput, 'beta': BetaOutput, 'gamma': GammaOutput, 'poisson': PoissonOutput} # type: Dict[str, type(ProjectionLayer)] - -#TODO consider how to implement NegativeBinomialOutput without scale information +# TODO consider how to implement NegativeBinomialOutput without scale information # class NegativeBinomialOutput(ProjectionLayer): diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index e5a24d4d0..2f29f27e3 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -204,6 +204,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: Returns: A instance of self """ + X["window_size"] = self.window_size sample_interval = X.get('sample_interval', 1) self.sample_interval = sample_interval diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 51f9c77d7..1061fb313 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -188,6 +188,8 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L """ Defines what steps a pipeline should follow. The step itself has choices given via autoPyTorchChoice. 
+        One key difference between the forecasting pipeline and the other pipelines is that "data_loader" is placed
+        before "network_backbone", so that the backbone can use information (e.g. the window size) set by the data loader.
 
         Returns:
             List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised
@@ -204,6 +206,8 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L
             ("scaler", ScalerChoice(default_dataset_properties,
                                     random_state=self.random_state)),
             ("time_series_transformer", TimeSeriesTransformer(random_state=self.random_state)),
             ("preprocessing", EarlyPreprocessing(random_state=self.random_state)),
+            ("data_loader", TimeSeriesForecastingDataLoader(upper_sequence_length=self.upper_sequence_length,
+                                                            random_state=self.random_state)),
             ("network_backbone", NetworkBackboneChoice(default_dataset_properties,
                                                        random_state=self.random_state)),
             ("network_head", NetworkHeadChoice(default_dataset_properties,
@@ -215,8 +219,6 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L
                                               random_state=self.random_state)),
             ("lr_scheduler", SchedulerChoice(default_dataset_properties,
                                              random_state=self.random_state)),
-            ("data_loader", TimeSeriesForecastingDataLoader(upper_sequence_length=self.upper_sequence_length,
-                                                            random_state=self.random_state)),
             ("trainer", TrainerChoice(default_dataset_properties,
                                       random_state=self.random_state)),
         ])
         return steps
diff --git a/requirements.txt b/requirements.txt
index 9cc5e25de..b54faec45 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,27 +1,3 @@
-<<<<<<< HEAD
-setuptools
-Cython==0.29.21
-netifaces==0.10.9
-numpy==1.19.5
-pandas==1.2.0
-scipy==1.6.0
-statsmodels==0.12.1
-scikit-learn==0.23.0
-imbalanced-learn==0.7.0
-imblearn==0.0
-ConfigSpace==0.4.17
-pynisher==0.6.3
-hpbandster==0.7.4
-fasteners==0.16
-torch==1.7.1
-torchvision==0.8.2
-tensorboard-logger==0.1.0
-openml==0.11.0
-lightgbm==3.1.1
-catboost==0.24.4
-pexpect==4.8.0
-sktime==0.8.0
-=======
 pandas
 torch
 torchvision
@@ -41,4 +17,5 @@
 catboost
 lightgbm
 flaky
 tabulate
->>>>>>> upstream/master
+sktime>=0.8.0
+

From e81b24d73308fdfa13575995308721ab02985926 Mon Sep 17 00:00:00 2001
From: dengdifan
Date: Tue, 30 Nov 2021 23:23:46 +0100
Subject: [PATCH 070/347] avoid additional computation when metric is not required

---
 .../pipeline/components/training/trainer/base_trainer.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py
index e447db911..4d46a3fc0 100644
--- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py
+++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py
@@ -302,9 +302,10 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int,
 
             loss, outputs = self.train_step(data, targets)
 
-            # save for metric evaluation
-            outputs_data.append(outputs.detach().cpu())
-            targets_data.append(targets.detach().cpu())
+            if self.metrics_during_training:
+                # save for metric evaluation
+                outputs_data.append(outputs.detach().cpu())
+                targets_data.append(targets.detach().cpu())
 
             batch_size = data.size(0)
             loss_sum += loss * batch_size

From 23d2a4ddcf49166223734f097da095a456185a45 Mon Sep 17 00:00:00 2001
From: dengdifan
Date: Tue, 30 Nov 2021 23:37:03 +0100
Subject: [PATCH 071/347] maint

---
 autoPyTorch/api/base_task.py                  |  4 ++++
 .../test_pipeline/components/training/base.py | 22 +++++++-------------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
index 
ac21863e5..42aec5f8c 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1163,6 +1163,10 @@ def _get_fit_dictionary( 'split_id': split_id, 'num_run': self._backend.get_next_num_run(), }) + if self.time_series_forecasting: + warnings.WarningMessage("Currently Time Series Forecasting tasks do not allow computing metrics " + "during training. It will be automatically set as False") + self.pipeline_options["metrics_during_training"] = False X.update(self.pipeline_options) return X diff --git a/test/test_pipeline/components/training/base.py b/test/test_pipeline/components/training/base.py index 9e937340a..1e85df5e6 100644 --- a/test/test_pipeline/components/training/base.py +++ b/test/test_pipeline/components/training/base.py @@ -1,3 +1,4 @@ +from typing import Any, Optional, Dict, List, Tuple import logging from sklearn.datasets import make_classification, make_regression @@ -11,16 +12,15 @@ CONTINUOUS, OUTPUT_TYPES_TO_STRING, REGRESSION_TASKS, - TASK_TYPES_TO_STRING -<<<<<<< HEAD:test/test_pipeline/components/base.py + TASK_TYPES_TO_STRING) + from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import \ TabularColumnTransformer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder_choice import \ - EncoderChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import EncoderChoice from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler_choice import \ - ScalerChoice as TabularScalerChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice as\ + TabularScalerChoice from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ TimeSeriesTransformer from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ @@ -29,11 +29,7 @@ from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline from autoPyTorch.pipeline.time_series_classification import TimeSeriesClassificationPipeline -======= -) -from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics -from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker ->>>>>>> upstream/master:test/test_pipeline/components/training/base.py + class BaseTraining: @@ -144,7 +140,6 @@ def train_model(self, # Backward pass loss.backward() optimizer.step() -<<<<<<< HEAD:test/test_pipeline/components/base.py class TabularPipeline(TabularClassificationPipeline): @@ -195,5 +190,4 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], ("time_series_transformer", TimeSeriesTransformer()), ]) return steps -======= ->>>>>>> upstream/master:test/test_pipeline/components/training/base.py + From 6233706ccc4feb4183b895cf5be6b729538f51a3 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 2 Dec 2021 22:11:28 +0100 Subject: [PATCH 072/347] pass scaling to trainer --- autoPyTorch/datasets/time_series_dataset.py | 15 ++- .../TimeSeriesTransformer.py | 38 ++++--- .../scaling/MaxAbsScaler.py | 4 +- .../scaling/MinMaxScaler.py | 4 +- 
.../scaling/NoScaler.py | 3 +- .../scaling/StandardScaler.py | 4 +- .../scaling/utils.py | 99 ++++++------------- .../time_series_forecasting_data_loader.py | 18 +--- .../pipeline/time_series_forecasting.py | 3 +- 9 files changed, 72 insertions(+), 116 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index f25c90af3..c93402811 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -29,6 +29,8 @@ ) from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ + TimeSeriesTransformer from autoPyTorch.utils.common import FitRequirement from autoPyTorch.constants_forecasting import SEASONALITY_MAP @@ -71,7 +73,7 @@ def __init__(self, self.train_transform = train_transforms self.val_transform = val_transforms - def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]: + def __getitem__(self, index: int, train: bool = True) -> Tuple[Dict[str, torch.Tensor], Optional[torch.Tensor]]: """ get a subsequent of time series data, unlike vanilla tabular dataset, we obtain all the previous sequences until the given index, this allows us to do further transformation when the @@ -92,9 +94,12 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]: X = self.X[:index + 1] if self.train_transform is not None and train: - X = self.train_transform(X) + X, loc, scale = self.train_transform(X) elif self.val_transform is not None and not train: - X = self.val_transform(X) + X, loc, scale = self.val_transform(X) + else: + loc = 0.0 + scale = 1.0 # In case of prediction, the targets are not provided Y = self.Y @@ -107,7 +112,7 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]: else: Y = None - return X, Y + return {"value": torch.from_numpy(X), "loc": torch.from_numpy(loc), "scale": torch.from_numpy(scale)}, Y def __len__(self) -> int: return self.X.shape[0] @@ -273,7 +278,7 @@ def __init__(self, self.output_shape = [self.n_prediction_steps, num_target] # TODO: Look for a criteria to define small enough to preprocess - self.is_small_preprocess = True + self.is_small_preprocess = False self.task_type = TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING] diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py index f1d75e401..46d24518e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py @@ -3,6 +3,8 @@ import numpy as np from sklearn.pipeline import Pipeline, make_pipeline +#from sktime.transformations.panel.compose import ColumnTransformer +from sklearn.compose import ColumnTransformer import torch @@ -17,10 +19,11 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N super().__init__() self.random_state = random_state self.preprocessor: Optional[Pipeline] = None - self.is_training = True self.add_fit_requirements([ FitRequirement('numerical_features', (List,), user_defined=True, dataset_property=True), FitRequirement('categorical_features', (List,), user_defined=True, dataset_property=True)]) + self.loc = 0. + self.scale = 1. 
def fit(self, X: Dict[str, Any], y: Any = None) -> "TimeSeriesTransformer": """ @@ -33,27 +36,31 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TimeSeriesTransformer": "TabularColumnTransformer": an instance of self """ self.check_requirements(X, y) + numerical_pipeline = 'drop' + categorical_pipeline = 'drop' preprocessors = get_time_series_preprocessers(X) - if len(X['dataset_properties']['categorical_features']): - raise ValueError("Categorical features are not yet supported for time series") + if len(X['dataset_properties']['numerical_columns']): + numerical_pipeline = make_pipeline(*preprocessors['numerical']) + if len(X['dataset_properties']['categorical_columns']): + categorical_pipeline = make_pipeline(*preprocessors['categorical']) - numerical_pipeline = make_pipeline(*preprocessors['numerical']) - - self.preprocessor = numerical_pipeline + # as X_train is a 2d array here, we simply use ColumnTransformer from sklearn + self.preprocessor = ColumnTransformer([ + ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']), + ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])], + remainder='passthrough' + ) + """ # Where to get the data -- Prioritize X_train if any else # get from backend - # TODO consider how to handle the inconsistency between Transformer and Datasets - X_train = X['backend'].load_datamanager().train_tensors[0] - """ if 'X_train' in X: X_train = subsampler(X['X_train'], X['train_indices']) else: X_train = X['backend'].load_datamanager().train_tensors[0] """ - self.preprocessor.fit(X_train) return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @@ -68,18 +75,19 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'time_series_transformer': self}) return X - def eval(self): - self.is_training = False - self.preprocessor.set_params(timeseriesscaler__is_training=False) - def __call__(self, X: Union[np.ndarray, torch.tensor]) -> Union[np.ndarray, torch.tensor]: if self.preprocessor is None: raise ValueError("cant call {} without fitting the column transformer first." .format(self.__class__.__name__)) + self.preprocessor.fit(X) #if len(X.shape) == 2: # # expand batch dimension when called on a single record # X = X[np.newaxis, ...] 
- return self.preprocessor.transform(X) + scaler = self.preprocessor.named_transformers_['numerical_pipeline']['timeseriesscaler'] + loc = scaler.loc + scale = scaler.scale + + return self.preprocessor.transform(X), loc, scale diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py index e687f97b5..4818e20b4 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py @@ -22,9 +22,7 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: self.check_requirements(X, y) - sequence_lengths_train = X['dataset_properties']['sequence_lengths_train'] - self.preprocessor['numerical'] = TimeSeriesScaler(mode="max_abs", - sequence_lengths_train=sequence_lengths_train) + self.preprocessor['numerical'] = TimeSeriesScaler(mode="max_abs") return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py index 2c105f616..c23d8cf06 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py @@ -21,9 +21,7 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: self.check_requirements(X, y) - sequence_lengths_train = X['dataset_properties']['sequence_lengths_train'] - - self.preprocessor["numerical"] = TimeSeriesScaler(mode="min_max", sequence_lengths_train=sequence_lengths_train) + self.preprocessor["numerical"] = TimeSeriesScaler(mode="min_max") return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py index c171b81d0..67a2c55fd 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py @@ -32,8 +32,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: """ self.check_requirements(X, y) - sequence_lengths_train = X['dataset_properties']['sequence_lengths_train'] - self.preprocessor["numerical"] = TimeSeriesScaler(mode="none", sequence_lengths_train=sequence_lengths_train) + self.preprocessor["numerical"] = TimeSeriesScaler(mode="none") return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py index 3cf9bb960..b831e222a 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py @@ -21,9 +21,7 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: self.check_requirements(X, y) - 
sequence_lengths_train = X['dataset_properties']['sequence_lengths_train'] - self.preprocessor['numerical'] = TimeSeriesScaler(mode="standard", - sequence_lengths_train=sequence_lengths_train) + self.preprocessor['numerical'] = TimeSeriesScaler(mode="standard") return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index cec1255dd..97bfe8f76 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -1,4 +1,4 @@ -from typing import Any, List, Callable, Optional +from typing import Any, List, Callable, Optional, Union, Tuple import numpy as np @@ -9,29 +9,42 @@ # Similar to / inspired by # https://github.com/tslearn-team/tslearn/blob/a3cf3bf/tslearn/preprocessing/preprocessing.py class TimeSeriesScaler(BaseEstimator): - def __init__(self, mode: str, sequence_lengths_train: List[int], is_training=True): + def __init__(self, mode: str): self.mode = mode - self.sequence_lengths_train = sequence_lengths_train - self.is_training = is_training + #self.loc = 0. # type: Union[np.ndarray, float] + #self.scale = 1. # type: Union[np.ndarray, float] def fit(self, X: np.ndarray, y: Any = None) -> "TimeSeriesScaler": """ - For time series we do not need to fit anything since each time series is scaled individually + The transformer is transformed on the fly (for each batch) """ - return self + # we assuem that the last two + if self.mode == "standard": + self.loc = np.mean(X, axis=-2, keepdims=True) + self.scale = np.std(X, axis=-2, keepdims=True) + + elif self.mode == "min_max": + min_ = np.min(X, axis=-2, keepdims=True) + max_ = np.max(X, axis=-2, keepdims=True) + + diff_ = max_ - min_ + self.loc = min_ + self.scale = diff_ - def eval(self): - self.is_training = False + elif self.mode == "max_abs": + max_abs_ = np.max(np.abs(X), axis=-2, keepdims=True) + max_abs_[max_abs_ == 0.0] = 1.0 + self.loc = np.zeros_like(max_abs_) + self.scale = max_abs_ - def scale_individual_seq(self, X, scaling: Callable): - idx_start = 0 - for seq_length_train in self.sequence_lengths_train: - idx_end = seq_length_train + idx_start - X[idx_start: idx_end] = scaling(X[idx_start: idx_end]) - idx_start = idx_end - return X + elif self.mode == "none": + self.loc = np.zeros([*X.shape[:-2], 1, X.shape[-1]]) + self.scale = np.ones([*X.shape[:-2], 1, X.shape[-1]]) + else: + raise ValueError(f"Unknown mode {self.mode} for time series scaler") + return self - def transform(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: np.ndarray) -> Tuple[np.ndarray, ...]: """ X = sklearn.utils.check_array( X, @@ -40,59 +53,9 @@ def transform(self, X: np.ndarray) -> np.ndarray: allow_nd=True, accept_sparse=False, accept_large_sparse=False - ) + ) # type: np.ndarray """ - if self.mode == "standard": - #mean_ = np.mean(X, axis=1, keepdims=True) - #std_ = np.std(X, axis=1, keepdims=True) - #std_[std_ == 0.0] = 1.0 - def standard_scaling(x_seq): - mean_ = np.mean(x_seq) - std_ = np.std(x_seq) - if std_ == 0.0: - std_ = 1.0 - return (x_seq - mean_) / std_ + return (X - self.loc) / self.scale - if self.is_training: - return self.scale_individual_seq(X, standard_scaling) - else: - return standard_scaling(X) - elif self.mode == "min_max": - #min_ = np.min(X, axis=1, keepdims=True) - #max_ = np.max(X, axis=1, keepdims=True) - def 
min_max_scaling(x_seq): - min_ = np.min(x_seq) - max_ = np.max(x_seq) - - diff_ = max_ - min_ - if diff_ == 0.0: - diff_ = 1.0 - - return (x_seq - min_) / diff_ - if self.is_training: - return self.scale_individual_seq(X, min_max_scaling) - else: - return min_max_scaling(X) - - - elif self.mode == "max_abs": - #max_abs_ = np.max(np.abs(X), axis=1, keepdims=True) - #max_abs_[max_abs_ == 0.0] = 1.0 - def max_abs_scaling(x_seq): - max_abs_ = np.max(np.abs(x_seq)) - if max_abs_ == 0.0: - max_abs_ = 1.0 - - return x_seq / max_abs_ - if self.is_training: - return self.scale_individual_seq(X, max_abs_scaling) - else: - return max_abs_scaling(X) - - elif self.mode == "none": - return X - - else: - raise ValueError(f"Unknown mode {self.mode} for time series scaler") diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 2f29f27e3..94c4c90bb 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -2,8 +2,6 @@ from torch.utils.data.sampler import SubsetRandomSampler -from autoPyTorch.pipeline.components.training.data_loader.time_series_data_loader import TimeSeriesDataLoader - from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( UniformIntegerHyperparameter, Constant @@ -236,7 +234,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: # Overwrite the datamanager with the pre-processes data datamanager.replace_data(X['X_train'], X['X_test'] if 'X_test' in X else None) self.dataset_small_preprocess = True - self.preprocess_transforms_test = X['preprocess_transforms'] else: self.dataset_small_preprocess = False @@ -330,10 +327,10 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform window_size=self.window_size, subseq_length=self.subseq_length))) candidate_transformations.append((ExpandTransformTimeSeries())) + if "test" in mode or not X['dataset_properties']['is_small_preprocess']: + candidate_transformations.extend(X['preprocess_transforms']) - # Transform to tensor - candidate_transformations.append(torch.from_numpy) - + # We transform to tensor under dataset return torchvision.transforms.Compose(candidate_transformations) def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.ndarray] = None, @@ -348,15 +345,6 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd if isinstance(X, np.ndarray): X = X[-self.subseq_length - self.n_prediction_steps + 1:] - if self.dataset_small_preprocess: - for preprocess in self.preprocess_transforms_test: - if isinstance(preprocess, TimeSeriesTransformer): - if preprocess.is_training: - preprocess.eval() - - transform = torchvision.transforms.Compose(self.preprocess_transforms_test) - X = transform(X) - if y is not None: # we want to make sure that X, and y can be mapped one to one (as sampling y requires a shifted value) y = y[-self.subseq_length - self.n_prediction_steps + 1:] diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 1061fb313..612681b22 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -179,7 +179,6 @@ def _get_hyperparameter_search_space(self, raise 
ValueError("Cannot find a legal default configuration") cs.get_hyperparameter('network_embedding:__choice__').default_value = default - self.configuration_space = cs self.dataset_properties = dataset_properties return cs @@ -200,7 +199,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L default_dataset_properties = {'target_type': 'time_series_prediction'} if dataset_properties is not None: default_dataset_properties.update(dataset_properties) - + # TODO consider the correct way of doing imputer for time series forecasting tasks. steps.extend([ ("imputer", SimpleImputer(random_state=self.random_state)), ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), From f5e8d4f9eb4c19b94c7d444b83452b22613f0b4f Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 3 Dec 2021 18:57:36 +0100 Subject: [PATCH 073/347] forecasting trainer --- autoPyTorch/api/time_series_forecasting.py | 2 +- autoPyTorch/datasets/time_series_dataset.py | 28 ++- ...time_series_forecasting_train_evaluator.py | 14 +- .../scaling/utils.py | 17 +- .../time_series_forecasting_data_loader.py | 71 +++--- .../components/training/metrics/metrics.py | 28 ++- .../components/training/metrics/utils.py | 7 +- .../components/training/trainer/__init__.py | 4 +- .../training/trainer/base_trainer.py | 3 +- .../ForecastingMixUpTrainer.py | 17 ++ .../ForecastingStandardTrainer.py | 17 ++ .../trainer/forecasting_trainer/__init__.py | 48 ++++ .../forecasting_base_trainer.py | 234 ++++++++++++++++++ .../pipeline/time_series_forecasting.py | 4 +- 14 files changed, 435 insertions(+), 59 deletions(-) create mode 100644 autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingMixUpTrainer.py create mode 100644 autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingStandardTrainer.py create mode 100644 autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py create mode 100644 autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 5e6b342f4..2784ca67e 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -304,7 +304,7 @@ def search( seasonality = SEASONALITY_MAP.get(self.dataset.freq, 1) if isinstance(seasonality, list): seasonality = min(seasonality) # Use to calculate MASE - self._metrics_kwargs = {'sp': seasonality, + self._metrics_kwargs = {'sp': self.dataset.seasonality, 'n_prediction_steps': n_prediction_steps} return self._search( diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index c93402811..2c4418cb3 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -73,7 +73,8 @@ def __init__(self, self.train_transform = train_transforms self.val_transform = val_transforms - def __getitem__(self, index: int, train: bool = True) -> Tuple[Dict[str, torch.Tensor], Optional[torch.Tensor]]: + def __getitem__(self, index: int, train: bool = True) \ + -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, torch.Tensor]]]: """ get a subsequent of time series data, unlike vanilla tabular dataset, we obtain all the previous sequences until the given index, this allows us to do further transformation when the @@ -83,7 +84,7 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[Dict[str, torch.T train (bool): 
Whether to apply a train or test transformation, if any Returns: - A transformed single point prediction + features from past, targets from past and future """ if index < 0: index = self.__len__() + 1 - index @@ -98,21 +99,26 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[Dict[str, torch.T elif self.val_transform is not None and not train: X, loc, scale = self.val_transform(X) else: - loc = 0.0 - scale = 1.0 + loc = None + scale = None # In case of prediction, the targets are not provided Y = self.Y if Y is not None: # Y = Y[:index + self.n_prediction_steps] # Y = Y[index + 1: index + self.n_prediction_steps + 1] - Y = Y[index + 1: index + self.n_prediction_steps + 1] + Y_future = Y[index + 1: index + self.n_prediction_steps + 1] - Y = torch.from_numpy(Y) + Y_future = torch.from_numpy(Y_future) + # Y_Past does not need to be fed to the network, we keep it as np array else: - Y = None + Y_future = None - return {"value": torch.from_numpy(X), "loc": torch.from_numpy(loc), "scale": torch.from_numpy(scale)}, Y + # TODO consider static information and missing information + return {"value": torch.from_numpy(X), + "loc": torch.from_numpy(loc) if loc is not None else loc, + "scale": torch.from_numpy(scale) if scale is not None else scale}, \ + Y_future def __len__(self) -> int: return self.X.shape[0] @@ -303,6 +309,11 @@ def __init__(self, tmp_freq = min([freq_value for freq_value in freq if freq_value > n_prediction_steps]) freq_value = tmp_freq + seasonality = SEASONALITY_MAP.get(freq, 1) + if isinstance(seasonality, list): + seasonality = min(seasonality) # Use to calculate MASE + self.seasonality = seasonality + self.freq: Optional[str] = freq self.freq_value: Optional[int] = freq_value @@ -543,6 +554,7 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> dataset_properties = super().get_dataset_properties(dataset_requirements=dataset_requirements) dataset_properties.update({'n_prediction_steps': self.n_prediction_steps, 'upper_window_size': self.upper_window_size, + 'sp': self.seasonality, # For metric compuation 'sequence_lengths_train': self.sequence_lengths_train}) return dataset_properties diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 5c7ec45d0..e789da39c 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -16,7 +16,7 @@ from smac.tae import StatusType from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric -from autoPyTorch.pipeline.components.training.metrics.metrics import MASE_LOSSES +from autoPyTorch.pipeline.components.training.metrics.metrics import MASE_LOSSES, compute_mase_coefficient from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.utils.common import subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -104,7 +104,7 @@ def fit_predict_and_loss(self) -> None: test_indices=test_split, add_pipeline_to_self=True) - mase_cofficient = self.compute_mase_coefficient(test_split) + mase_cofficient = self.generate_mase_coefficient_for_validation(test_split) forecasting_kwargs = {'sp': self.seasonality, 'n_prediction_steps': self.n_prediction_steps, @@ -152,7 +152,7 @@ def fit_predict_and_loss(self) -> None: mase_coefficient_all = [] for train_split, test_split in self.splits: - mase_coefficient = 
self.compute_mase_coefficient(test_split) + mase_coefficient = self.generate_mase_coefficient_for_validation(test_split) mase_coefficient_all.append(mase_coefficient) for i, (train_split, test_split) in enumerate(self.splits): @@ -273,7 +273,7 @@ def fit_predict_and_loss(self) -> None: status=status, ) - def compute_mase_coefficient(self, test_split: Sequence) -> np.ndarray: + def generate_mase_coefficient_for_validation(self, test_split: Sequence) -> np.ndarray: """ Compute the denominator for Mean Absolute Scaled Losses, For detail, please check sktime.performance_metrics.forecasting._functions.mean_absolute_scaled_error @@ -289,17 +289,13 @@ def compute_mase_coefficient(self, test_split: Sequence) -> np.ndarray: """ mase_coefficient = np.ones(len(test_split)) if any(mase_loss in self.additional_metrics for mase_loss in MASE_LOSSES) or self.metric in MASE_LOSSES: - from sktime.performance_metrics.forecasting._functions import EPS, mean_absolute_error for seq_idx, test_idx in enumerate(test_split): seq = self.datamanager[test_idx][0] if seq.shape[-1] > 1: seq = seq[self.datamanager.target_variables].squeeze() else: seq = seq.squeeze() - mase_denominator = mean_absolute_error(seq[self.seasonality:], - seq[:-self.seasonality], - multioutput="uniform_average") - mase_coefficient[seq_idx] = 1.0 / np.maximum(mase_denominator, EPS) + mase_coefficient[seq_idx] = compute_mase_coefficient(seq, self.seasonality) mase_coefficient = np.repeat(mase_coefficient, self.n_prediction_steps) return mase_coefficient diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index 97bfe8f76..4c1d51716 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -18,10 +18,11 @@ def fit(self, X: np.ndarray, y: Any = None) -> "TimeSeriesScaler": """ The transformer is transformed on the fly (for each batch) """ - # we assuem that the last two + # we assuem that the last two dimensions are [seq, features] if self.mode == "standard": self.loc = np.mean(X, axis=-2, keepdims=True) self.scale = np.std(X, axis=-2, keepdims=True) + self.scale[self.scale == 0.0] = 1.0 elif self.mode == "min_max": min_ = np.min(X, axis=-2, keepdims=True) @@ -30,16 +31,17 @@ def fit(self, X: np.ndarray, y: Any = None) -> "TimeSeriesScaler": diff_ = max_ - min_ self.loc = min_ self.scale = diff_ + self.scale[self.scale == 0.0] = 1.0 elif self.mode == "max_abs": max_abs_ = np.max(np.abs(X), axis=-2, keepdims=True) max_abs_[max_abs_ == 0.0] = 1.0 - self.loc = np.zeros_like(max_abs_) + self.loc = None self.scale = max_abs_ elif self.mode == "none": - self.loc = np.zeros([*X.shape[:-2], 1, X.shape[-1]]) - self.scale = np.ones([*X.shape[:-2], 1, X.shape[-1]]) + self.loc = None + self.scale = None else: raise ValueError(f"Unknown mode {self.mode} for time series scaler") return self @@ -56,6 +58,11 @@ def transform(self, X: np.ndarray) -> Tuple[np.ndarray, ...]: ) # type: np.ndarray """ - return (X - self.loc) / self.scale + if self.mode in ['standard', 'min_max']: + return (X - self.loc) / self.scale + elif self.mode == "max_abs": + return X / self.scale + else: + return X diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 
94c4c90bb..c5e0a2739 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -32,7 +32,7 @@ class TimeSeriesSampler(SubsetRandomSampler): def __init__(self, indices: Sequence[int], seq_lengths: Sequence[int], - num_instances_per_seqs: List[int], + num_instances_per_seqs: Optional[List[int]]=None, min_start: int = 0, generator: Optional[torch.Generator] = None) -> None: """ @@ -47,32 +47,38 @@ def __init__(self, The set of all the possible indices that can be sampled from seq_lengths: Sequence[int] lengths of each sequence, applied to unsqueeze indices - num_instances_per_seqs: List[int] - how many instances are sampled in each sequence + num_instances_per_seqs: OPtional[List[int]]=None + how many instances are sampled in each sequence, if it is None, all the sequences are sampled min_start: int the how many first instances we want to skip (the first few sequences need to be padded with 0) generator: Optional[torch.Generator] pytorch generator to control the randomness """ super(TimeSeriesSampler, self).__init__(indices, generator) - if len(seq_lengths) != len(num_instances_per_seqs): - raise ValueError(f'the lengths of seq_lengths must equal the lengths of num_instances_per_seqs.' - f'However, they are {len(seq_lengths)} versus {len(num_instances_per_seqs)}') - if np.sum(seq_lengths) != len(indices): - raise ValueError(f'the sum of sequence length must correspond to the number of indices. ' - f'However, they are {np.sum(seq_lengths)} versus {len(indices)}') - seq_intervals = [] - idx_tracker = 0 - for seq_idx, (num_instances, seq_length) in enumerate(zip(num_instances_per_seqs, seq_lengths)): - idx_end = idx_tracker + seq_length - idx_start = idx_tracker + min_start - interval = np.linspace(idx_start, idx_end, num_instances + 1, endpoint=True, dtype=np.int) - seq_intervals.append(interval) - self.seq_lengths = seq_lengths - self.num_instances = np.sum(num_instances_per_seqs) - self.seq_intervals = seq_intervals + if num_instances_per_seqs is None: + self.iter_all_seqs = True + else: + self.iter_all_seqs = False + if len(seq_lengths) != len(num_instances_per_seqs): + raise ValueError(f'the lengths of seq_lengths must equal the lengths of num_instances_per_seqs.' + f'However, they are {len(seq_lengths)} versus {len(num_instances_per_seqs)}') + if np.sum(seq_lengths) != len(indices): + raise ValueError(f'the sum of sequence length must correspond to the number of indices. 
' + f'However, they are {np.sum(seq_lengths)} versus {len(indices)}') + seq_intervals = [] + idx_tracker = 0 + for seq_idx, (num_instances, seq_length) in enumerate(zip(num_instances_per_seqs, seq_lengths)): + idx_end = idx_tracker + seq_length + idx_start = idx_tracker + min_start + interval = np.linspace(idx_start, idx_end, num_instances + 1, endpoint=True, dtype=np.int) + seq_intervals.append(interval) + self.seq_lengths = seq_lengths + self.num_instances = np.sum(num_instances_per_seqs) + self.seq_intervals = seq_intervals def __iter__(self): + if self.iter_all_seqs: + return super().__iter__() samples = torch.ones(self.num_instances, dtype=torch.int) idx_samples_start = 0 idx_seq_tracker = 0 @@ -245,15 +251,18 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: num_instances_dataset = np.size(train_split) num_instances_train = self.num_batches_per_epoch * self.batch_size - # get the length of each sequence of training data (after split) - # as we already know that the elements in 'train_split' increases consecutively with a certain number of - # discontinuity where a new sequence is sampled: [0, 1, 2 ,3, 7 ,8 ]. - # A new sequence must start from the index 7. We could then split each unique values to represent the length - # of each split - _, seq_train_length = np.unique(train_split - np.arange(len(train_split)), return_counts=True) - num_instances_per_seqs = np.ceil(num_instances_train / num_instances_dataset * seq_train_length) - num_instances_per_seqs = num_instances_per_seqs.astype(seq_train_length.dtype) - # at least one element of each sequence should be selected + if num_instances_train > np.sum(train_split): + num_instances_per_seqs = None + else: + # get the length of each sequence of training data (after split) + # as we already know that the elements in 'train_split' increases consecutively with a certain number of + # discontinuity where a new sequence is sampled: [0, 1, 2 ,3, 7 ,8 ]. + # A new sequence must start from the index 7. 
We could then split each unique values to represent the length + # of each split + _, seq_train_length = np.unique(train_split - np.arange(len(train_split)), return_counts=True) + num_instances_per_seqs = np.ceil(num_instances_train / num_instances_dataset * seq_train_length) + num_instances_per_seqs = num_instances_per_seqs.astype(seq_train_length.dtype) + # at least one element of each sequence should be selected # TODO consider the case where num_instances_train is greater than num_instances_dataset, # In which case we simply iterate through all the datasets @@ -410,7 +419,11 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, window_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='window_size', value_range=(20, 50), - default_value=30) + default_value=30), + num_batch_per_epoch: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="num_batches_per_epoch", + value_range=(30, 200), + default_value=100) ) -> ConfigurationSpace: cs = ConfigurationSpace() add_hyperparameter(cs, batch_size, UniformIntegerHyperparameter) diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index 2a4683604..ef614996f 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -1,5 +1,8 @@ from functools import partial +import numpy as np +from typing import List, Union + import sktime.performance_metrics.forecasting as forecasting_metrics import sklearn.metrics @@ -7,6 +10,8 @@ from autoPyTorch.pipeline.components.training.metrics.base import make_metric + + # Standard regression scores mean_absolute_error = make_metric('mean_absolute_error', sklearn.metrics.mean_absolute_error, @@ -49,8 +54,27 @@ # Standard Forecasting Scores + # To avoid storing unnecessary scale values here, we scale all the values under # AutoPytorch.evaluation.time_series_forecasting_train_evaluator + +def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> float: + """ + compute mase coefficient, then mase value is computed as mase_coefficient * mse_error, + this function aims at reducing the memroy requirement + Args: + past_target: Optional[List, np.ndarray] past target observations + sp: seasonality parameter to compute sp + + Returns: + mase_coefficient: inverse of mase_denominator + """ + mase_denominator = forecasting_metrics.mean_absolute_error(past_target[sp:], + past_target[:-sp], + multioutput="uniform_average") + return 1.0 / np.maximum(mase_denominator, forecasting_metrics._functions.EPS) + + mean_MASE_forecasting = make_metric('mean_MASE_forecasting', forecasting_metrics.mean_absolute_error, optimum=0, @@ -60,7 +84,7 @@ aggregation='mean', ) -median_MASE_forecasting = make_metric('median_absolute_scaled_error_forecasting', +median_MASE_forecasting = make_metric('median_MASE_forecasting', forecasting_metrics.mean_absolute_error, optimum=0, worst_possible_result=MAXINT, @@ -99,7 +123,7 @@ aggregation='mean', ) -median_MAE_forecasting = make_metric('median_absolute_error_forecasting', +median_MAE_forecasting = make_metric('median_MAE_forecasting', forecasting_metrics.mean_absolute_error, optimum=0, worst_possible_result=MAXINT, diff --git a/autoPyTorch/pipeline/components/training/metrics/utils.py b/autoPyTorch/pipeline/components/training/metrics/utils.py index ca0cfe4a6..2776f4d67 100644 --- a/autoPyTorch/pipeline/components/training/metrics/utils.py +++ 
b/autoPyTorch/pipeline/components/training/metrics/utils.py @@ -82,7 +82,10 @@ def get_metrics(dataset_properties: Dict[str, Any], 'binary': 'accuracy', 'multiclass-multioutput': 'f1'}), regression=dict({'continuous': 'r2', - 'continuous-multioutput': 'r2'})) + 'continuous-multioutput': 'r2'}), + forecasting=dict({'continuous': 'mean_MASE_forecasting', + 'continuous-multioutput': 'mean_MASE_forecasting'}) + ) supported_metrics = get_supported_metrics(dataset_properties) metrics: List[autoPyTorchMetric] = list() @@ -103,6 +106,8 @@ def get_metrics(dataset_properties: Dict[str, Any], metrics.append(supported_metrics[default_metrics['classification'][dataset_properties['output_type']]]) if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in REGRESSION_TASKS: metrics.append(supported_metrics[default_metrics['regression'][dataset_properties['output_type']]]) + if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in FORECASTING_TASKS: + metrics.append(supported_metrics[default_metrics['forecasting'][dataset_properties['output_type']]]) return metrics diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index e54006d10..7caeb5ee1 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -255,6 +255,7 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=X['additional_metrics'])) if 'optimize_metric' in X and X['optimize_metric'] not in [m.name for m in metrics]: metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=[X['optimize_metric']])) + additional_losses = X['additional_losses'] if 'additional_losses' in X else None self.choice.prepare( model=X['network'], @@ -268,7 +269,8 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic scheduler=X['lr_scheduler'], task_type=STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']], labels=X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]], - step_interval=X['step_interval'] + step_interval=X['step_interval'], + dataset_properties=X['dataset_properties'], ) total_parameter_count, trainable_parameter_count = self.count_parameters(X['network']) self.run_summary = RunSummary( diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py index 4d46a3fc0..2135a4a46 100644 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py @@ -205,7 +205,8 @@ def prepare( scheduler: _LRScheduler, task_type: int, labels: Union[np.ndarray, torch.Tensor, pd.DataFrame], - step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch + step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch, + **kwargs: Dict ) -> None: # Save the device to be used diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingMixUpTrainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingMixUpTrainer.py new file mode 100644 index 000000000..958f125f1 --- /dev/null +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingMixUpTrainer.py @@ -0,0 +1,17 @@ +from typing import Dict, Optional, Union + +from autoPyTorch.datasets.base_dataset import 
BaseDatasetPropertiesType + +from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer.forecasting_base_trainer import \ + ForecastingBaseTrainerComponent +from autoPyTorch.pipeline.components.training.trainer.MixUpTrainer import MixUpTrainer + + +class ForecastingMixUpTrainer(ForecastingBaseTrainerComponent, MixUpTrainer): + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'ForecastingMixUpTrainer', + 'name': 'MixUp Regularized Trainer', + } \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingStandardTrainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingStandardTrainer.py new file mode 100644 index 000000000..7f9baa0fc --- /dev/null +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingStandardTrainer.py @@ -0,0 +1,17 @@ +from typing import Dict, Optional, Union + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType + +from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer.forecasting_base_trainer import \ + ForecastingBaseTrainerComponent +from autoPyTorch.pipeline.components.training.trainer.StandardTrainer import StandardTrainer + + +class ForecastingStandardTrainer(ForecastingBaseTrainerComponent, StandardTrainer): + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'ForecastingStandardTrainer', + 'name': 'Forecasting Standard Trainer', + } \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py new file mode 100644 index 000000000..9ae1e7f9e --- /dev/null +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -0,0 +1,48 @@ +import collections +import os + +from typing import Any, Dict, List, Optional, Tuple, cast + + +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) + +from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer.forecasting_base_trainer import ( + ForecastingBaseTrainerComponent, +) + + +trainer_directory = os.path.split(__file__)[0] +_trainers = find_components(__package__, + trainer_directory, + ForecastingBaseTrainerComponent) +_addons = ThirdPartyComponents(ForecastingBaseTrainerComponent) + + +def add_trainer(trainer: ForecastingBaseTrainerComponent) -> None: + _addons.add_component(trainer) + + +from autoPyTorch.pipeline.components.training.trainer import TrainerChoice + + +class ForecastingTrainerChoice(TrainerChoice): + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available trainer components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all components available + as choices for learning rate scheduling + """ + components: Dict[str, autoPyTorchComponent] = collections.OrderedDict() + components.update(_trainers) + components.update(_addons.components) + return components + + diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py new file mode 100644 
index 000000000..d3298d8f1
--- /dev/null
+++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py
@@ -0,0 +1,234 @@
+from abc import ABC
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
+
+import warnings
+import numpy as np
+
+import pandas as pd
+
+import torch
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
+from torch.utils.tensorboard.writer import SummaryWriter
+
+from torch.distributions import (
+    AffineTransform,
+    TransformedDistribution,
+)
+
+from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit
+from autoPyTorch.pipeline.components.training.metrics.metrics import MASE_LOSSES
+from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score
+
+from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker
+
+
+class ForecastingBaseTrainerComponent(BaseTrainerComponent, ABC):
+    def prepare(
+        self,
+        metrics: List[Any],
+        model: torch.nn.Module,
+        criterion: Type[torch.nn.Module],
+        budget_tracker: BudgetTracker,
+        optimizer: Optimizer,
+        device: torch.device,
+        metrics_during_training: bool,
+        scheduler: _LRScheduler,
+        task_type: int,
+        labels: Union[np.ndarray, torch.Tensor, pd.DataFrame],
+        step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch,
+        dataset_properties: Optional[Dict]=None
+    ) -> None:
+        for metric in metrics:
+            if metric in MASE_LOSSES:
+                warnings.warn("MASE Losses are unsupported for trainer!")
+                metrics.remove(metric)
+
+        super().prepare(metrics=metrics,
+                        model=model,
+                        criterion=criterion,
+                        budget_tracker=budget_tracker,
+                        optimizer=optimizer,
+                        device=device,
+                        metrics_during_training=metrics_during_training,
+                        scheduler=scheduler,
+                        task_type=task_type,
+                        labels=labels,
+                        step_interval=step_interval
+                        )
+        metric_kwargs = {"sp": dataset_properties.get("sp", 1),
+                         "n_prediction_steps": dataset_properties.get("n_prediction_steps", 1)}
+        self.metrics_kwargs = metric_kwargs
+
+    def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int,
+                    writer: Optional[SummaryWriter],
+                    ) -> Tuple[float, Dict[str, float]]:
+        """
+        Train the model for a single epoch.
+
+        Args:
+            train_loader (torch.utils.data.DataLoader): generator of features/label
+            epoch (int): The current epoch used solely for tracking purposes
+
+        Returns:
+            float: training loss
+            Dict[str, float]: scores for each desired metric
+        """
+
+        loss_sum = 0.0
+        N = 0
+        self.model.train()
+        outputs_data = list()
+        targets_data = list()
+
+        for step, (data, targets) in enumerate(train_loader):
+            if self.budget_tracker.is_max_time_reached():
+                break
+
+            loss, outputs = self.train_step(data, targets)
+
+            if self.metrics_during_training:
+                # save for metric evaluation
+                outputs_data.append(outputs.detach().cpu())
+                targets_data.append(targets.detach().cpu())
+
+            batch_size = data["value"].size(0)
+            loss_sum += loss * batch_size
+            N += batch_size
+
+            if writer:
+                writer.add_scalar(
+                    'Train/loss',
+                    loss,
+                    epoch * len(train_loader) + step,
+                )
+
+        self._scheduler_step(step_interval=StepIntervalUnit.epoch, loss=loss_sum / N)
+
+        if self.metrics_during_training:
+            return loss_sum / N, self.compute_metrics(outputs_data, targets_data)
+        else:
+            return loss_sum / N, {}
+
+    def rescale_output_distribution(self,
+                                    outputs: torch.distributions.Distribution,
+                                    loc: Optional[torch.Tensor],
+                                    scale: Optional[torch.Tensor]):
+        # https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/torch/modules/distribution_output.py
+        if loc is not None or scale is not None:
+            transform = AffineTransform(loc=0.0 if loc is None else loc,
+                                        scale=1.0 if scale is None else scale,
+                                        )
+            outputs = TransformedDistribution(outputs, [transform])
+        return outputs
+
+    def train_step(self, data: Dict[str, torch.Tensor], targets: Dict[str, Union[torch.Tensor, np.ndarray]])\
+            -> Tuple[float, torch.Tensor]:
+        """
+        Allows to train 1 step of gradient descent, given a batch of train/labels
+
+        Args:
+            data (torch.Tensor): input features to the network
+            targets (torch.Tensor): ground truth to calculate loss
+
+        Returns:
+            torch.Tensor: The predictions of the network
+            float: the loss incurred in the prediction
+        """
+        X = data['value']
+        loc = data['loc']
+        scale = data['scale']
+
+        # prepare
+        X = X.float().to(self.device)
+
+        targets = self.cast_targets(targets)
+
+        X, criterion_kwargs = self.data_preparation(X, targets)
+
+        # training
+        self.optimizer.zero_grad()
+        outputs = self.model(X)
+
+        outputs = self.rescale_output_distribution(outputs, loc=loc, scale=scale)
+
+        loss_func = self.criterion_preparation(**criterion_kwargs)
+        loss = loss_func(self.criterion, outputs)
+        loss.backward()
+        self.optimizer.step()
+        self._scheduler_step(step_interval=StepIntervalUnit.batch, loss=loss.item())
+
+        return loss.item(), outputs
+
+    def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int,
+                 writer: Optional[SummaryWriter],
+                 ) -> Tuple[float, Dict[str, float]]:
+        """
+        Evaluate the model in both metrics and criterion
+
+        Args:
+            test_loader (torch.utils.data.DataLoader): generator of features/label
+            epoch (int): the current epoch for tracking purposes
+
+        Returns:
+            float: test loss
+            Dict[str, float]: scores for each desired metric
+        """
+        self.model.eval()
+
+        loss_sum = 0.0
+        N = 0
+        outputs_data = list()
+        targets_data = list()
+
+        with torch.no_grad():
+            for step, (data, targets) in enumerate(test_loader):
+                X = data['value']
+                loc = data['loc']
+                scale = data['scale']
+
+                batch_size = X.shape[0]
+
+                # prepare
+                X = X.float().to(self.device)
+
+                targets = self.cast_targets(targets)
+
+                X, criterion_kwargs = self.data_preparation(X, targets)
+
+                outputs = self.model(X)
+
+                outputs = 
self.rescale_output_distribution(outputs, loc=loc, scale=scale) + + loss = self.criterion(outputs, targets) + + loss_sum += loss.item() * batch_size + N += batch_size + if loc is None and scale is None: + outputs_data.append(outputs.mean.detach().cpu()) + else: + if loc is None: + loc = 0. + if scale is None: + scale = 1. + outputs_data.append(outputs.base_dist.mean * scale + loc) + targets_data.append(targets.detach().cpu()) + + if writer: + writer.add_scalar( + 'Val/loss', + loss.item(), + epoch * len(test_loader) + step, + ) + + self._scheduler_step(step_interval=StepIntervalUnit.valid, loss=loss_sum / N) + + self.model.train() + return loss_sum / N, self.compute_metrics(outputs_data, targets_data) + + def compute_metrics(self, outputs_data: List[torch.Tensor], targets_data: List[torch.Tensor] + ) -> Dict[str, float]: + # TODO: change once Ravin Provides the PR + outputs_data = torch.cat(outputs_data, dim=0).numpy() + targets_data = torch.cat(targets_data, dim=0).numpy() + return calculate_score(targets_data, outputs_data, self.task_type, self.metrics, **self.metrics_kwargs) diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 612681b22..f02e46739 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -32,7 +32,7 @@ from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import \ TimeSeriesForecastingDataLoader -from autoPyTorch.pipeline.components.training.trainer import TrainerChoice +from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer import ForecastingTrainerChoice from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -218,7 +218,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L random_state=self.random_state)), ("lr_scheduler", SchedulerChoice(default_dataset_properties, random_state=self.random_state)), - ("trainer", TrainerChoice(default_dataset_properties, random_state=self.random_state)), + ("trainer", ForecastingTrainerChoice(default_dataset_properties, random_state=self.random_state)), ]) return steps From 11737f66203c2b96c31e39e5a4bd4d3ab25f47fc Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 3 Dec 2021 22:45:08 +0100 Subject: [PATCH 074/347] forecasting network --- .../setup/network/forecasting_network.py | 65 +++++++++++++++++++ .../network_backbone/TimeSeriesMLPBackbone.py | 12 ++-- .../distributed_network_head.py | 4 ++ .../forecasting_base_trainer.py | 2 +- .../pipeline/time_series_forecasting.py | 4 +- 5 files changed, 78 insertions(+), 9 deletions(-) create mode 100644 autoPyTorch/pipeline/components/setup/network/forecasting_network.py diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py new file mode 100644 index 000000000..c4c9eceb4 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -0,0 +1,65 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +import torch +from torch import nn + +from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.training.base_training import 
autoPyTorchTrainingComponent +from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary +from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent + + +class ForecastingNetworkComponent(NetworkComponent): + def __init__( + self, + network: Optional[torch.nn.Module] = None, + random_state: Optional[np.random.RandomState] = None, + auto_regressive: Optional[bool] = False, + ) -> None: + super(ForecastingNetworkComponent, self).__init__(network=network, random_state=random_state) + self.auto_regressive = auto_regressive + + def predict(self, loader: torch.utils.data.DataLoader) -> torch.Tensor: + """ + Performs batched prediction given a loader object + """ + assert self.network is not None + self.network.eval() + + # Batch prediction + Y_batch_preds = list() + + for i, (X_batch, Y_batch) in enumerate(loader): + # Predict on batch + X = X_batch['value'] + loc = X_batch['loc'] + scale = X_batch['scale'] + + X = X.float().to(self.device) + + with torch.no_grad(): + Y_batch_pred = self.network(X).mean + if loc is not None or scale is not None: + if loc is None: + loc = 0. + if scale is None: + scale = 1. + Y_batch_pred = Y_batch_pred * scale + loc + + Y_batch_preds.append(Y_batch_pred.cpu()) + + return torch.cat(Y_batch_preds, 0).cpu().numpy() + + + @staticmethod + def get_hyperparameter_search_space(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + **kwargs: Any + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + return cs + diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/TimeSeriesMLPBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/TimeSeriesMLPBackbone.py index c9673f3d2..f55e5a184 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/TimeSeriesMLPBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/TimeSeriesMLPBackbone.py @@ -17,12 +17,11 @@ class _TimeSeriesMLP(nn.Module): def __init__(self, module_layers: nn.Module, ): - self.module_layers = module_layers super().__init__() + self.module_layers = module_layers def forward(self, x: torch.Tensor): - # https://discuss.pytorch.org/t/how-could-i-flatten-two-dimensions-of-a-tensor/44570/4 - x = x.view(-1, *x.shape[2:]) + x = x.view(x.shape[0], -1) return self.module_layers(x) @@ -33,16 +32,17 @@ class TimeSeriesMLPBackbone(MLPBackbone): @property def _required_fit_arguments(self) -> List[FitRequirement]: requirements_list = super()._required_fit_arguments - requirements_list.append(FitRequirement('window_size', (str,), user_defined=False, dataset_property=False)) + requirements_list.append(FitRequirement('window_size', (int,), user_defined=False, dataset_property=False)) return requirements_list def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.window_size = X["window_size"] + X['MLP_backbone'] = True return super().fit(X, y) def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: - in_features = input_shape[0] * self.window_size - return self._build_backbone(in_features) + in_features = input_shape[-1] * self.window_size + return _TimeSeriesMLP(self._build_backbone(in_features)) @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None diff --git a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_network_head.py b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_network_head.py index 3ff766a23..200785f8b 100644 --- 
a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_network_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_network_head.py @@ -30,6 +30,7 @@ def __init__(self, FitRequirement('train_with_log_prob', (str,), user_defined=True, dataset_property=True), FitRequirement('n_prediction_steps', (int,), user_defined=True, dataset_property=True), FitRequirement('output_shape', (Iterable, int), user_defined=True, dataset_property=True), + FitRequirement('window_size', (int,), user_defined=False, dataset_property=False) ]) self.head: Optional[nn.Module] = None self.config = kwargs @@ -52,6 +53,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: # TODO consider Auto-regressive model on vanilla network head if auto_regressive: output_shape[0] = 1 + mlp_backbone = X.get("MLP_backbone", False) + if mlp_backbone: + input_shape = (X["window_size"], input_shape[-1]) self.head = self.build_head( input_shape=get_output_shape(X['network_backbone'], input_shape=input_shape), output_shape=output_shape, diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index d3298d8f1..39a79a8c3 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -41,7 +41,7 @@ def prepare( ) -> None: for metric in metrics: if metric in MASE_LOSSES: - warnings.warn("MASE Losses are unsupported for trainer!") + warnings.warn("MASE Losses are not supported for trainer! We remove them here") metrics.remove(metric) super().prepare(metrics=metrics, diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index f02e46739..d01bffdfa 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -23,7 +23,7 @@ from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import ScalerChoice from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice -from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent +from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice from autoPyTorch.pipeline.components.setup.network_head import NetworkHeadChoice from autoPyTorch.pipeline.components.setup.network_initializer import ( @@ -211,7 +211,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L random_state=self.random_state)), ("network_head", NetworkHeadChoice(default_dataset_properties, random_state=self.random_state)), - ("network", NetworkComponent(random_state=self.random_state)), + ("network", ForecastingNetworkComponent(random_state=self.random_state)), ("network_init", NetworkInitializerChoice(default_dataset_properties, random_state=self.random_state)), ("optimizer", OptimizerChoice(default_dataset_properties, From 3f8cf1aa13cf095660abfd5f03d37d0939e90fe3 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 6 Dec 2021 20:44:15 +0100 Subject: [PATCH 075/347] target 
scaler --- .../data/time_series_forecasting_validator.py | 2 + autoPyTorch/datasets/time_series_dataset.py | 95 +++++++++------ ...time_series_forecasting_train_evaluator.py | 15 ++- .../TimeSeriesTransformer.py | 13 +- .../TargetMaxAbsScaler.py | 17 +++ .../TargetMinMaxScaler.py | 17 +++ .../TargetNoScaler.py | 17 +++ .../TargetStandardScaler.py | 17 +++ .../forecasting_target_scaling/__init__.py | 115 ++++++++++++++++++ .../base_target_scaler.py | 69 +++++++++++ .../forecasting_target_scaling/utils.py | 47 +++++++ .../setup/network/forecasting_network.py | 24 ++-- .../distributed_network_head/distribution.py | 10 +- .../time_series_forecasting_data_loader.py | 4 +- .../components/training/metrics/base.py | 33 +---- .../components/training/metrics/metrics.py | 21 +--- .../components/training/metrics/utils.py | 2 +- .../components/training/trainer/__init__.py | 87 +++++++------ .../training/trainer/base_trainer.py | 14 ++- .../trainer/forecasting_trainer/__init__.py | 51 +++++++- .../forecasting_base_trainer.py | 61 ++++++---- .../pipeline/time_series_forecasting.py | 17 ++- 22 files changed, 554 insertions(+), 194 deletions(-) create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMaxAbsScaler.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMinMaxScaler.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetNoScaler.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetStandardScaler.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/__init__.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index a6af528a3..04469f92a 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -113,6 +113,8 @@ def transform( X_transformed = self.feature_validator.transform(X_flat) y_transformed = self.target_validator.transform(y_flat) + if y_transformed.ndim == 1: + y_transformed = np.expand_dims(y_transformed, -1) return X_transformed, sequence_lengths, y_transformed num_train_data = np.sum(sequence_lengths) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 2c4418cb3..b4915a4d4 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -33,6 +33,8 @@ TimeSeriesTransformer from autoPyTorch.utils.common import FitRequirement from autoPyTorch.constants_forecasting import SEASONALITY_MAP +from autoPyTorch.pipeline.components.training.metrics.metrics import compute_mase_coefficient + TIME_SERIES_FORECASTING_INPUT = Tuple[np.ndarray, np.ndarray] # currently only numpy arrays are supported TIME_SERIES_REGRESSION_INPUT = Tuple[np.ndarray, np.ndarray] @@ -48,6 +50,7 @@ def __init__(self, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, n_prediction_steps: int = 
1, + sp: int = 1, ): """ A dataset representing a time series sequence. @@ -72,6 +75,7 @@ def __init__(self, # or for augmentation self.train_transform = train_transforms self.val_transform = val_transforms + self.sp = sp def __getitem__(self, index: int, train: bool = True) \ -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, torch.Tensor]]]: @@ -94,10 +98,13 @@ def __getitem__(self, index: int, train: bool = True) \ else: X = self.X[:index + 1] + if not train: + mase_coefficient = compute_mase_coefficient(X, sp=self.sp) + if self.train_transform is not None and train: - X, loc, scale = self.train_transform(X) + X = self.train_transform(X) elif self.val_transform is not None and not train: - X, loc, scale = self.val_transform(X) + X = self.val_transform(X) else: loc = None scale = None @@ -113,12 +120,12 @@ def __getitem__(self, index: int, train: bool = True) \ # Y_Past does not need to be fed to the network, we keep it as np array else: Y_future = None - - # TODO consider static information and missing information - return {"value": torch.from_numpy(X), - "loc": torch.from_numpy(loc) if loc is not None else loc, - "scale": torch.from_numpy(scale) if scale is not None else scale}, \ - Y_future + if train: + # TODO consider static information and missing information + return {"past_target": torch.from_numpy(X)}, Y_future + else: + return {"past_target": torch.from_numpy(X), + "mase_coefficient": mase_coefficient}, Y_future def __len__(self) -> int: return self.X.shape[0] @@ -157,7 +164,7 @@ def __init__(self, Y: Union[np.ndarray, pd.Series], X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, - target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, + target_variables: Optional[Union[Tuple[int], int]] = None, freq: Optional[Union[str, int, List[int]]] = None, resampling_strategy: Union[ CrossValTypes, HoldoutValTypes] = HoldoutValTypes.time_series_hold_out_validation, @@ -174,8 +181,9 @@ def __init__(self, train_with_log_prob: bool = True, ): """ - :param target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] used for multi-variant forecasting - tasks, the target_variables indicates which values in X corresponds to Y + :param target_variables: Optional[Union[Tuple[int], int]] used for multi-variant forecasting + tasks, the target_variables indicates which values in X corresponds to Y. + TODO add supports on X for pandas and target variables can be str or Tuple[str] :param freq: Optional[Union[str, int]] frequency of the series sequences, used to determine the (possible) period :param n_prediction_steps: The number of steps you want to forecast into the future @@ -190,6 +198,28 @@ def __init__(self, """ assert X is not Y, "Training and Test data needs to belong two different object!!!" 
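Note (not part of the patch): as context for the mase_coefficient that __getitem__ now returns for validation samples, the sketch below illustrates the idea behind compute_mase_coefficient under simplifying assumptions (an equally spaced 2-d series of shape [time, num_targets]; the function name and the eps guard are illustrative, the patch itself uses sktime's forecasting_metrics and its EPS constant). The coefficient is the inverse of the in-sample MAE of the seasonal-naive forecast with seasonality sp, so multiplying absolute forecast errors by it yields scaled (MASE-style) errors.

import numpy as np

def mase_coefficient_sketch(past_target: np.ndarray, sp: int, eps: float = 1e-10) -> np.ndarray:
    # in-sample MAE of the seasonal-naive forecast, one value per target column
    naive_mae = np.abs(past_target[sp:] - past_target[:-sp]).mean(axis=0)
    # guard against constant series so the coefficient stays finite
    return 1.0 / np.maximum(naive_mae, eps)
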
+ if freq is None: + self.freq = None + self.freq_value = None + + if isinstance(freq, str): + if freq not in SEASONALITY_MAP: + Warning("The given freq name is not supported by our dataset, we will use the default " + "configuration space on the hyperparameter window_size, if you want to adapt this value" + "you could pass freq with a numerical value") + freq_value = SEASONALITY_MAP.get(freq, None) + if isinstance(freq, list): + tmp_freq = min([freq_value for freq_value in freq if freq_value > n_prediction_steps]) + freq_value = tmp_freq + + seasonality = SEASONALITY_MAP.get(freq, 1) + if isinstance(seasonality, list): + seasonality = min(seasonality) # Use to calculate MASE + self.seasonality = seasonality + + self.freq: Optional[str] = freq + self.freq_value: Optional[int] = freq_value + self.dataset_name = dataset_name if self.dataset_name is None: @@ -215,6 +245,14 @@ def __init__(self, self.shift_input_data = shift_input_data self.target_variables = target_variables + if target_variables is None: + if self.num_target != 1: + raise ValueError("target_variables must be specified if more the input has more than one feature value") + self.target_columns = (0, ) # to keep the output dimension unchanged + elif isinstance(target_variables, int): + self.target_columns = (target_variables, ) + else: + self.target_columns = target_variables X, sequence_lengths, Y = self.validator.transform(X, Y, shift_input_data=shift_input_data, @@ -222,7 +260,8 @@ def __init__(self, if X_test is not None: X_test, self.sequence_lengths_tests, Y_test = self.validator.transform(X_test, Y_test, shift_input_data=shift_input_data, - n_prediction_steps=n_prediction_steps) + n_prediction_steps=n_prediction_steps + ) else: self.sequence_lengths_tests = None @@ -243,7 +282,8 @@ def __init__(self, # initialize datasets sequences_kwargs = {"train_transforms": self.train_transform, "val_transforms": self.val_transform, - "n_prediction_steps": n_prediction_steps} + "n_prediction_steps": n_prediction_steps, + "sp": self.seasonality} self.y_train_mean = [0] * len(self.sequence_lengths_train) self.y_train_std = [1] * len(self.sequence_lengths_train) @@ -284,7 +324,7 @@ def __init__(self, self.output_shape = [self.n_prediction_steps, num_target] # TODO: Look for a criteria to define small enough to preprocess - self.is_small_preprocess = False + self.is_small_preprocess = True self.task_type = TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING] @@ -295,28 +335,6 @@ def __init__(self, self.splits = self.get_splits_from_resampling_strategy() - if freq is None: - self.freq = None - self.freq_value = None - - if isinstance(freq, str): - if freq not in SEASONALITY_MAP: - Warning("The given freq name is not supported by our dataset, we will use the default " - "configuration space on the hyperparameter window_size, if you want to adapt this value" - "you could pass freq with a numerical value") - freq_value = SEASONALITY_MAP.get(freq, None) - if isinstance(freq, list): - tmp_freq = min([freq_value for freq_value in freq if freq_value > n_prediction_steps]) - freq_value = tmp_freq - - seasonality = SEASONALITY_MAP.get(freq, 1) - if isinstance(seasonality, list): - seasonality = min(seasonality) # Use to calculate MASE - self.seasonality = seasonality - - self.freq: Optional[str] = freq - self.freq_value: Optional[int] = freq_value - # TODO in the future, if training losses types are considered as a type of hyperparameters, we need to remove # this line and create conditional configspace under # 
autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice . @@ -546,7 +564,8 @@ def get_required_dataset_info(self) -> Dict[str, Any]: 'numerical_columns': self.numerical_columns, 'categorical_columns': self.categorical_columns, 'upper_window_size': self.upper_window_size, - 'train_with_log_prob': self.train_with_log_prob + 'train_with_log_prob': self.train_with_log_prob, + 'target_columns': self.target_columns }) return info @@ -554,7 +573,7 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> dataset_properties = super().get_dataset_properties(dataset_requirements=dataset_requirements) dataset_properties.update({'n_prediction_steps': self.n_prediction_steps, 'upper_window_size': self.upper_window_size, - 'sp': self.seasonality, # For metric compuation + 'sp': self.seasonality, # For metric computation 'sequence_lengths_train': self.sequence_lengths_train}) return dataset_properties diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index e789da39c..6cc1f6f2e 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -66,6 +66,7 @@ def __init__(self, backend: Backend, queue: Queue, self.datamanager: TimeSeriesForecastingDataset self.n_prediction_steps = self.datamanager.n_prediction_steps self.num_sequences = self.datamanager.num_sequences + self.num_targets = self.datamanager.num_target self.seq_length_min = np.min(self.num_sequences) seasonality = SEASONALITY_MAP.get(self.datamanager.freq, 1) if isinstance(seasonality, list): @@ -124,7 +125,7 @@ def fit_predict_and_loss(self) -> None: self.finish_up( loss=loss, train_loss=train_loss, - opt_pred=y_opt_pred.flatten() * mase_cofficient, + opt_pred=y_opt_pred * mase_cofficient, valid_pred=y_valid_pred, test_pred=y_test_pred, additional_run_info=additional_run_info, @@ -287,16 +288,17 @@ def generate_mase_coefficient_for_validation(self, test_split: Sequence) -> np.n mase_coefficient: np.ndarray(self.num_sequence * self.n_prediction_steps) inverse of the mase_denominator """ - mase_coefficient = np.ones(len(test_split)) + mase_coefficient = np.ones([len(test_split), self.num_targets]) if any(mase_loss in self.additional_metrics for mase_loss in MASE_LOSSES) or self.metric in MASE_LOSSES: for seq_idx, test_idx in enumerate(test_split): - seq = self.datamanager[test_idx][0] + seq = self.datamanager[test_idx][0]['past_target'] if seq.shape[-1] > 1: seq = seq[self.datamanager.target_variables].squeeze() else: seq = seq.squeeze() mase_coefficient[seq_idx] = compute_mase_coefficient(seq, self.seasonality) - mase_coefficient = np.repeat(mase_coefficient, self.n_prediction_steps) + + mase_coefficient = np.repeat(mase_coefficient, self.n_prediction_steps, axis=0) return mase_coefficient def _predict(self, pipeline: BaseEstimator, @@ -304,9 +306,10 @@ def _predict(self, pipeline: BaseEstimator, test_indices: Union[np.ndarray, List], ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: # TODO consider multile outputs - opt_pred = np.ones([len(test_indices), self.n_prediction_steps]) + opt_pred = np.ones([len(test_indices), self.n_prediction_steps, self.num_targets]) for seq_idx, test_idx in enumerate(test_indices): - opt_pred[seq_idx] = self.predict_function(self.datamanager[test_idx][0], pipeline).squeeze() + opt_pred[seq_idx] = 
self.predict_function(self.datamanager[test_idx][0]['past_target'], pipeline) + opt_pred = opt_pred.reshape(-1, self.num_targets) #TODO we consider X_valid and X_test as a multiple sequences??? if self.X_valid is not None: diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py index 46d24518e..aa3c05007 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py @@ -22,8 +22,6 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N self.add_fit_requirements([ FitRequirement('numerical_features', (List,), user_defined=True, dataset_property=True), FitRequirement('categorical_features', (List,), user_defined=True, dataset_property=True)]) - self.loc = 0. - self.scale = 1. def fit(self, X: Dict[str, Any], y: Any = None) -> "TimeSeriesTransformer": """ @@ -53,6 +51,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TimeSeriesTransformer": remainder='passthrough' ) + """ # Where to get the data -- Prioritize X_train if any else # get from backend @@ -61,6 +60,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TimeSeriesTransformer": else: X_train = X['backend'].load_datamanager().train_tensors[0] """ + X_train = X['backend'].load_datamanager().train_tensors[0] + self.preprocessor.fit(X_train) return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @@ -80,14 +81,8 @@ def __call__(self, X: Union[np.ndarray, torch.tensor]) -> Union[np.ndarray, torc if self.preprocessor is None: raise ValueError("cant call {} without fitting the column transformer first." .format(self.__class__.__name__)) - self.preprocessor.fit(X) #if len(X.shape) == 2: # # expand batch dimension when called on a single record # X = X[np.newaxis, ...] 
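Note (not part of the patch): a small shape check, with toy sizes assumed for illustration, of the evaluator bookkeeping above. Repeating the per-sequence MASE coefficients along the forecasting horizon keeps them element-wise aligned with the predictions after they are flattened to (n_sequences * n_prediction_steps, num_targets).

import numpy as np

n_seq, horizon, n_targets = 3, 5, 2
opt_pred = np.ones((n_seq, horizon, n_targets))
mase_coefficient = np.ones((n_seq, n_targets))

opt_pred_flat = opt_pred.reshape(-1, n_targets)           # (15, 2), horizon blocks per sequence
mase_flat = np.repeat(mase_coefficient, horizon, axis=0)   # (15, 2), same block layout
assert opt_pred_flat.shape == mase_flat.shape
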
- - scaler = self.preprocessor.named_transformers_['numerical_pipeline']['timeseriesscaler'] - loc = scaler.loc - scale = scaler.scale - - return self.preprocessor.transform(X), loc, scale + return self.preprocessor.transform(X) diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMaxAbsScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMaxAbsScaler.py new file mode 100644 index 000000000..1751fce32 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMaxAbsScaler.py @@ -0,0 +1,17 @@ +from typing import Any, Dict, Optional, Union + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.\ + forecasting_target_scaling.base_target_scaler import BaseTargetScaler + + +class TargetMaxAbsScaler(BaseTargetScaler): + @property + def scaler_mode(self): + return 'max_abs' + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'TargetMaxAbsScaler', + 'name': 'TargetMaxAbsScaler' + } diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMinMaxScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMinMaxScaler.py new file mode 100644 index 000000000..1aaf95762 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMinMaxScaler.py @@ -0,0 +1,17 @@ +from typing import Any, Dict, Optional, Union + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.\ + forecasting_target_scaling.base_target_scaler import BaseTargetScaler + + +class TargetMinMaxScaler(BaseTargetScaler): + @property + def scaler_mode(self): + return 'min_max' + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'TargetMinMaxScaler', + 'name': 'TargetMinMaxScaler' + } diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetNoScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetNoScaler.py new file mode 100644 index 000000000..6eb6332f6 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetNoScaler.py @@ -0,0 +1,17 @@ +from typing import Any, Dict, Optional, Union + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.\ + forecasting_target_scaling.base_target_scaler import BaseTargetScaler + + +class TargetNoScaler(BaseTargetScaler): + @property + def scaler_mode(self): + return 'none' + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'TargetNoScaler', + 'name': 'TargetNoScaler' + } diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetStandardScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetStandardScaler.py new file mode 100644 index 000000000..57dcda878 --- /dev/null +++ 
b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetStandardScaler.py @@ -0,0 +1,17 @@ +from typing import Any, Dict, Optional, Union + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.\ + forecasting_target_scaling.base_target_scaler import BaseTargetScaler + + +class TargetStandardScaler(BaseTargetScaler): + @property + def scaler_mode(self): + return 'standard' + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'TargetStandardScaler', + 'name': 'TargetStandardScaler' + } \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/__init__.py new file mode 100644 index 000000000..a40da3792 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/__init__.py @@ -0,0 +1,115 @@ +import os +from collections import OrderedDict +from typing import Any, Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ + ScalerChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling.\ + base_target_scaler import BaseTargetScaler + +scaling_directory = os.path.split(__file__)[0] +_scalers = find_components(__package__, + scaling_directory, + BaseTargetScaler) + +_addons = ThirdPartyComponents(BaseTargetScaler) + + +def add_scaler(scaler: BaseTargetScaler) -> None: + _addons.add_component(scaler) + + +class TargetScalerChoice(ScalerChoice): + """ + Allows for dynamically choosing scaling component at runtime, not + """ + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available scaler components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all BaseScalers components available + as choices for scaling + """ + components = OrderedDict() + components.update(_scalers) + components.update(_addons.components) + return components + + def get_hyperparameter_search_space(self, + dataset_properties: Optional[Dict[str, Any]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None) -> ConfigurationSpace: + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = dict() + + dataset_properties = {**self.dataset_properties, **dataset_properties} + + available_scalers = self.get_available_components(dataset_properties=dataset_properties, + include=include, + exclude=exclude) + + if len(available_scalers) == 0: + raise ValueError("no scalers found, please add a scaler") + + if default is None: + defaults = ['StandardScaler', 'MinMaxScaler', 'MaxAbsScaler', 'NoScaler'] + for default_ in defaults: + if default_ in available_scalers: + default = default_ + break + + # add only no scaler to choice hyperparameters in case the dataset is only categorical + if len(dataset_properties['numerical_features']) == 0: + default = 'NoScaler' + if include is not None and default not in include: + raise 
ValueError("Provided {} in include, however, " + "the dataset is incompatible with it".format(include)) + preprocessor = CSH.CategoricalHyperparameter('__choice__', + ['NoScaler'], + default_value=default) + else: + preprocessor = CSH.CategoricalHyperparameter('__choice__', + list(available_scalers.keys()), + default_value=default) + cs.add_hyperparameter(preprocessor) + + # add only child hyperparameters of early_preprocessor choices + for name in preprocessor.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_scalers[name].get_hyperparameter_search_space(dataset_properties, # type:ignore + **updates) + parent_hyperparameter = {'parent': preprocessor, 'value': name} + cs.add_configuration_space(name, config_space, + parent_hyperparameter=parent_hyperparameter) + + self.configuration_space = cs + self.dataset_properties = dataset_properties + return cs + + def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None: + """ + A mechanism in code to ensure the correctness of the fit dictionary + It recursively makes sure that the children and parent level requirements + are honored before fit. + Args: + dataset_properties (Dict[str, Any]): dictionary holding the dataset properties + + """ + super()._check_dataset_properties(dataset_properties) + assert "target_columns" in dataset_properties, \ + "Dataset properties must contain information about the type of target_columns" diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py new file mode 100644 index 000000000..b7f2d8349 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py @@ -0,0 +1,69 @@ +from typing import Any, Dict, List, Optional, Union, Tuple + +import numpy as np + +from sklearn.pipeline import Pipeline, make_pipeline +#from sktime.transformations.panel.compose import ColumnTransformer +from sklearn.compose import ColumnTransformer + +import torch + +from autoPyTorch.utils.common import FitRequirement, subsampler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( + autoPyTorchTimeSeriesPreprocessingComponent +) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling\ + .utils import TargetScaler + + +class BaseTargetScaler(autoPyTorchTimeSeriesPreprocessingComponent): + def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): + super().__init__() + self.random_state = random_state + self.preprocessor: Optional[Pipeline] = None + self.add_fit_requirements([ + FitRequirement('target_columns', (Tuple,), user_defined=True, dataset_property=True), + ]) + + def fit(self, X: Dict[str, Any], y: Any = None) -> "BaseBatchScaler": + """ + Creates a column transformer for the chosen tabular + preprocessors + Args: + X (Dict[str, Any]): fit dictionary + + Returns: + "TabularColumnTransformer": an instance of self + """ + self.check_requirements(X, y) + self.target_columns = X['dataset_properties']['target_columns'] + self.scaler = TargetScaler(mode=self.scaler_mode) + return self + + @property + def scaler_mode(self): + raise NotImplementedError + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the time series transformer to fit 
dictionary + Args: + X (Dict[str, Any]): fit dictionary + + Returns: + X (Dict[str, Any]): updated fit dictionary + """ + X.update({'target_scaler': self}) + return X + + def __call__(self, X: Union[np.ndarray, torch.tensor]) -> Union[np.ndarray, torch.tensor]: + + if self.scaler is None: + raise ValueError("cant call {} without fitting the column transformer first." + .format(self.__class__.__name__)) + + if len(X.shape) == 2: + # expand batch dimension when called on a single record + X = X[np.newaxis, ...] + X[:, :, self.target_columns], loc, scale = self.scaler.transform(X[:, :, self.target_columns]) + return X, loc, scale diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py new file mode 100644 index 000000000..128040cef --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py @@ -0,0 +1,47 @@ +from typing import Any, Dict, Callable, Optional, Union, Tuple + +import torch +import sklearn +from sklearn.base import BaseEstimator + + +# Similar to / inspired by +# https://github.com/tslearn-team/tslearn/blob/a3cf3bf/tslearn/preprocessing/preprocessing.py +class TargetScaler(BaseEstimator): + """ + To accelerate training, this scaler is only applied under trainer (after the data is loaded by dataloader) + """ + def __init__(self, mode: str): + self.mode = mode + + def fit(self, X: Dict, y: Any = None) -> "TimeSeriesScalerBatch": + return self + + def transform(self, X: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + if self.mode == "standard": + loc = torch.mean(X, dim=-2, keepdim=True) + scale = torch.std(X, dim=-2, keepdim=True) + scale[scale == 0.0] = 1.0 + return (X - loc) / scale, loc, scale + + elif self.mode == "min_max": + min_ = torch.min(X, dim=-2, keepdim=True)[0] + max_ = torch.max(X, dim=-2, keepdim=True)[0] + + diff_ = max_ - min_ + loc = min_ + scale = diff_ + scale[scale == 0.0] = 1.0 + return (X - loc) / scale, loc, scale + + elif self.mode == "max_abs": + max_abs_ = torch.max(torch.abs(X), dim=-2, keepdim=True)[0] + max_abs_[max_abs_ == 0.0] = 1.0 + scale = max_abs_ + return X / scale, None, scale + + elif self.mode == "none": + return X, None, None + + else: + raise ValueError(f"Unknown mode {self.mode} for Forecasting scaler") \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index c4c9eceb4..21f95a07d 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -9,7 +9,8 @@ from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. 
\ + base_target_scaler import BaseTargetScaler from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent @@ -24,7 +25,8 @@ def __init__( super(ForecastingNetworkComponent, self).__init__(network=network, random_state=random_state) self.auto_regressive = auto_regressive - def predict(self, loader: torch.utils.data.DataLoader) -> torch.Tensor: + def predict(self, loader: torch.utils.data.DataLoader, + target_scaler: Optional[BaseTargetScaler] = None) -> torch.Tensor: """ Performs batched prediction given a loader object """ @@ -36,11 +38,17 @@ def predict(self, loader: torch.utils.data.DataLoader) -> torch.Tensor: for i, (X_batch, Y_batch) in enumerate(loader): # Predict on batch - X = X_batch['value'] - loc = X_batch['loc'] - scale = X_batch['scale'] + X = X_batch['past_target'] - X = X.float().to(self.device) + X = X.float() + + if target_scaler is None: + loc = 0. + scale = 1. + else: + X, loc, scale = target_scaler(X) + + X = X.to(self.device) with torch.no_grad(): Y_batch_pred = self.network(X).mean @@ -49,17 +57,15 @@ def predict(self, loader: torch.utils.data.DataLoader) -> torch.Tensor: loc = 0. if scale is None: scale = 1. - Y_batch_pred = Y_batch_pred * scale + loc + Y_batch_pred = Y_batch_pred.cpu() * scale + loc Y_batch_preds.append(Y_batch_pred.cpu()) return torch.cat(Y_batch_preds, 0).cpu().numpy() - @staticmethod def get_hyperparameter_search_space(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, **kwargs: Any ) -> ConfigurationSpace: cs = ConfigurationSpace() return cs - diff --git a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distribution.py index 4fe8715ff..8ac7623d6 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distribution.py @@ -103,7 +103,7 @@ def arg_dims(self) -> Dict[str, int]: def domain_map(self, loc: torch.Tensor, scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: scale = F.softplus(scale) - return loc.squeeze(-1).squeeze(-1), scale.squeeze(-1).squeeze(-1) + return loc.squeeze(-1), scale.squeeze(-1) @property def dist_cls(self) -> type(Distribution): @@ -119,7 +119,7 @@ def domain_map(self, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor) \ -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: scale = F.softplus(scale) df = 2.0 + F.softplus(df) - return df.squeeze(-1).squeeze(-1), loc.squeeze(-1).squeeze(-1), scale.squeeze(-1).squeeze(-1) + return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1) @property def dist_cls(self) -> type(Distribution): @@ -137,7 +137,7 @@ def domain_map(self, concentration1: torch.Tensor, concentration0: torch.Tensor) epsilon = 1e-10 concentration1 = F.softplus(concentration1) + epsilon concentration0 = F.softplus(concentration0) + epsilon - return concentration1.squeeze(-1).squeeze(-1), concentration0.squeeze(-1).squeeze(-1) + return concentration1.squeeze(-1), concentration0.squeeze(-1).squeeze(-1) @property def dist_cls(self) -> type(Distribution): @@ -155,7 +155,7 @@ def domain_map(self, concentration: torch.Tensor, rate: torch.Tensor) \ epsilon = 1e-10 concentration = F.softplus(concentration) + epsilon rate = F.softplus(rate) + epsilon - return concentration.squeeze(-1).squeeze(-1), rate.squeeze(-1).squeeze(-1) + 
return concentration.squeeze(-1), rate.squeeze(-1) @property def dist_cls(self) -> type(Distribution): @@ -169,7 +169,7 @@ def arg_dims(self) -> Dict[str, int]: def domain_map(self, rate: torch.Tensor) -> Tuple[torch.Tensor,]: rate_pos = F.softplus(rate).clone() - return rate_pos.squeeze(-1).squeeze(-1), + return rate_pos.squeeze(-1), @property def dist_cls(self) -> type(Distribution): diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index c5e0a2739..88e032dbe 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -350,8 +350,8 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd applying the transformations meant to validation objects This is a lazy loaded test set, each time only one piece of series """ - # TODO any better way to deal with prediction data loader for multiple sequences - if isinstance(X, np.ndarray): + # TODO more supported inputs + if isinstance(X, (np.ndarray, torch.Tensor)): X = X[-self.subseq_length - self.n_prediction_steps + 1:] if y is not None: diff --git a/autoPyTorch/pipeline/components/training/metrics/base.py b/autoPyTorch/pipeline/components/training/metrics/base.py index c01c9be78..0857244d1 100644 --- a/autoPyTorch/pipeline/components/training/metrics/base.py +++ b/autoPyTorch/pipeline/components/training/metrics/base.py @@ -1,5 +1,5 @@ from abc import ABCMeta -from typing import Any, Callable, List, Optional +from typing import Any, Callable, List, Optional, Dict import numpy as np @@ -7,6 +7,7 @@ from sklearn.utils.multiclass import type_of_target + class autoPyTorchMetric(object, metaclass=ABCMeta): def __init__(self, @@ -197,7 +198,8 @@ def __call__( y_pred: np.ndarray, sp: int, n_prediction_steps: int, - horizon_weight: Optional[List[float]] = None + horizon_weight: Optional[List[float]] = None, + **kwarg: Dict, ) -> float: """Evaluate time series forecastin losses given input data The description is nearly the same as the one defined under @@ -222,31 +224,8 @@ def __call__( score : float Score function applied to prediction of estimator on X. 
""" - type_true = type_of_target(y_true) - if type_true == 'binary' and type_of_target(y_pred) == 'continuous' and \ - len(y_pred.shape) == 1: - # For a pred autoPyTorchMetric, no threshold, nor probability is required - # If y_true is binary, and y_pred is continuous - # it means that a rounding is necessary to obtain the binary class - y_pred = np.around(y_pred, decimals=0) - elif len(y_pred.shape) == 1 or y_pred.shape[1] == 1 or \ - type_true == 'continuous': - # must be regression, all other task types would return at least - # two probabilities - pass - elif type_true in ['binary', 'multiclass']: - y_pred = np.argmax(y_pred, axis=1) - elif type_true == 'multilabel-indicator': - y_pred[y_pred > 0.5] = 1.0 - y_pred[y_pred <= 0.5] = 0.0 - elif type_true in ['continuous-multioutput', 'multiclass-multioutput']: - pass - else: - raise ValueError(type_true) agg = self._kwargs['aggregation'] - y_true = y_true.reshape([-1, n_prediction_steps]) - y_pred = y_pred.reshape([-1, n_prediction_steps]) if not len(y_pred) == len(y_true): raise ValueError(f"The length of y_true, y_pred and y_train must equal, however, they are " @@ -260,9 +239,9 @@ def __call__( horizon_weight=horizon_weight, **self._kwargs) if agg == 'mean': - return np.mean(losses_all) + return self._sign * np.mean(losses_all) elif agg == 'median': - return np.median(losses_all) + return self._sign * np.median(losses_all) else: raise ValueError(f'Unsupported aggregation type {agg}') diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index ef614996f..6cf5a9489 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -71,7 +71,7 @@ def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> f """ mase_denominator = forecasting_metrics.mean_absolute_error(past_target[sp:], past_target[:-sp], - multioutput="uniform_average") + multioutput="raw_values") return 1.0 / np.maximum(mase_denominator, forecasting_metrics._functions.EPS) @@ -96,24 +96,6 @@ def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> f MASE_LOSSES = [mean_MASE_forecasting, median_MASE_forecasting] -mean_MSSE_forecasting = make_metric('mean_MSSE_forecasting', - forecasting_metrics.mean_squared_scaled_error, - optimum=0, - worst_possible_result=MAXINT, - greater_is_better=False, - do_forecasting=True, - aggregation='mean', - ) - -median_MSSE_forecasting = make_metric('median_MSSE_forecasting', - forecasting_metrics.mean_squared_scaled_error, - optimum=0, - worst_possible_result=MAXINT, - greater_is_better=False, - do_forecasting=True, - aggregation='median', - ) - mean_MAE_forecasting = make_metric('mean_MAE_forecasting', forecasting_metrics.mean_absolute_error, optimum=0, @@ -199,7 +181,6 @@ def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> f FORECASTING_METRICS = dict() for scorer in [mean_MASE_forecasting, median_MASE_forecasting, - mean_MSSE_forecasting, median_MSSE_forecasting, mean_MAE_forecasting, median_MAE_forecasting, mean_MAPE_forecasting, median_MAPE_forecasting, mean_MSE_forecasting, median_MSE_forecasting]: diff --git a/autoPyTorch/pipeline/components/training/metrics/utils.py b/autoPyTorch/pipeline/components/training/metrics/utils.py index 2776f4d67..03cf4cca6 100644 --- a/autoPyTorch/pipeline/components/training/metrics/utils.py +++ b/autoPyTorch/pipeline/components/training/metrics/utils.py @@ -124,7 +124,7 @@ def 
calculate_score( for metric_ in metrics: if metric_ in MASE_LOSSES and 'mase_cofficient' in score_kwargs: target_scaled = target * score_kwargs['mase_cofficient'] - cprediction_scaled = cprediction_scaled * score_kwargs['mase_cofficient'] + cprediction_scaled = cprediction * score_kwargs['mase_cofficient'] score_dict[metric_.name] = metric_(target_scaled, cprediction_scaled, **score_kwargs) else: score_dict[metric_.name] = metric_(target, cprediction, **score_kwargs) diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index 7caeb5ee1..ed77337ea 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -66,19 +66,21 @@ def __init__(self, random_state=random_state) self.run_summary: Optional[RunSummary] = None self.writer: Optional[SummaryWriter] = None - self._fit_requirements: Optional[List[FitRequirement]] = [ - FitRequirement("lr_scheduler", (_LRScheduler,), user_defined=False, dataset_property=False), - FitRequirement("num_run", (int,), user_defined=False, dataset_property=False), - FitRequirement( - "optimizer", (Optimizer,), user_defined=False, dataset_property=False), - FitRequirement("train_data_loader", - (torch.utils.data.DataLoader,), - user_defined=False, dataset_property=False), - FitRequirement("val_data_loader", - (torch.utils.data.DataLoader,), - user_defined=False, dataset_property=False)] self.checkpoint_dir: Optional[str] = None + @property + def _fit_requirements(self) -> Optional[List[FitRequirement]]: + return [FitRequirement("lr_scheduler", (_LRScheduler,), user_defined=False, dataset_property=False), + FitRequirement("num_run", (int,), user_defined=False, dataset_property=False), + FitRequirement( + "optimizer", (Optimizer,), user_defined=False, dataset_property=False), + FitRequirement("train_data_loader", + (torch.utils.data.DataLoader,), + user_defined=False, dataset_property=False), + FitRequirement("val_data_loader", + (torch.utils.data.DataLoader,), + user_defined=False, dataset_property=False)] + def get_fit_requirements(self) -> Optional[List[FitRequirement]]: return self._fit_requirements @@ -98,11 +100,11 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: return components def get_hyperparameter_search_space( - self, - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, ) -> ConfigurationSpace: """Returns the configuration space of the current chosen components @@ -204,7 +206,7 @@ def fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> autoPyTorchCom name=f"{X['num_run']}_{time.time()}", # Log to a user provided port else to the default logging port port=X['logger_port' - ] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, + ] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, ) # Call the actual fit function. 
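Note (not part of the patch): to motivate the calculate_score change above, which scales both targets and predictions by the MASE coefficient before calling the metric, the snippet below checks the identity it relies on, using arbitrary illustrative shapes: for a positive per-target coefficient c, mean(|y*c - yhat*c|) equals mean(|y - yhat| * c), so an MAE computed on the rescaled values is exactly the mean absolute scaled error.

import numpy as np

rng = np.random.default_rng(0)
y_true = rng.random((8, 3))       # (flattened horizon, num_targets), assumed toy shapes
y_pred = rng.random((8, 3))
mase_coeff = rng.random((1, 3))   # per-target inverse naive MAE, broadcast over steps

lhs = np.mean(np.abs(y_true * mase_coeff - y_pred * mase_coeff))
rhs = np.mean(np.abs(y_true - y_pred) * mase_coeff)
assert np.isclose(lhs, rhs)
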
@@ -216,6 +218,33 @@ def fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> autoPyTorchCom return cast(autoPyTorchComponent, self.choice) + def prepare_trainer(self, X): + """ + prepare trainer, forecasting tasks require more parameters + """ + # Support additional user metrics + metrics = get_metrics(dataset_properties=X['dataset_properties']) + if 'additional_metrics' in X: + metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=X['additional_metrics'])) + if 'optimize_metric' in X and X['optimize_metric'] not in [m.name for m in metrics]: + metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=[X['optimize_metric']])) + + additional_losses = X['additional_losses'] if 'additional_losses' in X else None + self.choice.prepare( + model=X['network'], + metrics=metrics, + criterion=get_loss(X['dataset_properties'], + name=additional_losses), + budget_tracker=self.budget_tracker, + optimizer=X['optimizer'], + device=get_device_from_fit_dictionary(X), + metrics_during_training=X['metrics_during_training'], + scheduler=X['lr_scheduler'], + task_type=STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']], + labels=X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]], + step_interval=X['step_interval'], + ) + def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoice': """ Fits a component by using an input dictionary with pre-requisites @@ -249,29 +278,7 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic max_epochs=X['epochs'] if 'epochs' in X else None, ) - # Support additional user metrics - metrics = get_metrics(dataset_properties=X['dataset_properties']) - if 'additional_metrics' in X: - metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=X['additional_metrics'])) - if 'optimize_metric' in X and X['optimize_metric'] not in [m.name for m in metrics]: - metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=[X['optimize_metric']])) - - additional_losses = X['additional_losses'] if 'additional_losses' in X else None - self.choice.prepare( - model=X['network'], - metrics=metrics, - criterion=get_loss(X['dataset_properties'], - name=additional_losses), - budget_tracker=self.budget_tracker, - optimizer=X['optimizer'], - device=get_device_from_fit_dictionary(X), - metrics_during_training=X['metrics_during_training'], - scheduler=X['lr_scheduler'], - task_type=STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']], - labels=X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]], - step_interval=X['step_interval'], - dataset_properties=X['dataset_properties'], - ) + self.prepare_trainer(X) total_parameter_count, trainable_parameter_count = self.count_parameters(X['network']) self.run_summary = RunSummary( total_parameter_count, diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py index 2135a4a46..d619011e4 100644 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py @@ -16,7 +16,8 @@ from autoPyTorch.constants import REGRESSION_TASKS, FORECASTING_TASKS from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent -from autoPyTorch.pipeline.components.training.metrics.metrics import CLASSIFICATION_METRICS, 
REGRESSION_METRICS +from autoPyTorch.pipeline.components.training.metrics.metrics import CLASSIFICATION_METRICS, REGRESSION_METRICS, \ + FORECASTING_METRICS from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score from autoPyTorch.utils.implementations import get_loss_weight_strategy @@ -123,11 +124,12 @@ def get_best_epoch(self, loss_type: str = 'val_loss') -> int: # If we compute validation scores, prefer the performance # metric to the loss if self.optimize_metric is not None: - scorer = CLASSIFICATION_METRICS[ - self.optimize_metric - ] if self.optimize_metric in CLASSIFICATION_METRICS else REGRESSION_METRICS[ - self.optimize_metric - ] + if self.optimize_metric in CLASSIFICATION_METRICS: + scorer = CLASSIFICATION_METRICS[self.optimize_metric] + elif self.optimize_metric in REGRESSION_METRICS: + scorer = REGRESSION_METRICS[self.optimize_metric] + else: + scorer = FORECASTING_METRICS[self.optimize_metric] # Some metrics maximize, other minimize! opt_func = np.argmax if scorer._sign > 0 else np.argmin return int(opt_func( diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index 9ae1e7f9e..23e1563ae 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -3,17 +3,24 @@ from typing import Any, Dict, List, Optional, Tuple, cast +from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer.forecasting_base_trainer import ( + ForecastingBaseTrainerComponent, +) + +from autoPyTorch.constants import STRING_TO_TASK_TYPES from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, autoPyTorchComponent, find_components, ) +from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary +from autoPyTorch.pipeline.components.training.losses import get_loss +from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. 
\ + base_target_scaler import BaseTargetScaler -from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer.forecasting_base_trainer import ( - ForecastingBaseTrainerComponent, -) - +from autoPyTorch.utils.common import get_device_from_fit_dictionary trainer_directory = os.path.split(__file__)[0] _trainers = find_components(__package__, @@ -30,6 +37,40 @@ def add_trainer(trainer: ForecastingBaseTrainerComponent) -> None: class ForecastingTrainerChoice(TrainerChoice): + @property + def _fit_requirements(self) -> Optional[List[FitRequirement]]: + fit_requirements = super()._fit_requirements + fit_requirements.append(FitRequirement("target_scaler", (BaseTargetScaler,), + user_defined=False, dataset_property=False)) + return fit_requirements + + def prepare_trainer(self, X): + # Support additional user metrics + metrics = get_metrics(dataset_properties=X['dataset_properties']) + if 'additional_metrics' in X: + metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=X['additional_metrics'])) + if 'optimize_metric' in X and X['optimize_metric'] not in [m.name for m in metrics]: + metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=[X['optimize_metric']])) + + additional_losses = X['additional_losses'] if 'additional_losses' in X else None + + self.choice.prepare( + model=X['network'], + metrics=metrics, + criterion=get_loss(X['dataset_properties'], + name=additional_losses), + budget_tracker=self.budget_tracker, + optimizer=X['optimizer'], + device=get_device_from_fit_dictionary(X), + metrics_during_training=X['metrics_during_training'], + scheduler=X['lr_scheduler'], + task_type=STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']], + labels=X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]], + step_interval=X['step_interval'], + dataset_properties=X['dataset_properties'], + target_scaler=X['target_scaler'], + ) + def get_components(self) -> Dict[str, autoPyTorchComponent]: """Returns the available trainer components @@ -44,5 +85,3 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: components.update(_trainers) components.update(_addons.components) return components - - diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 39a79a8c3..6bda6332a 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -16,6 +16,10 @@ TransformedDistribution, ) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. \ + base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. 
\ + TargetNoScaler import TargetNoScaler from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit from autoPyTorch.pipeline.components.training.metrics.metrics import MASE_LOSSES from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score @@ -37,13 +41,11 @@ def prepare( task_type: int, labels: Union[np.ndarray, torch.Tensor, pd.DataFrame], step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch, - dataset_properties: Optional[Dict]=None + dataset_properties: Optional[Dict] = None, + target_scaler: BaseTargetScaler = TargetNoScaler(), ) -> None: - for metric in metrics: - if metric in MASE_LOSSES: - warnings.warn("MASE Losses are not supported for trainer! We remove them here") - metrics.remove(metric) - + # metrics_during_training is not appliable when computing scaled values + metrics_during_training = False super().prepare(metrics=metrics, model=model, criterion=criterion, @@ -59,6 +61,7 @@ def prepare( metric_kwargs = {"sp": dataset_properties.get("sp", 1), "n_prediction_steps": dataset_properties.get("n_prediction_steps", 1)} self.metrics_kwargs = metric_kwargs + self.target_scaler = target_scaler # typing: BaseTargetScaler def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, writer: Optional[SummaryWriter], @@ -74,7 +77,6 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, float: training loss Dict[str, float]: scores for each desired metric """ - loss_sum = 0.0 N = 0 self.model.train() @@ -92,7 +94,7 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, outputs_data.append(outputs.detach().cpu()) targets_data.append(targets.detach().cpu()) - batch_size = data["value"].size(0) + batch_size = data["past_target"].size(0) loss_sum += loss * batch_size N += batch_size @@ -111,18 +113,18 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, return loss_sum / N, {} def rescale_output_distribution(self, - outputs: torch.distributions.Distribution, - loc: Optional[torch.Tensor], - scale: Optional[torch.Tensor]): + outputs: torch.distributions.Distribution, + loc: Optional[torch.Tensor], + scale: Optional[torch.Tensor]): # https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/torch/modules/distribution_output.py if loc is not None or scale is not None: - transfomr = AffineTransform(loc=0.0 if loc is None else loc, - scale=1.0 if scale is None else scale, + transfomr = AffineTransform(loc=0.0 if loc is None else loc.to(self.device), + scale=1.0 if scale is None else scale.to(self.device), ) outputs = TransformedDistribution(outputs, [transfomr]) return outputs - def train_step(self, data: Dict[str, torch.Tensor], targets: Dict[str, Union[torch.Tensor, np.ndarray]])\ + def train_step(self, data: Dict[str, torch.Tensor], targets: Dict[str, Union[torch.Tensor, np.ndarray]]) \ -> Tuple[float, torch.Tensor]: """ Allows to train 1 step of gradient descent, given a batch of train/labels @@ -135,12 +137,14 @@ def train_step(self, data: Dict[str, torch.Tensor], targets: Dict[str, Union[to torch.Tensor: The predictions of the network float: the loss incurred in the prediction """ - X = data['value'] - loc = data['loc'] - scale = data['scale'] + X = data['past_target'] # prepare - X = X.float().to(self.device) + X = X.float() + + X, loc, scale = self.target_scaler(X) + + X = X.to(self.device) targets = self.cast_targets(targets) @@ -181,16 +185,22 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, 
outputs_data = list() targets_data = list() + mase_coefficients = list() + with torch.no_grad(): for step, (data, targets) in enumerate(test_loader): - X = data['value'] - loc = data['loc'] - scale = data['scale'] + X = data['past_target'] + + mase_coefficients.append(data['mase_coefficient']) batch_size = X.shape[0] # prepare - X = X.float().to(self.device) + X = X.float() + + X, loc, scale = self.target_scaler(X) + + X = X.to(self.device) targets = self.cast_targets(targets) @@ -211,7 +221,7 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, loc = 0. if scale is None: scale = 1. - outputs_data.append(outputs.base_dist.mean * scale + loc) + outputs_data.append(outputs.base_dist.mean.detach().cpu() * scale + loc) targets_data.append(targets.detach().cpu()) if writer: @@ -220,6 +230,10 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, loss.item(), epoch * len(test_loader) + step, ) + # mase_coefficent has the shape [B, 1, 1] + # to be compatible with outputs_data with shape [B, n_prediction_steps, num_output] + mase_coefficients = np.expand_dims(torch.cat(mase_coefficients, dim=0).numpy(), axis=[1]) + self.metrics_kwargs.update({'mase_cofficient': mase_coefficients}) self._scheduler_step(step_interval=StepIntervalUnit.valid, loss=loss_sum / N) @@ -231,4 +245,5 @@ def compute_metrics(self, outputs_data: List[torch.Tensor], targets_data: List[t # TODO: change once Ravin Provides the PR outputs_data = torch.cat(outputs_data, dim=0).numpy() targets_data = torch.cat(targets_data, dim=0).numpy() + return calculate_score(targets_data, outputs_data, self.task_type, self.metrics, **self.metrics_kwargs) diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index d01bffdfa..36a375d58 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -8,6 +8,7 @@ import numpy as np from sklearn.base import RegressorMixin +from sklearn.pipeline import Pipeline import torch @@ -29,6 +30,8 @@ from autoPyTorch.pipeline.components.setup.network_initializer import ( NetworkInitializerChoice ) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling import \ + TargetScalerChoice from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import \ TimeSeriesForecastingDataLoader @@ -81,6 +84,8 @@ def __init__(self, config, steps, dataset_properties, include, exclude, random_state, init_params, search_space_updates) + self.target_scaler = None + # Because a pipeline is passed to a worker, we need to honor the random seed # in this context. A tabular regression pipeline will implement a torch # model, so we comply with https://pytorch.org/docs/stable/notes/randomness.html @@ -105,6 +110,12 @@ def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None) metrics=metrics)['r2'] return r2 + + def fit(self, X: Dict[str, Any], y: Optional[np.ndarray] = None, + **fit_params: Any) -> Pipeline: + super().fit(X, y, ** fit_params) + self.target_scaler = X['target_scaler'] + def _get_hyperparameter_search_space(self, dataset_properties: Dict[str, Any], include: Optional[Dict[str, Any]] = None, @@ -202,7 +213,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L # TODO consider the correct way of doing imputer for time series forecasting tasks. 
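Note (not part of the patch): a minimal sketch, assuming the "standard" mode and a stand-in for the network output, of the per-sequence target-scaling round trip that this patch introduces. Targets are normalised with statistics computed along the time axis, the network is trained and evaluated on the scaled values, and forecasts are mapped back with pred * scale + loc before metrics are computed.

import torch

X = torch.randn(4, 20, 1)                # [batch, time, num_targets], toy data
loc = X.mean(dim=-2, keepdim=True)
scale = X.std(dim=-2, keepdim=True)
scale[scale == 0.0] = 1.0                # avoid dividing by zero for constant series

X_scaled = (X - loc) / scale             # what the network actually sees
y_pred_scaled = X_scaled[:, -1:, :]      # stand-in for a network forecast
y_pred = y_pred_scaled * scale + loc     # back on the original target scale
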
steps.extend([ ("imputer", SimpleImputer(random_state=self.random_state)), - ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), + # ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), ("time_series_transformer", TimeSeriesTransformer(random_state=self.random_state)), ("preprocessing", EarlyPreprocessing(random_state=self.random_state)), ("data_loader", TimeSeriesForecastingDataLoader(upper_sequence_length=self.upper_sequence_length, @@ -218,6 +229,8 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L random_state=self.random_state)), ("lr_scheduler", SchedulerChoice(default_dataset_properties, random_state=self.random_state)), + ("target_scaler", TargetScalerChoice(default_dataset_properties, + random_state=self.random_state)), ("trainer", ForecastingTrainerChoice(default_dataset_properties, random_state=self.random_state)), ]) return steps @@ -284,4 +297,4 @@ def predict(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray batch_size = X.shape[0] loader = self.named_steps['data_loader'].get_loader(X=X, batch_size=batch_size) - return self.named_steps['network'].predict(loader) + return self.named_steps['network'].predict(loader, self.target_scaler) From 5f355999c4e2f40a2ecc81796939df4f2c0dd805 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 6 Dec 2021 22:54:28 +0100 Subject: [PATCH 076/347] maint --- autoPyTorch/api/base_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 42aec5f8c..4bda47df6 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -934,7 +934,7 @@ def _search( # Incorporate budget to pipeline config - if budget_type not in ('epochs', 'runtime') or (budget_type == 'resolution' and not time_series_forecasting): + if budget_type not in ('epochs', 'runtime') and (budget_type == 'resolution' and not time_series_forecasting): raise ValueError("Budget type must be one ('epochs', 'runtime')" f" yet {budget_type} was provided") self.pipeline_options['budget_type'] = budget_type From 29812bdaf6ea19fc357494d78968baf48ceaa080 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 6 Dec 2021 23:50:50 +0100 Subject: [PATCH 077/347] maint --- autoPyTorch/api/base_task.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 4bda47df6..e4ab30f37 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -33,6 +33,7 @@ from autoPyTorch.automl_common.common.utils.backend import Backend, create from autoPyTorch.constants import ( REGRESSION_TASKS, + FORECASTING_TASKS, STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, ) @@ -77,7 +78,7 @@ def send_warnings_to_log( X_ = X.copy() with warnings.catch_warnings(): warnings.showwarning = send_warnings_to_log - if task in REGRESSION_TASKS: + if task in REGRESSION_TASKS or task in FORECASTING_TASKS: # Voting regressor does not support batch size prediction = pipeline.predict(X_) else: From c2bc325a4c2741380810cccdf843a3a545cbccbd Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 8 Dec 2021 00:01:35 +0100 Subject: [PATCH 078/347] network embedding and update lstm for DeepAR model --- autoPyTorch/datasets/time_series_dataset.py | 4 +- .../forecasting_target_scaling/__init__.py | 2 +- .../components/setup/network/base_network.py | 7 +-- .../setup/network/forecasting_network.py | 20 +++++++- 
.../setup/network_backbone/LSTMBackbone.py | 49 +++++++++++++------ .../setup/network_backbone/utils.py | 9 +++- .../LearnedEntityEmbedding.py | 2 +- .../setup/network_embedding/NoEmbedding.py | 2 +- .../base_network_embedding.py | 11 +++-- .../distributed_fully_connected.py | 20 ++++++-- .../distributed_network_head.py | 4 +- .../pipeline/time_series_forecasting.py | 10 ++-- 12 files changed, 100 insertions(+), 40 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index b4915a4d4..486513f4b 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -243,6 +243,8 @@ def __init__(self, self.num_features = self.validator.feature_validator.num_features # type: int self.num_target = self.validator.target_validator.out_dimensionality # type: int + self.categories = self.validator.feature_validator.categories + self.shift_input_data = shift_input_data self.target_variables = target_variables if target_variables is None: @@ -563,7 +565,7 @@ def get_required_dataset_info(self) -> Dict[str, Any]: 'categorical_features': self.categorical_features, 'numerical_columns': self.numerical_columns, 'categorical_columns': self.categorical_columns, - 'upper_window_size': self.upper_window_size, + 'categories': self.categories, 'train_with_log_prob': self.train_with_log_prob, 'target_columns': self.target_columns }) diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/__init__.py index a40da3792..83e0c9e7a 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/__init__.py @@ -112,4 +112,4 @@ def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None: """ super()._check_dataset_properties(dataset_properties) assert "target_columns" in dataset_properties, \ - "Dataset properties must contain information about the type of target_columns" + "Dataset properties must contain information about the target_columns" diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py index eec9f18f7..49928b77c 100644 --- a/autoPyTorch/pipeline/components/setup/network/base_network.py +++ b/autoPyTorch/pipeline/components/setup/network/base_network.py @@ -30,7 +30,7 @@ def __init__( self.add_fit_requirements([ FitRequirement("network_head", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_backbone", (torch.nn.Module,), user_defined=False, dataset_property=False), - # FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), + FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), ]) self.network = network self.final_activation: Optional[torch.nn.Module] = None @@ -49,10 +49,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: # Make sure that input dictionary X has the required # information to fit this stage self.check_requirements(X, y) - if 'network_embedding' in X.keys(): - self.network = torch.nn.Sequential(X['network_embedding'], X['network_backbone'], X['network_head']) - else: - self.network = 
torch.nn.Sequential(X['network_backbone'], X['network_head']) + self.network = torch.nn.Sequential(X['network_embedding'], X['network_backbone'], X['network_head']) # Properly set the network training device if self.device is None: diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 21f95a07d..d7c3198d0 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -1,5 +1,4 @@ from typing import Any, Dict, Optional, Union - from ConfigSpace.configuration_space import ConfigurationSpace import numpy as np @@ -15,6 +14,25 @@ from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent +class ForecastingNet(nn.Module): + def __init__(self, + network_backbone: nn.Module, + network_head: nn.Module, + network_embedding: Optional[nn.embedding] = None, + network_properties: Dict = {}): + super(ForecastingNet, self).__init__() + if network_embedding is not None: + self.backbone = nn.Sequential(network_embedding, network_backbone) + else: + self.backbone = nn.Sequential(network_backbone) + self.backbone_output_tuple = network_properties.get("backbone_output_tuple", False) + self.network_head = network_head + + def forward(self, X: torch.Tensor): + # TODO find a proper way to pass hx to lstm + pass + + class ForecastingNetworkComponent(NetworkComponent): def __init__( self, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py index b2fe70c89..637555283 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py @@ -11,7 +11,8 @@ import torch from torch import nn -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import\ +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import \ NetworkBackboneComponent from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter @@ -22,6 +23,7 @@ def __init__(self, config: Dict[str, Any]): super().__init__() self.config = config + self.only_return_final_stage = True self.lstm = nn.LSTM(input_size=in_features, hidden_size=config["hidden_size"], num_layers=config["num_layers"], @@ -29,23 +31,28 @@ def __init__(self, bidirectional=config["bidirectional"], batch_first=True) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor, + hx: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> Tuple[torch.Tensor, ...]: B, T, _ = x.shape - hidden_states, (_, _) = self.lstm(x) - if not self.config["bidirectional"]: - return hidden_states[:, -1, :] + outputs, hidden_state, = self.lstm(x, hx) + + if self.only_return_final_stage: + if not self.config["bidirectional"]: + return outputs[:, -1, :], + else: + # concatenate last forward hidden state with first backward hidden state + outputs_by_direction = outputs.view(B, + T, + 2, + self.config["hidden_size"]) + out = torch.cat([ + outputs_by_direction[:, -1, 0, :], + outputs_by_direction[:, 0, 1, :] + ], dim=1) + return out, else: - # concatenate last forward hidden state with first backward hidden state - hidden_states_by_direction = hidden_states.view(B, - T, - 2, - 
self.config["hidden_size"]) - out = torch.cat([ - hidden_states_by_direction[:, -1, 0, :], - hidden_states_by_direction[:, 0, 1, :] - ], dim=1) - return out + return outputs, hidden_state class LSTMBackbone(NetworkBackboneComponent): @@ -63,6 +70,18 @@ def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: self.backbone = backbone return backbone + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + X['network_output_tuple'] = True + return super().fit(X, y) + + @property + def only_return_final_stage(self): + return self.backbone.only_return_final_stage + + @only_return_final_stage.setter + def only_return_final_stage(self, only_return_final_stage): + self.backbone.only_return_final_stage = only_return_final_stage + @staticmethod def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]: return { diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index bb1a93ac1..1f9530a6c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -16,18 +16,23 @@ } -def get_output_shape(network: torch.nn.Module, input_shape: Tuple[int, ...] +def get_output_shape(network: torch.nn.Module, input_shape: Tuple[int, ...], network_output_tuple: bool = False ) -> Tuple[int, ...]: """ Run a dummy forward pass to get the output shape of the backbone. Can and should be overridden by subclasses that know the output shape without running a dummy forward pass. :param input_shape: shape of the input + : network_output_tuple: bool, if the network backbone output a tuple. if yes, the shape of the first output is + returned :return: output_shape """ placeholder = torch.randn((2, *input_shape), dtype=torch.float) with torch.no_grad(): - output = network(placeholder) + if network_output_tuple: + output = network(placeholder)[0] + else: + output = network(placeholder) return tuple(output.shape[1:]) diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index 49ecf40b7..8ad6549a2 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -131,5 +131,5 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT 'name': 'LearnedEntityEmbedding', 'handles_tabular': True, 'handles_image': False, - 'handles_time_series': False, + 'handles_time_series': True, } diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py index aded4f84d..028dfb77b 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py @@ -42,5 +42,5 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT 'name': 'NoEmbedding', 'handles_tabular': True, 'handles_image': False, - 'handles_time_series': False, + 'handles_time_series': True, } diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 8652c347c..770741e2f 100644 --- 
a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -38,9 +38,14 @@ def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: num_numerical_columns = 0 else: X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) - - numerical_column_transformer = X['tabular_transformer'].preprocessor. \ - named_transformers_['numerical_pipeline'] + if 'tabular_transformer' in X: + numerical_column_transformer = X['tabular_transformer'].preprocessor. \ + named_transformers_['numerical_pipeline'] + elif 'time_series_transformer' in X: + numerical_column_transformer = X['time_series_transformer'].preprocessor. \ + named_transformers_['numerical_pipeline'] + else: + raise ValueError("Either a tabular or time_series transformer must be contained!") num_numerical_columns = numerical_column_transformer.transform( X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), diff --git a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_fully_connected.py b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_fully_connected.py index e0ff4f925..675eb18f0 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_fully_connected.py +++ b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_fully_connected.py @@ -28,7 +28,16 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT } def _build_head(self, input_shape: Tuple[int, ...]) -> Tuple[List[nn.Module], int]: - return FullyConnectedHead._build_head(self, input_shape) + layers = [] + in_features = input_shape[-1] + for i in range(1, self.config["num_layers"]): + layers.append(nn.Linear(in_features=in_features, + out_features=self.config[f"units_layer_{i}"])) + layers.append(_activations[self.config["activation"]]()) + in_features = self.config[f"units_layer_{i}"] + head_base_output_features = in_features + + return layers, head_base_output_features @staticmethod def get_hyperparameter_search_space( @@ -45,9 +54,9 @@ def get_hyperparameter_search_space( dist_cls: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dist_cls", value_range=tuple(ALL_DISTRIBUTIONS.keys()), default_value=list(ALL_DISTRIBUTIONS.keys())[0]), - auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="auto_regressive", - value_range=(True, False), - default_value=False) + #auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="auto_regressive", + # value_range=(True, False), + # default_value=False) ) -> ConfigurationSpace: cs = FullyConnectedHead.get_hyperparameter_search_space(dataset_properties=dataset_properties, num_layers=num_layers, @@ -55,5 +64,6 @@ def get_hyperparameter_search_space( activation=activation) add_hyperparameter(cs, dist_cls, CategoricalHyperparameter) - add_hyperparameter(cs, auto_regressive, CategoricalHyperparameter) + # TODO let dataset_properties decide if autoregressive models is applied + #add_hyperparameter(cs, auto_regressive, CategoricalHyperparameter) return cs diff --git a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_network_head.py 
b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_network_head.py index 200785f8b..6b2cafc95 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_network_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_network_head.py @@ -54,10 +54,12 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: if auto_regressive: output_shape[0] = 1 mlp_backbone = X.get("MLP_backbone", False) + network_output_tuple = X.get("network_output_tuple", False) if mlp_backbone: input_shape = (X["window_size"], input_shape[-1]) self.head = self.build_head( - input_shape=get_output_shape(X['network_backbone'], input_shape=input_shape), + input_shape=get_output_shape(X['network_backbone'], input_shape=input_shape, + network_output_tuple=network_output_tuple), output_shape=output_shape, ) return self diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 36a375d58..655fc525e 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -21,10 +21,12 @@ TimeSeriesTransformer ) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import ScalerChoice +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ + ScalerChoice from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent +from autoPyTorch.pipeline.components.setup.network_embedding import NetworkEmbeddingChoice from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice from autoPyTorch.pipeline.components.setup.network_head import NetworkHeadChoice from autoPyTorch.pipeline.components.setup.network_initializer import ( @@ -110,10 +112,9 @@ def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None) metrics=metrics)['r2'] return r2 - def fit(self, X: Dict[str, Any], y: Optional[np.ndarray] = None, **fit_params: Any) -> Pipeline: - super().fit(X, y, ** fit_params) + super().fit(X, y, **fit_params) self.target_scaler = X['target_scaler'] def _get_hyperparameter_search_space(self, @@ -152,7 +153,6 @@ def _get_hyperparameter_search_space(self, warnings.warn('Time series forecasting is being used, however the target_type' 'is not given as "time_series_forecasting". Overriding it.') dataset_properties['target_type'] = 'time_series_forecasting' - # get the base search space given this # dataset properties. 
Then overwrite with custom # regression requirements @@ -218,6 +218,8 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L ("preprocessing", EarlyPreprocessing(random_state=self.random_state)), ("data_loader", TimeSeriesForecastingDataLoader(upper_sequence_length=self.upper_sequence_length, random_state=self.random_state)), + ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties, + random_state=self.random_state)), ("network_backbone", NetworkBackboneChoice(default_dataset_properties, random_state=self.random_state)), ("network_head", NetworkHeadChoice(default_dataset_properties, From 528516987221c8e3c7e34a6459da267d0ff95316 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 8 Dec 2021 20:02:51 +0100 Subject: [PATCH 079/347] maint --- autoPyTorch/datasets/time_series_dataset.py | 5 +- .../setup/network/forecasting_network.py | 21 +-- .../InceptionTimeBackbone.py | 10 +- .../LSTMBackbone.py | 13 +- .../TCNBackbone.py | 14 +- .../TimeSeriesMLPBackbone.py | 10 +- .../forecasting_network_backbone/__init__.py | 47 +++++++ .../base_forecasting_backbone.py | 20 +++ .../components/setup/network_head/__init__.py | 11 -- .../distributed_network_head/__init__.py | 17 --- .../forecasting_network_head/__init__.py | 54 ++++++++ .../distributed_fully_connected.py | 4 +- .../distributed_lstm_head.py | 131 ++++++++++++++++++ .../distributed_network_head.py | 2 +- .../distribution.py | 0 .../components/training/metrics/metrics.py | 13 +- .../pipeline/time_series_forecasting.py | 9 +- 17 files changed, 319 insertions(+), 62 deletions(-) rename autoPyTorch/pipeline/components/setup/network_backbone/{ => forecasting_network_backbone}/InceptionTimeBackbone.py (95%) rename autoPyTorch/pipeline/components/setup/network_backbone/{ => forecasting_network_backbone}/LSTMBackbone.py (92%) rename autoPyTorch/pipeline/components/setup/network_backbone/{ => forecasting_network_backbone}/TCNBackbone.py (95%) rename autoPyTorch/pipeline/components/setup/network_backbone/{ => forecasting_network_backbone}/TimeSeriesMLPBackbone.py (83%) create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/base_forecasting_backbone.py delete mode 100644 autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py rename autoPyTorch/pipeline/components/setup/network_head/{distributed_network_head => forecasting_network_head}/distributed_fully_connected.py (94%) create mode 100644 autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_lstm_head.py rename autoPyTorch/pipeline/components/setup/network_head/{distributed_network_head => forecasting_network_head}/distributed_network_head.py (97%) rename autoPyTorch/pipeline/components/setup/network_head/{distributed_network_head => forecasting_network_head}/distribution.py (100%) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 486513f4b..cbe940abf 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -105,9 +105,6 @@ def __getitem__(self, index: int, train: bool = True) \ X = self.train_transform(X) elif self.val_transform is not None and not train: X = self.val_transform(X) - else: - loc = None - scale = None 
# In case of prediction, the targets are not provided Y = self.Y @@ -215,7 +212,7 @@ def __init__(self, seasonality = SEASONALITY_MAP.get(freq, 1) if isinstance(seasonality, list): seasonality = min(seasonality) # Use to calculate MASE - self.seasonality = seasonality + self.seasonality = int(seasonality) self.freq: Optional[str] = freq self.freq_value: Optional[int] = freq_value diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index d7c3198d0..bcc0c5414 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Optional, Union, Tuple from ConfigSpace.configuration_space import ConfigurationSpace import numpy as np @@ -13,24 +13,24 @@ from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent - +""" class ForecastingNet(nn.Module): def __init__(self, + network_embedding: nn.Module, network_backbone: nn.Module, network_head: nn.Module, - network_embedding: Optional[nn.embedding] = None, network_properties: Dict = {}): super(ForecastingNet, self).__init__() - if network_embedding is not None: - self.backbone = nn.Sequential(network_embedding, network_backbone) - else: - self.backbone = nn.Sequential(network_backbone) + self.embedding = network_embedding + self.backbone = network_backbone self.backbone_output_tuple = network_properties.get("backbone_output_tuple", False) + self.accept_hidden_states_as_input = network_properties.get("_accept_hidden_states_as_input", False) self.network_head = network_head - def forward(self, X: torch.Tensor): - # TODO find a proper way to pass hx to lstm - pass + def forward(self, X: torch.Tensor, hx: Optional[Tuple[torch.Tensor]]=None): + +""" + class ForecastingNetworkComponent(NetworkComponent): @@ -69,6 +69,7 @@ def predict(self, loader: torch.utils.data.DataLoader, X = X.to(self.device) with torch.no_grad(): + Y_batch_pred = self.network(X).mean if loc is not None or scale is not None: if loc is None: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/InceptionTimeBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/InceptionTimeBackbone.py similarity index 95% rename from autoPyTorch/pipeline/components/setup/network_backbone/InceptionTimeBackbone.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/InceptionTimeBackbone.py index 869f808ed..38b6b9594 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/InceptionTimeBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/InceptionTimeBackbone.py @@ -9,7 +9,8 @@ from torch import nn from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import NetworkBackboneComponent +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone\ + import BaseForecastingNetworkBackbone from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter @@ -122,10 +123,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -class InceptionTimeBackbone(NetworkBackboneComponent): 
+class InceptionTimeBackbone(BaseForecastingNetworkBackbone): """ InceptionTime backbone for time series data (see https://arxiv.org/pdf/1909.04939.pdf). """ + @property + def backbone_properties(self): + # TODO consider property for the network + backbone_properties = {} + return backbone_properties def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: backbone = _InceptionTime(in_features=input_shape[-1], diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/LSTMBackbone.py similarity index 92% rename from autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/LSTMBackbone.py index 637555283..03a49d3bc 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/LSTMBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/LSTMBackbone.py @@ -12,8 +12,8 @@ from torch import nn from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import \ - NetworkBackboneComponent +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone\ + import BaseForecastingNetworkBackbone from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter @@ -55,7 +55,7 @@ def forward(self, x: torch.Tensor, return outputs, hidden_state -class LSTMBackbone(NetworkBackboneComponent): +class LSTMBackbone(BaseForecastingNetworkBackbone): """ Standard searchable LSTM backbone for time series data """ @@ -70,6 +70,13 @@ def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: self.backbone = backbone return backbone + @property + def backbone_properties(self): + backbone_properties = {'network_output_tuple': True, + 'accept_additional_input': True, + 'hidden_states': True} + return backbone_properties + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: X['network_output_tuple'] = True return super().fit(X, y) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/TCNBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TCNBackbone.py similarity index 95% rename from autoPyTorch/pipeline/components/setup/network_backbone/TCNBackbone.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TCNBackbone.py index be9683ffe..a667296e2 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/TCNBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TCNBackbone.py @@ -13,8 +13,8 @@ from torch.nn.utils import weight_norm from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import \ - NetworkBackboneComponent +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone\ + import BaseForecastingNetworkBackbone from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter @@ -97,11 +97,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -class TCNBackbone(NetworkBackboneComponent): +class TCNBackbone(BaseForecastingNetworkBackbone): """ Temporal Convolutional 
Network backbone for time series data (see https://arxiv.org/pdf/1803.01271.pdf). """ - _fixed_seq_length = False + + @property + def backbone_properties(self): + # TODO + backbone_properties = {} + return backbone_properties + def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: num_channels = [self.config["num_filters_0"]] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/TimeSeriesMLPBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TimeSeriesMLPBackbone.py similarity index 83% rename from autoPyTorch/pipeline/components/setup/network_backbone/TimeSeriesMLPBackbone.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TimeSeriesMLPBackbone.py index f55e5a184..00e2880ec 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/TimeSeriesMLPBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TimeSeriesMLPBackbone.py @@ -3,6 +3,8 @@ from typing import Tuple from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone \ + import BaseForecastingNetworkBackbone import torch from torch import nn @@ -25,10 +27,16 @@ def forward(self, x: torch.Tensor): return self.module_layers(x) -class TimeSeriesMLPBackbone(MLPBackbone): +class TimeSeriesMLPBackbone(BaseForecastingNetworkBackbone, MLPBackbone): _fixed_seq_length = True window_size = 1 + @property + def backbone_properties(self): + # TODO + backbone_properties = {} + return backbone_properties + @property def _required_fit_arguments(self) -> List[FitRequirement]: requirements_list = super()._required_fit_arguments diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/__init__.py new file mode 100644 index 000000000..fd65a18e0 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/__init__.py @@ -0,0 +1,47 @@ +import os +from collections import OrderedDict +from typing import Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone\ + import ( + BaseForecastingNetworkBackbone, +) + +directory = os.path.split(__file__)[0] +_backbones = find_components(__package__, + directory, + BaseForecastingNetworkBackbone) +_addons = ThirdPartyComponents(BaseForecastingNetworkBackbone) + + +def add_backbone(backbone: BaseForecastingNetworkBackbone) -> None: + _addons.add_component(backbone) + + +class NetworkBackboneChoice(autoPyTorchChoice): + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available backbone components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all basebackbone components available + as choices for learning rate scheduling + """ + components = OrderedDict() + components.update(_backbones) + 
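# --- Editor's sketch (illustration only, not part of the patch above) ---
# find_components() collects every BaseForecastingNetworkBackbone subclass in
# this package, and add_backbone() lets third-party code register further ones
# through ThirdPartyComponents. A hedged sketch of such a registration; the
# MeanPoolBackbone below is a hypothetical example, not shipped with autoPyTorch:
from torch import nn

class MeanPoolBackbone(BaseForecastingNetworkBackbone):
    """Toy backbone that simply averages the sequence over the time axis."""

    @property
    def backbone_properties(self):
        return {}

    def build_backbone(self, input_shape):
        class _MeanPool(nn.Module):
            def forward(self, x):
                return x.mean(dim=1)
        return _MeanPool()

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'MeanPoolBackbone', 'name': 'MeanPoolBackbone',
                'handles_tabular': False, 'handles_image': False,
                'handles_time_series': True}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        return ConfigurationSpace()

add_backbone(MeanPoolBackbone)
# --- end of editor's sketch ---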
components.update(_addons.components) + return components \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/base_forecasting_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/base_forecasting_backbone.py new file mode 100644 index 000000000..9eacbb453 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/base_forecasting_backbone.py @@ -0,0 +1,20 @@ +from abc import abstractmethod +from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import NetworkBackboneComponent + +from abc import abstractmethod +from typing import Any, Dict, Iterable, Optional, Tuple, List + + + +from autoPyTorch.pipeline.components.base_component import BaseEstimator + + +class BaseForecastingNetworkBackbone(NetworkBackboneComponent): + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + X['backbone_properties'] = self.backbone_properties + return super().fit(X, y) + + @property + @abstractmethod + def backbone_properties(self): + raise NotImplementedError diff --git a/autoPyTorch/pipeline/components/setup/network_head/__init__.py b/autoPyTorch/pipeline/components/setup/network_head/__init__.py index 1e00d02d1..2a376f092 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_head/__init__.py @@ -25,13 +25,6 @@ NetworkHeadComponent) _addons = ThirdPartyComponents(NetworkHeadComponent) -# avoid path pollution, (otherwise FC layer will not be correctly detected) -from autoPyTorch.pipeline.components.setup.network_head.distributed_network_head import _distributed_heads, \ - _distributed_addons - -from autoPyTorch.pipeline.components.setup.network_head.distributed_network_head.distributed_network_head import \ - DistributionNetworkHeadComponents - def add_head(head: NetworkHeadComponent) -> None: _addons.add_component(head) @@ -53,9 +46,6 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: components.update(_heads) components.update(_addons.components) - components.update(_distributed_heads) - components.update(_distributed_addons.components) - return components def get_available_components( @@ -211,7 +201,6 @@ def get_hyperparameter_search_space( parent_hyperparameter=parent_hyperparameter ) - self.configuration_space_ = cs self.dataset_properties_ = dataset_properties return cs diff --git a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/__init__.py b/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/__init__.py deleted file mode 100644 index 4b83096d7..000000000 --- a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -import os - -from autoPyTorch.pipeline.components.setup.network_head.distributed_network_head.distributed_network_head import ( - DistributionNetworkHeadComponents, -) - -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - find_components, -) - -directory = os.path.split(__file__)[0] -_distributed_heads = find_components(__package__, - directory, - DistributionNetworkHeadComponents) - -_distributed_addons = ThirdPartyComponents(DistributionNetworkHeadComponents) diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py new 
file mode 100644 index 000000000..0597f9652 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py @@ -0,0 +1,54 @@ +import os +from collections import OrderedDict +from typing import Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.setup.network_head.base_network_head import ( + NetworkHeadComponent, +) + +from autoPyTorch.pipeline.components.setup.network_head import NetworkHeadChoice +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distributed_network_head import ( + DistributionNetworkHeadComponents, +) + +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + find_components, +) + +directory = os.path.split(__file__)[0] +_distributed_heads = find_components(__package__, + directory, + DistributionNetworkHeadComponents) + +_distributed_addons = ThirdPartyComponents(DistributionNetworkHeadComponents) + + +class ForecastingNetworkHeadChoice(NetworkHeadChoice): + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available head components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all NetworkHeadComponents available + as choices for learning rate scheduling + """ + components = OrderedDict() + components.update(_distributed_heads) + components.update(_distributed_addons.components) + + return components diff --git a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_fully_connected.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_fully_connected.py similarity index 94% rename from autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_fully_connected.py rename to autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_fully_connected.py index 675eb18f0..fb358310c 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_fully_connected.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_fully_connected.py @@ -9,8 +9,8 @@ from autoPyTorch.pipeline.components.setup.network_head.utils import _activations from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter -from autoPyTorch.pipeline.components.setup.network_head.distributed_network_head.distribution import ALL_DISTRIBUTIONS -from autoPyTorch.pipeline.components.setup.network_head.distributed_network_head.distributed_network_head import \ +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distributed_network_head import \ DistributionNetworkHeadComponents from autoPyTorch.pipeline.components.setup.network_head.fully_connected import FullyConnectedHead diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_lstm_head.py 
b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_lstm_head.py new file mode 100644 index 000000000..6ff3200d1 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_lstm_head.py @@ -0,0 +1,131 @@ +from typing import Any, Dict, Optional, Tuple + +import ConfigSpace as CS +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter +) + +import torch +from torch import nn + +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import \ + NetworkBackboneComponent +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + + +class _LSTM_Decoder(nn.Module): + def __init__(self, + in_features: int, + config: Dict[str, Any]): + super().__init__() + self.config = config + self.only_return_final_stage = True + self.lstm = nn.LSTM(input_size=in_features, + hidden_size=config["hidden_size"], + num_layers=config["num_layers"], + dropout=config.get("dropout", 0.0), + bidirectional=config["bidirectional"], + batch_first=True) + + def forward(self, x: torch.Tensor, + hx: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> Tuple[torch.Tensor, ...]: + B, T, _ = x.shape + + outputs, hidden_state, = self.lstm(x, hx) + + if self.only_return_final_stage: + if not self.config["bidirectional"]: + return outputs[:, -1, :], + else: + # concatenate last forward hidden state with first backward hidden state + outputs_by_direction = outputs.view(B, + T, + 2, + self.config["hidden_size"]) + out = torch.cat([ + outputs_by_direction[:, -1, 0, :], + outputs_by_direction[:, 0, 1, :] + ], dim=1) + return out, + else: + return outputs, hidden_state + + +class LSTMBackbone(NetworkBackboneComponent): + """ + Standard searchable LSTM decoder for time series data, similar to Seq2Seq + """ + _fixed_seq_length = False + + def __init__(self, **kwargs: Dict): + super().__init__(**kwargs) + + def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: + backbone = _LSTM(in_features=input_shape[-1], + config=self.config) + self.backbone = backbone + return backbone + + @property + def network_properities(self): + network_properities = {'network_output_tuple': True, + 'accept_additional_input': True} + return network_properities + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + X['network_output_tuple'] = True + return super().fit(X, y) + + @property + def only_return_final_stage(self): + return self.backbone.only_return_final_stage + + @only_return_final_stage.setter + def only_return_final_stage(self, only_return_final_stage): + self.backbone.only_return_final_stage = only_return_final_stage + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + return { + 'shortname': 'LSTMBackbone', + 'name': 'LSTMBackbone', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict] = None, + num_layers: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='num_layers', + value_range=(1, 3), + default_value=1), + use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='use_dropout', + value_range=(True, False), + 
default_value=False), + dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='dropout', + value_range=(0., 0.5), + default_value=0.2), + bidirectional: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bidirectional', + value_range=(True, False), + default_value=True) + ) -> ConfigurationSpace: + cs = CS.ConfigurationSpace() + + num_layers = get_hyperparameter(num_layers, UniformIntegerHyperparameter) + use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) + dropout = get_hyperparameter(dropout, UniformFloatHyperparameter) + cs.add_hyperparameters([num_layers, use_dropout, dropout]) + + # Add plain hyperparameters + # Hidden size is given by the encoder architecture + add_hyperparameter(cs, bidirectional, CategoricalHyperparameter) + + cs.add_condition(CS.AndConjunction(CS.EqualsCondition(dropout, use_dropout, True), + CS.GreaterThanCondition(dropout, num_layers, 1))) + + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_network_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_network_head.py similarity index 97% rename from autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_network_head.py rename to autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_network_head.py index 6b2cafc95..43ae58dd7 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distributed_network_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_network_head.py @@ -10,7 +10,7 @@ from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.pipeline.components.setup.network_head.distributed_network_head.distribution import ALL_DISTRIBUTIONS +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS class DistributionNetworkHeadComponents(NetworkHeadComponent): diff --git a/autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py similarity index 100% rename from autoPyTorch/pipeline/components/setup/network_head/distributed_network_head/distribution.py rename to autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index 6cf5a9489..7780c81f5 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -69,9 +69,16 @@ def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> f Returns: mase_coefficient: inverse of mase_denominator """ - mase_denominator = forecasting_metrics.mean_absolute_error(past_target[sp:], - past_target[:-sp], - multioutput="raw_values") + if sp > len(past_target): + # in this case, we simply consider the mean value of the entire sequence + # TODO condsider if there is a better way of handling this + mase_denominator = forecasting_metrics.mean_absolute_error(past_target, + np.zeros_like(past_target), + multioutput="raw_values") + else: + mase_denominator = forecasting_metrics.mean_absolute_error(past_target[sp:], + 
past_target[:-sp], + multioutput="raw_values") return 1.0 / np.maximum(mase_denominator, forecasting_metrics._functions.EPS) diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 655fc525e..8ffc5adf5 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -27,8 +27,9 @@ from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent from autoPyTorch.pipeline.components.setup.network_embedding import NetworkEmbeddingChoice -from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_head import NetworkHeadChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone import \ + BaseForecastingNetworkBackbone +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head import ForecastingNetworkHeadChoice from autoPyTorch.pipeline.components.setup.network_initializer import ( NetworkInitializerChoice ) @@ -220,9 +221,9 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L random_state=self.random_state)), ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties, random_state=self.random_state)), - ("network_backbone", NetworkBackboneChoice(default_dataset_properties, + ("network_backbone", BaseForecastingNetworkBackbone(default_dataset_properties, random_state=self.random_state)), - ("network_head", NetworkHeadChoice(default_dataset_properties, + ("network_head", ForecastingNetworkHeadChoice(default_dataset_properties, random_state=self.random_state)), ("network", ForecastingNetworkComponent(random_state=self.random_state)), ("network_init", NetworkInitializerChoice(default_dataset_properties, From e08cfd36d40280c3264c7cf618c0b286c52ed587 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sat, 11 Dec 2021 00:49:07 +0100 Subject: [PATCH 080/347] training loss as hps, more architectures --- autoPyTorch/constants.py | 2 +- autoPyTorch/datasets/time_series_dataset.py | 14 +- .../forecasting_training_losses/__init__.py | 199 ++++++++++++++++++ .../base_forecasting_losses.py | 28 +++ .../distribution_losses.py | 33 +++ .../regression_losses.py | 55 +++++ .../setup/network_backbone/__init__.py | 12 +- .../InceptionTimeBackbone.py | 2 +- .../{LSTMBackbone.py => RNNBackbone.py} | 64 +++--- .../TCNBackbone.py | 2 +- .../TimeSeriesMLPBackbone.py | 23 +- .../forecasting_network_backbone/__init__.py | 9 +- .../base_forecasting_backbone.py | 12 +- .../components/setup/network_head/__init__.py | 34 +-- ....py => ForecastingFullyConnectedHeader.py} | 76 ++++--- .../ForecastingRNNHeader.py | 123 +++++++++++ .../forecasting_network_head/__init__.py | 160 +++++++++++++- .../distributed_lstm_head.py | 131 ------------ .../distributed_network_head.py | 113 ---------- .../forecasting_head.py | 168 +++++++++++++++ .../time_series_forecasting_data_loader.py | 8 +- .../pipeline/components/training/losses.py | 1 + .../trainer/forecasting_trainer/__init__.py | 3 +- .../pipeline/time_series_forecasting.py | 78 +++++-- 24 files changed, 965 insertions(+), 385 deletions(-) create mode 100644 autoPyTorch/pipeline/components/setup/forecasting_training_losses/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/forecasting_training_losses/base_forecasting_losses.py create mode 100644 
autoPyTorch/pipeline/components/setup/forecasting_training_losses/distribution_losses.py create mode 100644 autoPyTorch/pipeline/components/setup/forecasting_training_losses/regression_losses.py rename autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/{LSTMBackbone.py => RNNBackbone.py} (69%) rename autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/{distributed_fully_connected.py => ForecastingFullyConnectedHeader.py} (56%) create mode 100644 autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingRNNHeader.py delete mode 100644 autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_lstm_head.py delete mode 100644 autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_network_head.py create mode 100644 autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py index 92562be61..318203421 100644 --- a/autoPyTorch/constants.py +++ b/autoPyTorch/constants.py @@ -8,7 +8,7 @@ REGRESSION_TASKS = [TABULAR_REGRESSION, IMAGE_REGRESSION, TIMESERIES_REGRESSION] CLASSIFICATION_TASKS = [TABULAR_CLASSIFICATION, IMAGE_CLASSIFICATION, TIMESERIES_CLASSIFICATION] -FORECASTING_TASKS = [TIMESERIES_FORECASTING] +FORECASTING_TASKS = [TIMESERIES_FORECASTING] # TODO extend FORECASTING TASKS to Classification and regression tasks TABULAR_TASKS = [TABULAR_CLASSIFICATION, TABULAR_REGRESSION] IMAGE_TASKS = [IMAGE_CLASSIFICATION, IMAGE_REGRESSION] diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index cbe940abf..7751b479f 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -77,6 +77,8 @@ def __init__(self, self.val_transform = val_transforms self.sp = sp + self.mase_coefficient = compute_mase_coefficient(self.X, sp=self.sp) + def __getitem__(self, index: int, train: bool = True) \ -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, torch.Tensor]]]: """ @@ -98,9 +100,6 @@ def __getitem__(self, index: int, train: bool = True) \ else: X = self.X[:index + 1] - if not train: - mase_coefficient = compute_mase_coefficient(X, sp=self.sp) - if self.train_transform is not None and train: X = self.train_transform(X) elif self.val_transform is not None and not train: @@ -122,7 +121,7 @@ def __getitem__(self, index: int, train: bool = True) \ return {"past_target": torch.from_numpy(X)}, Y_future else: return {"past_target": torch.from_numpy(X), - "mase_coefficient": mase_coefficient}, Y_future + "mase_coefficient": self.mase_coefficient}, Y_future def __len__(self) -> int: return self.X.shape[0] @@ -175,7 +174,6 @@ def __init__(self, dataset_name: Optional[str] = None, shift_input_data: bool = True, normalize_y: bool = True, - train_with_log_prob: bool = True, ): """ :param target_variables: Optional[Union[Tuple[int], int]] used for multi-variant forecasting @@ -189,7 +187,6 @@ def __init__(self, such that the data until X[t] is applied to predict the value y[t+n_prediction_steps] :param normalize_y: bool if y values needs to be normalized with mean 0 and variance 1 - :param train_with_log_prob: bool if the dataset is trained with log_prob losses, this needs to be specified in the very beginning such that the header's configspace can be built beforehand. 
""" @@ -334,10 +331,6 @@ def __init__(self, self.splits = self.get_splits_from_resampling_strategy() - # TODO in the future, if training losses types are considered as a type of hyperparameters, we need to remove - # this line and create conditional configspace under - # autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice . - self.train_with_log_prob = train_with_log_prob def __getitem__(self, idx, train=True): if idx < 0: @@ -563,7 +556,6 @@ def get_required_dataset_info(self) -> Dict[str, Any]: 'numerical_columns': self.numerical_columns, 'categorical_columns': self.categorical_columns, 'categories': self.categories, - 'train_with_log_prob': self.train_with_log_prob, 'target_columns': self.target_columns }) return info diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_losses/__init__.py b/autoPyTorch/pipeline/components/setup/forecasting_training_losses/__init__.py new file mode 100644 index 000000000..e348ade3e --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_losses/__init__.py @@ -0,0 +1,199 @@ +import os +from typing import Dict, List, Optional +from collections import OrderedDict + +from ConfigSpace.configuration_space import ConfigurationSpace +import ConfigSpace.hyperparameters as CSH +import numpy as np + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) + +from autoPyTorch.constants import REGRESSION_TASKS, CLASSIFICATION_TASKS, FORECASTING_TASKS, STRING_TO_TASK_TYPES + +from autoPyTorch.pipeline.components.setup.forecasting_training_losses.base_forecasting_losses import\ + ForecastingLossComponents + +directory = os.path.split(__file__)[0] +_optimizers = find_components(__package__, + directory, + ForecastingLossComponents) +_addons = ThirdPartyComponents(ForecastingLossComponents) + + +class ForecastingLossChoices(autoPyTorchChoice): + """This class select the training loss + training loss can be one of the following choice: distriubtion (log_prob), regression and quantile (TODO) + each losses corresponds to a network output head: + DistributionHead (log_prob) + RegressionHead + + """ + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available optimizer components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all BaseOptimizerComponents available + as choices + """ + components = OrderedDict() + components.update(_optimizers) + components.update(_addons.components) + return components + + def get_available_components( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + include: List[str] = None, + exclude: List[str] = None, + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): Caracteristics + of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of learning + rate heads + + """ + if dataset_properties is None: + 
dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + available_comp = self.get_components() + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == ForecastingLossChoices or hasattr(entry, 'get_components'): + continue + + task_type = str(dataset_properties['task_type']) + properties = entry.get_properties() + if 'tabular' in task_type and not bool(properties['handles_tabular']): + continue + elif 'image' in task_type and not bool(properties['handles_image']): + continue + elif 'time_series' in task_type and not bool(properties['handles_time_series']): + continue + + task_type = STRING_TO_TASK_TYPES[task_type] + + if task_type in CLASSIFICATION_TASKS and not bool(properties['handles_classification']): + continue + elif task_type in [*REGRESSION_TASKS, *FORECASTING_TASKS] and not bool(properties['handles_regression']): + continue + + components_dict[name] = entry + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on + default (Optional[str]): Default component to use + include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive + list, and will exclusively use this components. 
+ exclude: Optional[Dict[str, Any]]: which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal preprocessors for this problem + available_losses = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + if len(available_losses) == 0: + raise ValueError("No Loss found") + + if default is None: + defaults = [ + 'DistributionLoss', + 'RegressionLoss', + ] + for default_ in defaults: + if default_ in available_losses: + default = default_ + break + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_losses): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_losses, + choice_hyperparameter.value_range)) + optimizer = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + optimizer = CSH.CategoricalHyperparameter( + '__choice__', + list(available_losses.keys()), + default_value=default + ) + cs.add_hyperparameter(optimizer) + for name in optimizer.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_losses[name].get_hyperparameter_search_space(dataset_properties, # type: ignore + **updates) + parent_hyperparameter = {'parent': optimizer, 'value': name} + cs.add_configuration_space( + name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def transform(self, X: np.ndarray) -> np.ndarray: + assert self.choice is not None, "Cannot call transform before the object is initialized" + return self.choice.transform(X) + diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_losses/base_forecasting_losses.py b/autoPyTorch/pipeline/components/setup/forecasting_training_losses/base_forecasting_losses.py new file mode 100644 index 000000000..2b7040935 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_losses/base_forecasting_losses.py @@ -0,0 +1,28 @@ +from typing import Dict, Any + +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent + +from autoPyTorch.utils.common import FitRequirement + + +class ForecastingLossComponents(autoPyTorchComponent): + _required_properties = ["name", "handles_tabular", "handles_image", "handles_time_series", + 'handles_regression', 'handles_classification'] + loss = None + required_net_out_put_type = None + + def __init__(self, + **kwargs: Any): + super().__init__() + self.add_fit_requirements([ + FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), + ]) + + def fit(self, X: Dict[str, Any], y: Any = None) -> "autoPyTorchComponent": + self.check_requirements(X, y) + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({"loss": self.loss}) + X.update({'required_net_out_put_type': self.required_net_out_put_type}) + return X diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_losses/distribution_losses.py b/autoPyTorch/pipeline/components/setup/forecasting_training_losses/distribution_losses.py new file mode 100644 index 
000000000..a6c39a488 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_losses/distribution_losses.py @@ -0,0 +1,33 @@ +from typing import Optional, Dict, Union + +from ConfigSpace import ConfigurationSpace + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.forecasting_training_losses.base_forecasting_losses import \ + ForecastingLossComponents +from autoPyTorch.pipeline.components.training.losses import LogProbLoss + + +class DistributionLoss(ForecastingLossComponents): + loss = LogProbLoss + required_net_out_put_type = 'distribution' + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'DistributionLoss', + 'name': 'DistributionLoss', + "handles_tabular": False, + "handles_image": False, + "handles_time_series": True, + 'handles_regression': True, + 'handles_classification': False + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + return cs diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_losses/regression_losses.py b/autoPyTorch/pipeline/components/setup/forecasting_training_losses/regression_losses.py new file mode 100644 index 000000000..8f3e0bb75 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_losses/regression_losses.py @@ -0,0 +1,55 @@ +from typing import Optional, Dict, Union + +import numpy as np +from ConfigSpace import ConfigurationSpace, CategoricalHyperparameter + +from autoPyTorch.utils.common import ( + HyperparameterSearchSpace, + add_hyperparameter +) + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.forecasting_training_losses.base_forecasting_losses import \ + ForecastingLossComponents +from autoPyTorch.pipeline.components.training.losses import L1Loss, MSELoss + + +class RegressionLosses(ForecastingLossComponents): + required_net_out_put_type = 'regression' + + def __init__(self, + loss_name: str, + random_state: Optional[np.random.RandomState] = None, + ): + super(RegressionLosses).__init__() + if loss_name == "l1": + self.loss = L1Loss + elif loss_name == 'mse': + self.loss = MSELoss + else: + raise ValueError(f"Unsupported loss type {loss_name}!") + self.random_state = random_state + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'RegressionLoss', + 'name': 'RegressionLoss', + "handles_tabular": True, + "handles_image": True, + "handles_time_series": True, + 'handles_regression': True, + 'handles_classification': False + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + loss_name: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="loss_name", + value_range=('l1', 'mse'), + default_value='mse'), + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + add_hyperparameter(cs, loss_name, CategoricalHyperparameter) + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py index 8d5339389..87c1ccf70 100644 --- 
a/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py @@ -149,12 +149,7 @@ def get_hyperparameter_search_space( raise ValueError("No backbone found") if default is None: - defaults = [ - 'ShapedMLPBackbone', - 'MLPBackbone', - 'ConvNetImageBackbone', - 'InceptionTimeBackbone', - ] + defaults = self._defaults_network for default_ in defaults: if default_ in available_backbones: default = default_ @@ -192,6 +187,11 @@ def get_hyperparameter_search_space( self.dataset_properties_ = dataset_properties return cs + @property + def _defaults_network(self): + return ['ShapedMLPBackbone', + 'MLPBackbone'] + def transform(self, X: np.ndarray) -> np.ndarray: assert self.choice is not None, "Cannot call transform before the object is initialized" return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/InceptionTimeBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/InceptionTimeBackbone.py index 38b6b9594..3e780d3fc 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/InceptionTimeBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/InceptionTimeBackbone.py @@ -128,7 +128,7 @@ class InceptionTimeBackbone(BaseForecastingNetworkBackbone): InceptionTime backbone for time series data (see https://arxiv.org/pdf/1909.04939.pdf). """ @property - def backbone_properties(self): + def encoder_properties(self): # TODO consider property for the network backbone_properties = {} return backbone_properties diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/LSTMBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/RNNBackbone.py similarity index 69% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/LSTMBackbone.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/RNNBackbone.py index 03a49d3bc..c68c941fc 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/LSTMBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/RNNBackbone.py @@ -12,24 +12,30 @@ from torch import nn from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone\ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone \ import BaseForecastingNetworkBackbone from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter -class _LSTM(nn.Module): +class _RNN(nn.Module): + # we only consder GRU and LSTM here def __init__(self, in_features: int, config: Dict[str, Any]): super().__init__() self.config = config self.only_return_final_stage = True - self.lstm = nn.LSTM(input_size=in_features, - hidden_size=config["hidden_size"], - num_layers=config["num_layers"], - dropout=config.get("dropout", 0.0), - bidirectional=config["bidirectional"], - batch_first=True) + if config['cell_type'] == 'lstm': + cell_type = nn.LSTM + else: + cell_type = nn.GRU + + self.lstm = cell_type(input_size=in_features, + hidden_size=config["hidden_size"], + num_layers=config["num_layers"], + 
dropout=config.get("dropout", 0.0), + bidirectional=config["bidirectional"], + batch_first=True) def forward(self, x: torch.Tensor, hx: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> Tuple[torch.Tensor, ...]: @@ -39,7 +45,7 @@ def forward(self, x: torch.Tensor, if self.only_return_final_stage: if not self.config["bidirectional"]: - return outputs[:, -1, :], + return outputs[:, [-1], :], hidden_state else: # concatenate last forward hidden state with first backward hidden state outputs_by_direction = outputs.view(B, @@ -47,15 +53,15 @@ def forward(self, x: torch.Tensor, 2, self.config["hidden_size"]) out = torch.cat([ - outputs_by_direction[:, -1, 0, :], - outputs_by_direction[:, 0, 1, :] - ], dim=1) - return out, + outputs_by_direction[:, [-1], 0, :], + outputs_by_direction[:, [0], 1, :] + ], dim=-1) + return out, hidden_state else: return outputs, hidden_state -class LSTMBackbone(BaseForecastingNetworkBackbone): +class RNNBackbone(BaseForecastingNetworkBackbone): """ Standard searchable LSTM backbone for time series data """ @@ -65,20 +71,25 @@ def __init__(self, **kwargs: Dict): super().__init__(**kwargs) def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: - backbone = _LSTM(in_features=input_shape[-1], - config=self.config) + backbone = _RNN(in_features=input_shape[-1], + config=self.config) self.backbone = backbone return backbone @property - def backbone_properties(self): - backbone_properties = {'network_output_tuple': True, - 'accept_additional_input': True, - 'hidden_states': True} - return backbone_properties + def encoder_properties(self): + encoder_properties = {'network_output_tuple': True, + 'accept_additional_input': True, + 'hidden_states': True} + arch_kwargs = {'hidden_size': self.config['hidden_size'], + 'num_layers': self.config['num_layers'], + 'bidirectional': self.config['bidirectional'], + 'cell_type': self.config['cell_type']} # used for initialize + encoder_properties.update({"arch_kwargs": arch_kwargs}) + return encoder_properties def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - X['network_output_tuple'] = True + # the setting are utilized to build decoder return super().fit(X, y) @property @@ -92,8 +103,8 @@ def only_return_final_stage(self, only_return_final_stage): @staticmethod def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]: return { - 'shortname': 'LSTMBackbone', - 'name': 'LSTMBackbone', + 'shortname': 'RNNBackbone', + 'name': 'RNNBackbone', 'handles_tabular': False, 'handles_image': False, 'handles_time_series': True, @@ -102,6 +113,9 @@ def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[ @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict] = None, + cell_type: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="cell_type", + value_range=['lstm', 'gru'], + default_value='lstm'), num_layers: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='num_layers', value_range=(1, 3), default_value=1), @@ -121,12 +135,14 @@ def get_hyperparameter_search_space( cs = CS.ConfigurationSpace() # TODO consider lstm layers with different hidden size + # TODO bidirectional needs to be set as false for DeepAR model num_layers = get_hyperparameter(num_layers, UniformIntegerHyperparameter) use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) dropout = get_hyperparameter(dropout, UniformFloatHyperparameter) cs.add_hyperparameters([num_layers, use_dropout, dropout]) # Add plain 
hyperparameters + add_hyperparameter(cs, cell_type, CategoricalHyperparameter) add_hyperparameter(cs, hidden_size, UniformIntegerHyperparameter) add_hyperparameter(cs, bidirectional, CategoricalHyperparameter) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TCNBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TCNBackbone.py index a667296e2..4d17cb8ca 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TCNBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TCNBackbone.py @@ -103,7 +103,7 @@ class TCNBackbone(BaseForecastingNetworkBackbone): """ @property - def backbone_properties(self): + def encoder_properties(self): # TODO backbone_properties = {} return backbone_properties diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TimeSeriesMLPBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TimeSeriesMLPBackbone.py index 00e2880ec..b412a9622 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TimeSeriesMLPBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TimeSeriesMLPBackbone.py @@ -17,12 +17,25 @@ class _TimeSeriesMLP(nn.Module): def __init__(self, + window_size: int, module_layers: nn.Module, ): super().__init__() + self.window_size = window_size self.module_layers = module_layers def forward(self, x: torch.Tensor): + """ + + Args: + x: torch.Tensor(batch_size, window_size, num_features) + + Returns: + + """ + if x.shape[1] > self.window_size: + # we need to ensure that the input size fits the + x = x[:, -self.window_size:] x = x.view(x.shape[0], -1) return self.module_layers(x) @@ -32,9 +45,10 @@ class TimeSeriesMLPBackbone(BaseForecastingNetworkBackbone, MLPBackbone): window_size = 1 @property - def backbone_properties(self): - # TODO - backbone_properties = {} + def encoder_properties(self): + backbone_properties = { + 'fixed_input_shape': True, # the network has a fixed input shape, this is used to indicate output shape + } return backbone_properties @property @@ -45,12 +59,11 @@ def _required_fit_arguments(self) -> List[FitRequirement]: def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.window_size = X["window_size"] - X['MLP_backbone'] = True return super().fit(X, y) def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: in_features = input_shape[-1] * self.window_size - return _TimeSeriesMLP(self._build_backbone(in_features)) + return _TimeSeriesMLP(self.window_size, self._build_backbone(in_features)) @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/__init__.py index fd65a18e0..8406ebd5f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/__init__.py @@ -14,6 +14,7 @@ autoPyTorchComponent, find_components, ) +from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice from 
autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone\ import ( BaseForecastingNetworkBackbone, @@ -30,7 +31,7 @@ def add_backbone(backbone: BaseForecastingNetworkBackbone) -> None: _addons.add_component(backbone) -class NetworkBackboneChoice(autoPyTorchChoice): +class ForecastingNetworkBackboneChoice(NetworkBackboneChoice): def get_components(self) -> Dict[str, autoPyTorchComponent]: """Returns the available backbone components @@ -44,4 +45,8 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: components = OrderedDict() components.update(_backbones) components.update(_addons.components) - return components \ No newline at end of file + return components + + @property + def _defaults_network(self): + return ['RNNBackbone', 'TSMLPBackbone'] \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/base_forecasting_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/base_forecasting_backbone.py index 9eacbb453..920fe7946 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/base_forecasting_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/base_forecasting_backbone.py @@ -5,16 +5,22 @@ from typing import Any, Dict, Iterable, Optional, Tuple, List - from autoPyTorch.pipeline.components.base_component import BaseEstimator class BaseForecastingNetworkBackbone(NetworkBackboneComponent): + """ + Base forecasting network, its output needs to be a 3-d Tensor: + """ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - X['backbone_properties'] = self.backbone_properties return super().fit(X, y) + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X = super().transform(X) + X.update({'encoder_properties': self.encoder_properties}) + return X + @property @abstractmethod - def backbone_properties(self): + def encoder_properties(self): raise NotImplementedError diff --git a/autoPyTorch/pipeline/components/setup/network_head/__init__.py b/autoPyTorch/pipeline/components/setup/network_head/__init__.py index 2a376f092..84ca63b87 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_head/__init__.py @@ -106,10 +106,6 @@ def get_available_components( continue elif 'time_series' in task_type and not bool(properties['handles_time_series']): continue - train_with_log_prob = dataset_properties.get("train_with_log_prob", False) - if train_with_log_prob: - if not issubclass(entry, DistributionNetworkHeadComponents): - continue # target_type = dataset_properties['target_type'] # Apply some automatic filtering here for @@ -152,26 +148,16 @@ def get_hyperparameter_search_space( if len(available_heads) == 0: raise ValueError("No head found") - train_with_log_prob = dataset_properties.get("train_with_log_prob", False) - if train_with_log_prob: - if default is None: - defaults = [ - 'DistributionFullyConnectedHead', - ] - for default_ in defaults: - if default_ in available_heads: - default = default_ - break - else: - if default is None: - defaults = [ - 'FullyConnectedHead', - 'FullyConvolutional2DHead', - ] - for default_ in defaults: - if default_ in available_heads: - default = default_ - break + + if default is None: + defaults = [ + 'FullyConnectedHead', + 'FullyConvolutional2DHead', + ] + for default_ in defaults: + if default_ in 
available_heads: + default = default_ + break updates = self._get_search_space_updates() if '__choice__' in updates.keys(): diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_fully_connected.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingFullyConnectedHeader.py similarity index 56% rename from autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_fully_connected.py rename to autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingFullyConnectedHeader.py index fb358310c..24bd5eebe 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_fully_connected.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingFullyConnectedHeader.py @@ -1,3 +1,4 @@ +from abc import ABC from typing import Dict, Optional, Tuple, Union, List from torch import nn @@ -9,25 +10,19 @@ from autoPyTorch.pipeline.components.setup.network_head.utils import _activations from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distributed_network_head import \ - DistributionNetworkHeadComponents +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import \ + ForecastingHead from autoPyTorch.pipeline.components.setup.network_head.fully_connected import FullyConnectedHead +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS -class DistributionFullyConnectedHead(DistributionNetworkHeadComponents, FullyConnectedHead): - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'DistributionFullyConnectedHead', - 'name': 'DistributionFullyConnectedHead', - 'handles_tabular': False, - 'handles_image': False, - 'handles_time_series': True, - } +class ForecastingFullyConnectedHeader(ForecastingHead, FullyConnectedHead): + @property + def decoder_properties(self): + decoder_properties = {} + return decoder_properties - def _build_head(self, input_shape: Tuple[int, ...]) -> Tuple[List[nn.Module], int]: + def _build_head(self, input_shape: Tuple[int, ...], **arch_kwargs) -> Tuple[List[nn.Module], int]: layers = [] in_features = input_shape[-1] for i in range(1, self.config["num_layers"]): @@ -39,31 +34,46 @@ def _build_head(self, input_shape: Tuple[int, ...]) -> Tuple[List[nn.Module], in return layers, head_base_output_features + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'ForecastingFullyConnectedHead', + 'name': 'ForecastingFullyConnectedHead', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + @staticmethod def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - num_layers: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_layers", - value_range=(1, 4), - default_value=2), - units_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="units_layer", - value_range=(64, 512), - 
default_value=128), - activation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="activation", - value_range=tuple(_activations.keys()), - default_value=list(_activations.keys())[0]), - dist_cls: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dist_cls", - value_range=tuple(ALL_DISTRIBUTIONS.keys()), - default_value=list(ALL_DISTRIBUTIONS.keys())[0]), - #auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="auto_regressive", - # value_range=(True, False), - # default_value=False) + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + num_layers: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_layers", + value_range=(1, 4), + default_value=2), + units_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="units_layer", + value_range=(64, 512), + default_value=128), + activation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="activation", + value_range=tuple(_activations.keys()), + default_value=list(_activations.keys())[ + 0]), + dist_cls: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dist_cls", + value_range=tuple(ALL_DISTRIBUTIONS.keys()), + default_value= + list(ALL_DISTRIBUTIONS.keys())[0]), + auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="auto_regressive", + value_range=(True, False), + default_value=False) ) -> ConfigurationSpace: cs = FullyConnectedHead.get_hyperparameter_search_space(dataset_properties=dataset_properties, num_layers=num_layers, units_layer=units_layer, activation=activation) + # These two HPs are inactivate if loss type is regression + # TODO add that in the pipeline part add_hyperparameter(cs, dist_cls, CategoricalHyperparameter) - # TODO let dataset_properties decide if autoregressive models is applied - #add_hyperparameter(cs, auto_regressive, CategoricalHyperparameter) + # TODO let dataset_properties decide if autoregressive models is appliable + add_hyperparameter(cs, auto_regressive, CategoricalHyperparameter) return cs diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingRNNHeader.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingRNNHeader.py new file mode 100644 index 000000000..7cb3e3865 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingRNNHeader.py @@ -0,0 +1,123 @@ +from abc import ABC +from typing import Any, Dict, Optional, Tuple, Union + +import ConfigSpace as CS +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + Constant +) + +import torch +from torch import nn + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import \ + ForecastingHead + +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + + +class _RNN_Decoder(nn.Module): + def __init__(self, + in_features: int, + hidden_size: int, + num_layers: int, + cell_type: str, + config: Dict[str, Any]): + super().__init__() + self.config = config + if 
cell_type == 'lstm': + cell = nn.LSTM + else: + cell = nn.GRU + self.lstm = cell(input_size=in_features, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=config.get("dropout", 0.0), + bidirectional=False, + batch_first=True) + + def forward(self, x: torch.Tensor, + hx: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> Tuple[torch.Tensor, ...]: + outputs, hidden_state, = self.lstm(x, hx) + return outputs, hidden_state + + +class ForecastingRNNHeader(ForecastingHead): + """ + Standard searchable RNN decoder for time series data, only works when the encoder is + """ + def __init__(self, **kwargs: Dict): + super().__init__(**kwargs) + # RNN is naturally auto-regressive. However, we will not consider it as a decoder for deep AR model + self.auto_regressive = True + + def _build_head(self, input_shape: Tuple[int, ...], **arch_kwargs) -> nn.Module: + head = _RNN_Decoder(in_features=input_shape[-1], + config=self.config, + **arch_kwargs) + self.head = head + return head + + @property + def decoder_properties(self): + decoder_properties = {'network_output_tuple': True, + 'accept_additional_input': True, + 'recurrent': True} + return decoder_properties + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + X['network_output_tuple'] = True + return super().fit(X, y) + + @property + def only_return_final_stage(self): + return self.backbone.only_return_final_stage + + @only_return_final_stage.setter + def only_return_final_stage(self, only_return_final_stage): + self.backbone.only_return_final_stage = only_return_final_stage + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return { + 'shortname': 'ForecastingRNNHead', + 'name': 'ForecastingRNNHead', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict] = None, + use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='use_dropout', + value_range=(True, False), + default_value=False), + dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='dropout', + value_range=(0., 0.5), + default_value=0.2), + dist_cls: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dist_cls", + value_range=tuple(ALL_DISTRIBUTIONS.keys()), + default_value= + list(ALL_DISTRIBUTIONS.keys())[0]), + ) -> ConfigurationSpace: + cs = CS.ConfigurationSpace() + + use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) + dropout = get_hyperparameter(dropout, UniformFloatHyperparameter) + + cs.add_hyperparameters([use_dropout, dropout]) + + # Add plain hyperparameters + # Hidden size is given by the encoder architecture + cs.add_condition(CS.EqualsCondition(dropout, use_dropout, True)) + + add_hyperparameter(cs, dist_cls, CategoricalHyperparameter) + + return cs + diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py index 0597f9652..c05c78cd3 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py @@ -19,8 +19,8 @@ ) from autoPyTorch.pipeline.components.setup.network_head import NetworkHeadChoice -from 
autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distributed_network_head import ( - DistributionNetworkHeadComponents, +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ( + ForecastingHead ) from autoPyTorch.pipeline.components.base_component import ( @@ -29,11 +29,11 @@ ) directory = os.path.split(__file__)[0] -_distributed_heads = find_components(__package__, - directory, - DistributionNetworkHeadComponents) +_heads = find_components(__package__, + directory, + ForecastingHead) -_distributed_addons = ThirdPartyComponents(DistributionNetworkHeadComponents) +_addons = ThirdPartyComponents(ForecastingHead) class ForecastingNetworkHeadChoice(NetworkHeadChoice): @@ -48,7 +48,151 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: as choices for learning rate scheduling """ components = OrderedDict() - components.update(_distributed_heads) - components.update(_distributed_addons.components) + + components.update(_heads) + components.update(_addons.components) return components + + def get_available_components( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + include: List[str] = None, + exclude: List[str] = None, + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): Caracteristics + of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of learning + rate heads + + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + available_comp = self.get_components() + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == NetworkHeadChoice or hasattr(entry, 'get_components'): + continue + + task_type = str(dataset_properties['task_type']) + properties = entry.get_properties() + if 'tabular' in task_type and not bool(properties['handles_tabular']): + continue + elif 'image' in task_type and not bool(properties['handles_image']): + continue + elif 'time_series' in task_type and not bool(properties['handles_time_series']): + continue + + # target_type = dataset_properties['target_type'] + # Apply some automatic filtering here for + # heads based on the dataset! 
+ # TODO: Think if there is any case where a head + # is not recommended for a certain dataset + + components_dict[name] = entry + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): Describes the dataset to work on + default (Optional[str]): Default head to use + include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive + list, and will exclusively use this components. + exclude: Optional[Dict[str, Any]]: which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal preprocessors for this problem + available_heads = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + if len(available_heads) == 0: + raise ValueError("No head found") + + if default is None: + defaults = [ + 'ForecastingFullyConnectedHead', + 'ForecastingRNNHead', + ] + for default_ in defaults: + if default_ in available_heads: + default = default_ + break + + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_heads): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_heads, + choice_hyperparameter.value_range)) + head = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + head = CSH.CategoricalHyperparameter( + '__choice__', + list(available_heads.keys()), + default_value=default) + cs.add_hyperparameter(head) + for name in head.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_heads[name].get_hyperparameter_search_space(dataset_properties, # type: ignore + **updates) + parent_hyperparameter = {'parent': head, 'value': name} + cs.add_configuration_space( + name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_lstm_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_lstm_head.py deleted file mode 100644 index 6ff3200d1..000000000 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_lstm_head.py +++ /dev/null @@ -1,131 +0,0 @@ -from typing import Any, Dict, Optional, Tuple - -import ConfigSpace as CS -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter, - UniformFloatHyperparameter, - UniformIntegerHyperparameter -) - -import torch -from torch import nn - -from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import \ - NetworkBackboneComponent 
-from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter - - -class _LSTM_Decoder(nn.Module): - def __init__(self, - in_features: int, - config: Dict[str, Any]): - super().__init__() - self.config = config - self.only_return_final_stage = True - self.lstm = nn.LSTM(input_size=in_features, - hidden_size=config["hidden_size"], - num_layers=config["num_layers"], - dropout=config.get("dropout", 0.0), - bidirectional=config["bidirectional"], - batch_first=True) - - def forward(self, x: torch.Tensor, - hx: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> Tuple[torch.Tensor, ...]: - B, T, _ = x.shape - - outputs, hidden_state, = self.lstm(x, hx) - - if self.only_return_final_stage: - if not self.config["bidirectional"]: - return outputs[:, -1, :], - else: - # concatenate last forward hidden state with first backward hidden state - outputs_by_direction = outputs.view(B, - T, - 2, - self.config["hidden_size"]) - out = torch.cat([ - outputs_by_direction[:, -1, 0, :], - outputs_by_direction[:, 0, 1, :] - ], dim=1) - return out, - else: - return outputs, hidden_state - - -class LSTMBackbone(NetworkBackboneComponent): - """ - Standard searchable LSTM decoder for time series data, similar to Seq2Seq - """ - _fixed_seq_length = False - - def __init__(self, **kwargs: Dict): - super().__init__(**kwargs) - - def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: - backbone = _LSTM(in_features=input_shape[-1], - config=self.config) - self.backbone = backbone - return backbone - - @property - def network_properities(self): - network_properities = {'network_output_tuple': True, - 'accept_additional_input': True} - return network_properities - - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - X['network_output_tuple'] = True - return super().fit(X, y) - - @property - def only_return_final_stage(self): - return self.backbone.only_return_final_stage - - @only_return_final_stage.setter - def only_return_final_stage(self, only_return_final_stage): - self.backbone.only_return_final_stage = only_return_final_stage - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]: - return { - 'shortname': 'LSTMBackbone', - 'name': 'LSTMBackbone', - 'handles_tabular': False, - 'handles_image': False, - 'handles_time_series': True, - } - - @staticmethod - def get_hyperparameter_search_space( - dataset_properties: Optional[Dict] = None, - num_layers: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='num_layers', - value_range=(1, 3), - default_value=1), - use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='use_dropout', - value_range=(True, False), - default_value=False), - dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='dropout', - value_range=(0., 0.5), - default_value=0.2), - bidirectional: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bidirectional', - value_range=(True, False), - default_value=True) - ) -> ConfigurationSpace: - cs = CS.ConfigurationSpace() - - num_layers = get_hyperparameter(num_layers, UniformIntegerHyperparameter) - use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) - dropout = get_hyperparameter(dropout, UniformFloatHyperparameter) - cs.add_hyperparameters([num_layers, use_dropout, dropout]) - - # Add plain hyperparameters - # Hidden size is given by the encoder architecture - add_hyperparameter(cs, bidirectional, CategoricalHyperparameter) 
- - cs.add_condition(CS.AndConjunction(CS.EqualsCondition(dropout, use_dropout, True), - CS.GreaterThanCondition(dropout, num_layers, 1))) - - return cs diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_network_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_network_head.py deleted file mode 100644 index 43ae58dd7..000000000 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distributed_network_head.py +++ /dev/null @@ -1,113 +0,0 @@ -from abc import abstractmethod -from typing import Any, Dict, Iterable, Tuple, List, Optional - -import numpy as np -import torch -from torch import nn - -from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_head.base_network_head import NetworkHeadComponent -from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape -from autoPyTorch.utils.common import FitRequirement - -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS - - -class DistributionNetworkHeadComponents(NetworkHeadComponent): - """ - Base class for network heads used for distribution output. - Holds the head module and the config which was used to create it. - """ - _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series", - "n_prediction_steps", "train_with_log_prob"] - - def __init__(self, - **kwargs: Any): - super().__init__() - self.add_fit_requirements([ - FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), - FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), - FitRequirement('train_with_log_prob', (str,), user_defined=True, dataset_property=True), - FitRequirement('n_prediction_steps', (int,), user_defined=True, dataset_property=True), - FitRequirement('output_shape', (Iterable, int), user_defined=True, dataset_property=True), - FitRequirement('window_size', (int,), user_defined=False, dataset_property=False) - ]) - self.head: Optional[nn.Module] = None - self.config = kwargs - - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - """ - Builds the head component and assigns it to self.head - - Args: - X (X: Dict[str, Any]): Dependencies needed by current component to perform fit - y (Any): not used. 
To comply with sklearn API - Returns: - Self - """ - input_shape = X['dataset_properties']['input_shape'] - output_shape = X['dataset_properties']['output_shape'] - - auto_regressive = self.config.get("auto_regressive", False) - X.update({"auto_regressive": auto_regressive}) - # TODO consider Auto-regressive model on vanilla network head - if auto_regressive: - output_shape[0] = 1 - mlp_backbone = X.get("MLP_backbone", False) - network_output_tuple = X.get("network_output_tuple", False) - if mlp_backbone: - input_shape = (X["window_size"], input_shape[-1]) - self.head = self.build_head( - input_shape=get_output_shape(X['network_backbone'], input_shape=input_shape, - network_output_tuple=network_output_tuple), - output_shape=output_shape, - ) - return self - - def build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...]) -> nn.Module: - """ - Builds the head module and returns it - - Args: - input_shape (Tuple[int, ...]): shape of the input to the head (usually the shape of the backbone output) - output_shape (Tuple[int, ...]): shape of the output of the head - - Returns: - nn.Module: head module - """ - base_header_layer, num_head_base_output_features = self._build_head(input_shape) - # TODO consider other form of proj layers - proj_layer = self.build_proj_layer(dist_cls=self.config["dist_cls"], - num_head_base_output_features=num_head_base_output_features, - output_shape=output_shape, - ) - return nn.Sequential(*base_header_layer, proj_layer) - - @abstractmethod - def _build_head(self, input_shape: Tuple[int, ...]) -> Tuple[List[nn.Module], int]: - """ - Builds the head module and returns it - - Args: - input_shape (Tuple[int, ...]): shape of the input to the head (usually the shape of the backbone output) - output_shape (Tuple[int, ...]): shape of the output of the head - n_prediction_steps (int): how many steps need to be predicted in advance - - Returns: - nn.Module: head module - """ - raise NotImplementedError() - - @staticmethod - def build_proj_layer(dist_cls: str, - num_head_base_output_features: int, - output_shape: Tuple[int, ...],) -> \ - torch.distributions.Distribution: - """ - Builds a layer that maps the head output features to a torch distribution - """ - if dist_cls not in ALL_DISTRIBUTIONS.keys(): - raise ValueError(f'Unsupported distribution class type: {dist_cls}') - proj_layer = ALL_DISTRIBUTIONS[dist_cls](num_in_features=num_head_base_output_features, - output_shape=output_shape,) - return proj_layer diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py new file mode 100644 index 000000000..c10dcbecb --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -0,0 +1,168 @@ +from abc import abstractmethod, ABC +from typing import Any, Dict, Iterable, Tuple, List, Optional + +import numpy as np +import torch +from torch import nn + +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.pipeline.components.setup.network_head.base_network_head import NetworkHeadComponent +from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape +from autoPyTorch.utils.common import FitRequirement +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import\ + ALL_DISTRIBUTIONS, ProjectionLayer + + +class ForecastingHead(NetworkHeadComponent): + """ + 
Base class for network heads used for forecasting. + Holds the head module and the config which was used to create it. + """ + _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series", + "n_prediction_steps"] + + def __init__(self, + **kwargs: Any): + super().__init__() + self.add_fit_requirements([ + FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), + FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), + FitRequirement('encoder_properties', (str,), user_defined=False, dataset_property=False), + FitRequirement('n_prediction_steps', (int,), user_defined=True, dataset_property=True), + FitRequirement('output_shape', (Iterable, int), user_defined=True, dataset_property=True), + FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), + FitRequirement('loss_type', (str,), user_defined=False, dataset_property=False) + # TODO add loss type + ]) + self.head: Optional[nn.Module] = None + self.required_net_out_put_type: Optional[str] = None + self.auto_regressive = kwargs.get('auto_regressive', False) + + self.config = kwargs + + @property + def decoder_properties(self): + decoder_property = {} + return decoder_property + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + """ + Builds the head component and assigns it to self.head + + Args: + X (X: Dict[str, Any]): Dependencies needed by current component to perform fit + y (Any): not used. To comply with sklearn API + Returns: + Self + """ + input_shape = X['dataset_properties']['input_shape'] + output_shape = X['dataset_properties']['output_shape'] + + self.required_net_out_put_type = X['required_net_out_put_type'] + + auto_regressive = self.config.get("auto_regressive", False) + X.update({"auto_regressive": auto_regressive}) + encoder_properties = X['encoder_properties'] + + # TODO consider Auto-regressive model on vanilla network head + if auto_regressive: + output_shape[0] = 1 + fixed_input_shape = encoder_properties.get("fixed_input_shape", False) + network_output_tuple = encoder_properties.get("network_output_tuple", False) + arch_kwargs = encoder_properties.get("arch_kwargs", {}) + if fixed_input_shape: + input_shape = (X["window_size"], input_shape[-1]) + self.head = self.build_head( + input_shape=get_output_shape(X['network_backbone'], input_shape=input_shape, + network_output_tuple=network_output_tuple), + output_shape=output_shape, + **arch_kwargs, + ) + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the network head into the fit dictionary 'X' and returns it. + + Args: + X (Dict[str, Any]): 'X' dictionary + Returns: + (Dict[str, Any]): the updated 'X' dictionary + """ + X = super().transform(X) + X.update({'decoder_properties': self.decoder_properties}) + return X + + def build_head(self, + input_shape: Tuple[int, ...], + output_shape: Tuple[int, ...], + **arch_kwargs: Dict) -> nn.Module: + """ + Builds the head module and returns it + + Args: + input_shape (Tuple[int, ...]): shape of the input to the head (usually the shape of the backbone output) + output_shape (Tuple[int, ...]): shape of the output of the head + arch_kwargs (Dict): additional paramter for initializing architectures. 
+ + Returns: + nn.Module: head module + """ + base_header_layer, num_head_base_output_features = self._build_head(input_shape, **arch_kwargs) + proj_layer = [] + # TODO consider local output layer introduced in Wen et al, A Multi-Horizon Quantile Recurrent Forecaster + + output_layer = self.build_proj_layer(num_head_base_output_features=num_head_base_output_features, + output_shape=output_shape, + net_out_put_type=self.required_net_out_put_type, + dist_cls=self.config.get('dist_cls', None) + ) + proj_layer.append(output_layer) + return nn.Sequential(*base_header_layer, *proj_layer) + + @abstractmethod + def _build_head(self, input_shape: Tuple[int, ...], **arch_kwargs) -> Tuple[List[nn.Module], int]: + """ + Builds the head module and returns it + + Args: + input_shape (Tuple[int, ...]): shape of the input to the head (usually the shape of the backbone output) + output_shape (Tuple[int, ...]): shape of the output of the head + n_prediction_steps (int): how many steps need to be predicted in advance + + Returns: + nn.Module: head module + """ + raise NotImplementedError() + + @staticmethod + def build_proj_layer(num_head_base_output_features: int, + output_shape: Tuple[int, ...], + net_out_put_type: str, + dist_cls: Optional[str] = None) -> torch.nn.Module: + """ + a final layer that project the head output to the final distribution + Args: + num_head_base_output_features (int): output feature of head base, + is used to initialize size of the linear layer + output_shape (Tuple[int, ..]): deserved output shape + net_out_put_type (str), type of the loss, it determines the output of the network + dist_cls (str), distribution class, only activate if output is a distribution + + Returns: + proj_layer: nn.Module + projection layer that maps the features to the final output + """ + if net_out_put_type == 'distribution': + if dist_cls not in ALL_DISTRIBUTIONS.keys(): + raise ValueError(f'Unsupported distribution class type: {dist_cls}') + proj_layer = ALL_DISTRIBUTIONS[dist_cls](num_in_features=num_head_base_output_features, + output_shape=output_shape, ) + return proj_layer + elif net_out_put_type == 'regression': + proj_layer = nn.Sequential(nn.Linear(num_head_base_output_features, np.product(output_shape)), + nn.Unflatten(-1, *output_shape)) + return proj_layer + else: + raise ValueError(f"Unsupported network type " + f"{net_out_put_type} (should be regression or distribution)") diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 88e032dbe..2f5c5832a 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -171,7 +171,6 @@ class TimeSeriesForecastingDataLoader(FeatureDataLoader): def __init__(self, batch_size: int = 64, window_size: int = 1, - upper_sequence_length: int = np.iinfo(np.int32).max, num_batches_per_epoch: Optional[int] = 50, n_prediction_steps: int = 1, random_state: Optional[np.random.RandomState] = None) -> None: @@ -181,14 +180,12 @@ def __init__(self, batch_size: batch size sequence_length: length of each sequence sample_interval: sample interval ,its value is the interval of the resolution - upper_sequence_length: upper limit of sequence length, to avoid a sequence length larger than dataset length - or specified by the users + num_batches_per_epoch: how - n_prediction_steps: how 
many stpes to predict in advance + n_prediction_steps: how many steps to predict in advance """ super().__init__(batch_size=batch_size, random_state=random_state) self.window_size: int = window_size - self.upper_sequence_length = upper_sequence_length self.n_prediction_steps = n_prediction_steps self.sample_interval = 1 # length of the tail, for instance if a sequence_length = 2, sample_interval =2, n_prediction = 2, @@ -428,6 +425,7 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, cs = ConfigurationSpace() add_hyperparameter(cs, batch_size, UniformIntegerHyperparameter) add_hyperparameter(cs, window_size, UniformIntegerHyperparameter) + add_hyperparameter(cs, num_batch_per_epoch, UniformIntegerHyperparameter) return cs def __str__(self) -> str: diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index 3215eefac..1ae17ff95 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -65,6 +65,7 @@ def forward(self, input_dist: torch.distributions.Distribution, target_tensor: t regression=MSELoss, forecasting=LogProbLoss) +LOSS_TYPES = ['regression', 'distribution'] def get_default(task: int) -> Type[Loss]: """ diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index 23e1563ae..2225b879c 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -57,8 +57,7 @@ def prepare_trainer(self, X): self.choice.prepare( model=X['network'], metrics=metrics, - criterion=get_loss(X['dataset_properties'], - name=additional_losses), + criterion=X['loss'], budget_tracker=self.budget_tracker, optimizer=X['optimizer'], device=get_device_from_fit_dictionary(X), diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 8ffc5adf5..e1c5d1041 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -1,9 +1,12 @@ import copy import warnings +from collections import OrderedDict from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration, ConfigurationSpace -from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause +from ConfigSpace.hyperparameters import Constant +from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause, ForbiddenInClause +from ConfigSpace.conditions import EqualsCondition, NotEqualsCondition, AndConjunction import numpy as np @@ -14,21 +17,19 @@ from autoPyTorch.constants import STRING_TO_TASK_TYPES from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.base_pipeline import BasePipeline, PipelineStepType +from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( TimeSeriesTransformer ) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer -from 
autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ - ScalerChoice from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent from autoPyTorch.pipeline.components.setup.network_embedding import NetworkEmbeddingChoice from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone import \ - BaseForecastingNetworkBackbone + ForecastingNetworkBackboneChoice from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head import ForecastingNetworkHeadChoice from autoPyTorch.pipeline.components.setup.network_initializer import ( NetworkInitializerChoice @@ -36,6 +37,7 @@ from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling import \ TargetScalerChoice from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice +from autoPyTorch.pipeline.components.setup.forecasting_training_losses import ForecastingLossChoices from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import \ TimeSeriesForecastingDataLoader from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer import ForecastingTrainerChoice @@ -76,13 +78,6 @@ def __init__(self, init_params: Optional[Dict[str, Any]] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, ): - if 'upper_sequence_length' not in dataset_properties: - warnings.warn('max_sequence_length is not given in dataset property , might exists the risk of selecting ' - 'length that is greater than the maximal allowed length of the dataset') - self.upper_sequence_length = np.iinfo(np.int32).max - else: - self.upper_sequence_length = dataset_properties['upper_sequence_length'] - super().__init__( config, steps, dataset_properties, include, exclude, random_state, init_params, search_space_updates) @@ -191,6 +186,59 @@ def _get_hyperparameter_search_space(self, raise ValueError("Cannot find a legal default configuration") cs.get_hyperparameter('network_embedding:__choice__').default_value = default + # dist_cls and auto_regressive are only activate if the network outputs distribution + if 'loss' in self.named_steps.keys() and 'network_head' in self.named_steps.keys(): + hp_loss = cs.get_hyperparameter('loss:__choice__') + hp_distribution_heads = [] + for hp_name in cs.get_hyperparameter_names(): + if hp_name.startswith('network_head:'): + if hp_name.endswith(':dist_cls') or hp_name.endswith(':auto_regressive'): + hp_distribution_heads.append(cs.get_hyperparameter(hp_name)) + + # in this case we cannot deactivate the hps, we might need to think about this + if 'distribution_losses' in hp_loss.choices: + for hp_dist in hp_distribution_heads: + hp_dist_parent_condition = cs.get_parent_conditions_of(hp_dist.name) + new_cond = AndConjunction(EqualsCondition(hp_dist, hp_loss, 'distribution_losses'), + *hp_dist_parent_condition) + + # TODO: this is only a temporal solution, we need to create a PR for ConfigSpace to allow replacing or + # deleting conditions!!! 
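
A minimal, self-contained sketch of the dependency being wired up here, using toy hyperparameter
names rather than the generated pipeline names (not part of the patch itself):

    from ConfigSpace.configuration_space import ConfigurationSpace
    from ConfigSpace.conditions import EqualsCondition
    from ConfigSpace.hyperparameters import CategoricalHyperparameter

    cs = ConfigurationSpace()
    loss = CategoricalHyperparameter('loss:__choice__', ['DistributionLoss', 'RegressionLoss'])
    dist_cls = CategoricalHyperparameter('network_head:dist_cls', ['studentT', 'normal'])
    cs.add_hyperparameters([loss, dist_cls])
    # dist_cls is only sampled when a distribution loss is selected
    cs.add_condition(EqualsCondition(dist_cls, loss, 'DistributionLoss'))

In the pipeline the head hyperparameters already carry parent conditions from their choice module,
which is why the new EqualsCondition is merged into an AndConjunction and the old parent entry is
cleared by hand below.
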
+ + # delete the old condition + cs._parents[hp_dist.name] = OrderedDict() + cs._parents[hp_dist.name]['__HPOlib_configuration_space_root__'] = None + cs.add_condition(new_cond) + else: + # we set a placeholder and use it to inactivate the related values + placeholder = Constant("loss_place_holder", 0) + cs.add_hyperparameter(placeholder) + for hp_dist in hp_distribution_heads: + hp_dist_parent_condition = cs.get_parent_conditions_of(hp_dist.name) + new_cond = AndConjunction(NotEqualsCondition(hp_dist, placeholder, 0), + *hp_dist_parent_condition) + + # delete the old condition + cs._parents[hp_dist.name] = OrderedDict() + cs._parents[hp_dist.name]['__HPOlib_configuration_space_root__'] = None + cs.add_condition(new_cond) + + # rnn head only allow rnn backbone + if 'network_backbone' in self.named_steps.keys() and 'network_head' in self.named_steps.keys(): + hp_backbone_choice = cs.get_hyperparameter('network_backbone:__choice__') + hp_head_choice = cs.get_hyperparameter('network_head:__choice__') + + if 'ForecastingRNNHeader' in hp_head_choice.choices: + if len(hp_head_choice.choices) == 1 and 'RNNBackbone' not in hp_backbone_choice.choices: + raise ValueError("RNN Header is only compatible with RNNBackbone, RNNHead is not allowed to be " + "the only network head choice if the backbone choices do not contain RNN!") + backbone_choices = [choice for choice in hp_backbone_choice.choices if choice != 'RNNBackbone'] + forbidden_clause_backbone = ForbiddenInClause(hp_backbone_choice, backbone_choices) + forbidden_clause_head = ForbiddenEqualsClause(hp_head_choice, 'ForecastingRNNHeader') + + cs.add_forbidden_clause(ForbiddenAndConjunction(forbidden_clause_backbone, forbidden_clause_head)) + cs.get_hyperparameter_names() + self.configuration_space = cs self.dataset_properties = dataset_properties return cs @@ -213,15 +261,15 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L default_dataset_properties.update(dataset_properties) # TODO consider the correct way of doing imputer for time series forecasting tasks. 
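
The RNN-head restriction added above can be reproduced in isolation; a rough sketch with toy choice
values (the real names come from the registered components):

    from ConfigSpace.configuration_space import ConfigurationSpace
    from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause, ForbiddenInClause
    from ConfigSpace.hyperparameters import CategoricalHyperparameter

    cs = ConfigurationSpace()
    backbone = CategoricalHyperparameter('network_backbone:__choice__', ['RNNBackbone', 'TCNBackbone'])
    head = CategoricalHyperparameter('network_head:__choice__', ['ForecastingRNNHeader', 'ForecastingMLPHead'])
    cs.add_hyperparameters([backbone, head])
    # forbid the RNN head whenever the backbone is anything other than the RNN backbone
    cs.add_forbidden_clause(ForbiddenAndConjunction(
        ForbiddenInClause(backbone, ['TCNBackbone']),
        ForbiddenEqualsClause(head, 'ForecastingRNNHeader'),
    ))
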
steps.extend([ + ('loss', ForecastingLossChoices(default_dataset_properties, random_state=self.random_state)), ("imputer", SimpleImputer(random_state=self.random_state)), # ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), ("time_series_transformer", TimeSeriesTransformer(random_state=self.random_state)), ("preprocessing", EarlyPreprocessing(random_state=self.random_state)), - ("data_loader", TimeSeriesForecastingDataLoader(upper_sequence_length=self.upper_sequence_length, - random_state=self.random_state)), + ("data_loader", TimeSeriesForecastingDataLoader(random_state=self.random_state)), ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties, random_state=self.random_state)), - ("network_backbone", BaseForecastingNetworkBackbone(default_dataset_properties, + ("network_backbone", ForecastingNetworkBackboneChoice(default_dataset_properties, random_state=self.random_state)), ("network_head", ForecastingNetworkHeadChoice(default_dataset_properties, random_state=self.random_state)), From 777340700e8388770f5b9ce187deefef82b4a384 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 12 Dec 2021 18:59:45 +0100 Subject: [PATCH 081/347] new overall network design for forecasting tasks --- .../forecasting_target_scaling/utils.py | 2 +- .../DistributionLoss.py | 56 +++++++ .../RegressionLoss.py} | 4 +- .../__init__.py | 2 +- .../base_forecasting_loss.py} | 4 +- .../distribution_losses.py | 33 ---- .../components/setup/network/base_network.py | 12 +- .../setup/network/forecasting_network.py | 157 +++++++++++++++--- .../network_backbone/base_network_backbone.py | 1 - .../InceptionTimeBackbone.py | 8 +- .../RNNBackbone.py | 55 +++--- .../TCNBackbone.py | 25 ++- .../TimeSeriesMLPBackbone.py | 83 ++++++--- .../base_forecasting_backbone.py | 44 ++++- .../setup/network_backbone/utils.py | 8 +- .../ForecastingFullyConnectedHeader.py | 79 --------- .../ForecastingMLPHead.py | 146 ++++++++++++++++ ...tingRNNHeader.py => ForecastingRNNHead.py} | 35 ++-- .../forecasting_network_head/distribution.py | 29 +++- .../forecasting_head.py | 89 +++++++--- .../setup/network_head/fully_connected.py | 13 +- .../time_series_forecasting_data_loader.py | 6 +- .../forecasting_base_trainer.py | 17 +- .../pipeline/time_series_forecasting.py | 68 ++++---- 24 files changed, 672 insertions(+), 304 deletions(-) create mode 100644 autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py rename autoPyTorch/pipeline/components/setup/{forecasting_training_losses/regression_losses.py => forecasting_training_loss/RegressionLoss.py} (96%) rename autoPyTorch/pipeline/components/setup/{forecasting_training_losses => forecasting_training_loss}/__init__.py (99%) rename autoPyTorch/pipeline/components/setup/{forecasting_training_losses/base_forecasting_losses.py => forecasting_training_loss/base_forecasting_loss.py} (89%) delete mode 100644 autoPyTorch/pipeline/components/setup/forecasting_training_losses/distribution_losses.py delete mode 100644 autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingFullyConnectedHeader.py create mode 100644 autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingMLPHead.py rename autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/{ForecastingRNNHeader.py => ForecastingRNNHead.py} (79%) diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py 
b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py index 128040cef..8f4ede183 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py @@ -29,7 +29,7 @@ def transform(self, X: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tenso max_ = torch.max(X, dim=-2, keepdim=True)[0] diff_ = max_ - min_ - loc = min_ + loc = min_ - 1e-10 scale = diff_ scale[scale == 0.0] = 1.0 return (X - loc) / scale, loc, scale diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py new file mode 100644 index 000000000..067882e75 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py @@ -0,0 +1,56 @@ +from typing import Optional, Dict, Union, Any +import numpy as np + +from ConfigSpace import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter + +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import \ + ForecastingLossComponents +from autoPyTorch.pipeline.components.training.losses import LogProbLoss +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter, FitRequirement + + +class DistributionLoss(ForecastingLossComponents): + loss = LogProbLoss + required_net_out_put_type = 'distribution' + + def __init__(self, + dist_cls: str, + random_state: Optional[np.random.RandomState] = None, + ): + super(DistributionLoss, self).__init__() + self.dist_cls = dist_cls + self.random_state = random_state + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'DistributionLoss', + 'name': 'DistributionLoss', + "handles_tabular": False, + "handles_image": False, + "handles_time_series": True, + 'handles_regression': True, + 'handles_classification': False + } + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + required_padding_value = ALL_DISTRIBUTIONS[self.dist_cls].value_in_support + X.update({"dist_cls": self.dist_cls, + "required_padding_value": required_padding_value}) + return super().transform(X) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + dist_cls: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dist_cls", + value_range=tuple(ALL_DISTRIBUTIONS.keys()), + default_value= + list(ALL_DISTRIBUTIONS.keys())[0]) + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + add_hyperparameter(cs, dist_cls, CategoricalHyperparameter) + return cs diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_losses/regression_losses.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py similarity index 96% rename from autoPyTorch/pipeline/components/setup/forecasting_training_losses/regression_losses.py rename to autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py index 8f3e0bb75..4043612e9 100644 --- 
a/autoPyTorch/pipeline/components/setup/forecasting_training_losses/regression_losses.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py @@ -9,7 +9,7 @@ ) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.forecasting_training_losses.base_forecasting_losses import \ +from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import \ ForecastingLossComponents from autoPyTorch.pipeline.components.training.losses import L1Loss, MSELoss @@ -21,7 +21,7 @@ def __init__(self, loss_name: str, random_state: Optional[np.random.RandomState] = None, ): - super(RegressionLosses).__init__() + super(RegressionLosses, self).__init__() if loss_name == "l1": self.loss = L1Loss elif loss_name == 'mse': diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_losses/__init__.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py similarity index 99% rename from autoPyTorch/pipeline/components/setup/forecasting_training_losses/__init__.py rename to autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py index e348ade3e..2e98722ea 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_losses/__init__.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py @@ -16,7 +16,7 @@ from autoPyTorch.constants import REGRESSION_TASKS, CLASSIFICATION_TASKS, FORECASTING_TASKS, STRING_TO_TASK_TYPES -from autoPyTorch.pipeline.components.setup.forecasting_training_losses.base_forecasting_losses import\ +from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import\ ForecastingLossComponents directory = os.path.split(__file__)[0] diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_losses/base_forecasting_losses.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py similarity index 89% rename from autoPyTorch/pipeline/components/setup/forecasting_training_losses/base_forecasting_losses.py rename to autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py index 2b7040935..1f09dd4e1 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_losses/base_forecasting_losses.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py @@ -23,6 +23,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "autoPyTorchComponent": return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: - X.update({"loss": self.loss}) - X.update({'required_net_out_put_type': self.required_net_out_put_type}) + X.update({"loss": self.loss, + 'required_net_out_put_type': self.required_net_out_put_type}) return X diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_losses/distribution_losses.py b/autoPyTorch/pipeline/components/setup/forecasting_training_losses/distribution_losses.py deleted file mode 100644 index a6c39a488..000000000 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_losses/distribution_losses.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import Optional, Dict, Union - -from ConfigSpace import ConfigurationSpace - -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.forecasting_training_losses.base_forecasting_losses import \ - ForecastingLossComponents -from autoPyTorch.pipeline.components.training.losses import 
LogProbLoss - - -class DistributionLoss(ForecastingLossComponents): - loss = LogProbLoss - required_net_out_put_type = 'distribution' - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'DistributionLoss', - 'name': 'DistributionLoss', - "handles_tabular": False, - "handles_image": False, - "handles_time_series": True, - 'handles_regression': True, - 'handles_classification': False - } - - @staticmethod - def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> ConfigurationSpace: - cs = ConfigurationSpace() - return cs diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py index 49928b77c..fe2a1732b 100644 --- a/autoPyTorch/pipeline/components/setup/network/base_network.py +++ b/autoPyTorch/pipeline/components/setup/network/base_network.py @@ -27,13 +27,17 @@ def __init__( super(NetworkComponent, self).__init__() self.random_state = random_state self.device = None - self.add_fit_requirements([ + self.add_fit_requirements(self._required_fit_requirements) + self.network = network + self.final_activation: Optional[torch.nn.Module] = None + + @property + def _required_fit_requirements(self): + return [ FitRequirement("network_head", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_backbone", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), - ]) - self.network = network - self.final_activation: Optional[torch.nn.Module] = None + ] def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: """ diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index bcc0c5414..36db59ea1 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -1,5 +1,8 @@ from typing import Any, Dict, Optional, Union, Tuple + from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter +from ConfigSpace.conditions import EqualsCondition import numpy as np @@ -10,27 +13,78 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. 
\ base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone \ + import EncoderNetwork from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter +from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent + -""" class ForecastingNet(nn.Module): def __init__(self, network_embedding: nn.Module, - network_backbone: nn.Module, + network_backbone: EncoderNetwork, network_head: nn.Module, - network_properties: Dict = {}): + encoder_properties: Dict = {}, + decoder_properties: Dict = {}, + output_type: str = 'regression', + forecast_strategy: str = 'mean', + num_samples: int = 100, + aggregation: str = 'mean' + ): super(ForecastingNet, self).__init__() self.embedding = network_embedding self.backbone = network_backbone - self.backbone_output_tuple = network_properties.get("backbone_output_tuple", False) - self.accept_hidden_states_as_input = network_properties.get("_accept_hidden_states_as_input", False) self.network_head = network_head - def forward(self, X: torch.Tensor, hx: Optional[Tuple[torch.Tensor]]=None): - -""" + self.encoder_has_hidden_states = encoder_properties['has_hidden_states'] + self.decoder_has_hidden_states = decoder_properties['has_hidden_states'] + + if self.decoder_has_hidden_states: + if not self.encoder_has_hidden_states: + raise ValueError('when decoder contains hidden states, encoder must provide the hidden states ' + 'for decoder!') + + self.recurrent_decoder = decoder_properties['recurrent'] + + self.output_type = output_type + self.forecast_strategy = forecast_strategy + self.num_samples = num_samples + self.aggregation = aggregation + + def forward(self, X: torch.Tensor, hx: Optional[Tuple[torch.Tensor]] = None): + X = self.embedding(X) + if self.encoder_has_hidden_states: + X, hidden_state = self.backbone(X) + else: + X = self.backbone(X) + + X = self.network_head(X) + return X + + def pred_from_net_output(self, net_output): + if self.output_type == 'regression': + return net_output + elif self.output_type == 'distribution': + if self.forecast_strategy == 'mean': + return net_output.mean + elif self.forecast_strategy == 'sample': + samples = net_output.sample(self.num_samples) + if self.aggregation == 'mean': + return torch.mean(samples, dim=0) + elif self.aggregation == 'median': + return torch.median(samples, 0) + else: + raise ValueError(f'Unknown aggregation: {self.aggregation}') + else: + raise ValueError(f'Unknown forecast_strategy: {self.forecast_strategy}') + else: + raise ValueError(f'Unknown output_type: {self.output_type}') + def predict(self, X: torch.Tensor): + net_output = self(X) + return self.pred_from_net_output(net_output) class ForecastingNetworkComponent(NetworkComponent): @@ -38,10 +92,56 @@ def __init__( self, network: Optional[torch.nn.Module] = None, random_state: Optional[np.random.RandomState] = None, - auto_regressive: Optional[bool] = False, + forecast_strategy: str = 'mean', + num_samples: Optional[int] = None, + aggregation: Optional[str] = None ) -> None: super(ForecastingNetworkComponent, self).__init__(network=network, random_state=random_state) - self.auto_regressive = auto_regressive + self.forecast_strategy = forecast_strategy + self.num_samples = num_samples + self.aggregation = 
aggregation + self.output_type = None + + @property + def _required_fit_requirements(self): + return [ + FitRequirement("network_head", (torch.nn.Module,), user_defined=False, dataset_property=False), + FitRequirement("network_backbone", (torch.nn.Module,), user_defined=False, dataset_property=False), + FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), + FitRequirement("required_net_out_put_type", (str,), user_defined=False, dataset_property=False), + FitRequirement("encoder_properties", (Dict,), user_defined=False, dataset_property=False), + FitRequirement("decoder_properties", (Dict,), user_defined=False, dataset_property=False), + ] + + def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: + # Make sure that input dictionary X has the required + # information to fit this stage + self.check_requirements(X, y) + self.network = torch.nn.Sequential(X['network_embedding'], X['network_backbone'], X['network_head']) + + self.network = ForecastingNet(network_embedding=X['network_embedding'], + network_backbone=X['network_backbone'], + network_head=X['network_head'], + encoder_properties=X['encoder_properties'], + decoder_properties=X['decoder_properties'], + output_type=X['required_net_out_put_type'], + forecast_strategy=self.forecast_strategy, + num_samples=self.num_samples, + aggregation=self.aggregation, + ) + + # Properly set the network training device + if self.device is None: + self.device = get_device_from_fit_dictionary(X) + + self.to(self.device) + + if STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']] in CLASSIFICATION_TASKS: + self.final_activation = nn.Softmax(dim=1) + + self.is_fitted_ = True + + return self def predict(self, loader: torch.utils.data.DataLoader, target_scaler: Optional[BaseTargetScaler] = None) -> torch.Tensor: @@ -69,13 +169,11 @@ def predict(self, loader: torch.utils.data.DataLoader, X = X.to(self.device) with torch.no_grad(): - - Y_batch_pred = self.network(X).mean - if loc is not None or scale is not None: - if loc is None: - loc = 0. - if scale is None: - scale = 1. + Y_batch_pred = self.network.predict(X) + if loc is None: + loc = 0. + if scale is None: + scale = 1. 
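
self.network.predict(X) above reduces the raw network output to a point forecast via
pred_from_net_output. For the distribution case that reduction looks roughly like the following
sketch (illustrative only; note that torch.distributions.Distribution.sample expects a
sample-shape tuple):

    import torch

    def point_forecast(dist: torch.distributions.Distribution,
                       forecast_strategy: str = 'mean',
                       num_samples: int = 100,
                       aggregation: str = 'median') -> torch.Tensor:
        # reduce a predicted distribution to a single point prediction
        if forecast_strategy == 'mean':
            return dist.mean
        samples = dist.sample((num_samples,))  # shape: (num_samples, B, ...)
        if aggregation == 'mean':
            return samples.mean(dim=0)
        return samples.median(dim=0).values
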
Y_batch_pred = Y_batch_pred.cpu() * scale + loc Y_batch_preds.append(Y_batch_pred.cpu()) @@ -83,8 +181,27 @@ def predict(self, loader: torch.utils.data.DataLoader, return torch.cat(Y_batch_preds, 0).cpu().numpy() @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - **kwargs: Any - ) -> ConfigurationSpace: + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + forecast_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='forecast_strategy', + value_range=('sample', 'mean'), + default_value='mean'), + num_samples: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='num_samples', + value_range=(50, 200), + default_value=100), + aggregation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='aggregation', + value_range=('mean', 'median'), + default_value='mean') + ) -> ConfigurationSpace: cs = ConfigurationSpace() + forecast_strategy = get_hyperparameter(forecast_strategy, CategoricalHyperparameter) + num_samples = get_hyperparameter(num_samples, UniformIntegerHyperparameter) + aggregation = get_hyperparameter(aggregation, CategoricalHyperparameter) + + cond_num_sample = EqualsCondition(num_samples, forecast_strategy, 'sample') + cond_aggregation = EqualsCondition(aggregation, forecast_strategy, 'sample') + + cs.add_hyperparameters([forecast_strategy, num_samples, aggregation]) + cs.add_conditions([cond_aggregation, cond_num_sample]) + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index eb3f75336..a159003be 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -26,7 +26,6 @@ class NetworkBackboneComponent(autoPyTorchComponent): Base class for network backbones. Holds the backbone module and the config which was used to create it. 
""" _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"] - _fixed_seq_length = False # only used for time series tasks, if the input seq_length needs to be fixed def __init__(self, **kwargs: Any): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/InceptionTimeBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/InceptionTimeBackbone.py index 3e780d3fc..ad400223f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/InceptionTimeBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/InceptionTimeBackbone.py @@ -130,8 +130,10 @@ class InceptionTimeBackbone(BaseForecastingNetworkBackbone): @property def encoder_properties(self): # TODO consider property for the network - backbone_properties = {} - return backbone_properties + encoder_properties = {'has_hidden_states': False, + 'bijective_seq_output': True, + 'fixed_input_seq_length': False} + return encoder_properties def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: backbone = _InceptionTime(in_features=input_shape[-1], @@ -159,6 +161,7 @@ def get_hyperparameter_search_space( num_filters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_filters", value_range=(4, 64), default_value=32, + log=True ), kernel_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="kernel_size", value_range=(4, 64), @@ -167,6 +170,7 @@ def get_hyperparameter_search_space( bottleneck_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="bottleneck_size", value_range=(16, 64), default_value=32, + log=True ), ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/RNNBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/RNNBackbone.py index c68c941fc..aabbce9f8 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/RNNBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/RNNBackbone.py @@ -13,18 +13,17 @@ from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone \ - import BaseForecastingNetworkBackbone + import BaseForecastingNetworkBackbone, EncoderNetwork from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter -class _RNN(nn.Module): +class _RNN(EncoderNetwork): # we only consder GRU and LSTM here def __init__(self, in_features: int, config: Dict[str, Any]): super().__init__() self.config = config - self.only_return_final_stage = True if config['cell_type'] == 'lstm': cell_type = nn.LSTM else: @@ -37,15 +36,19 @@ def __init__(self, bidirectional=config["bidirectional"], batch_first=True) - def forward(self, x: torch.Tensor, + def forward(self, + x: torch.Tensor, + output_seq: bool = False, hx: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> Tuple[torch.Tensor, ...]: B, T, _ = x.shape outputs, hidden_state, = self.lstm(x, hx) - if self.only_return_final_stage: + if output_seq: + return outputs, hidden_state + else: if not self.config["bidirectional"]: - return outputs[:, [-1], :], hidden_state + return outputs[:, -1, :], hidden_state else: # 
concatenate last forward hidden state with first backward hidden state outputs_by_direction = outputs.view(B, @@ -53,12 +56,11 @@ def forward(self, x: torch.Tensor, 2, self.config["hidden_size"]) out = torch.cat([ - outputs_by_direction[:, [-1], 0, :], - outputs_by_direction[:, [0], 1, :] + outputs_by_direction[:, -1, 0, :], + outputs_by_direction[:, 0, 1, :] ], dim=-1) return out, hidden_state - else: - return outputs, hidden_state + class RNNBackbone(BaseForecastingNetworkBackbone): @@ -78,27 +80,19 @@ def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: @property def encoder_properties(self): - encoder_properties = {'network_output_tuple': True, - 'accept_additional_input': True, - 'hidden_states': True} - arch_kwargs = {'hidden_size': self.config['hidden_size'], + encoder_properties = {'has_hidden_states': True, + 'bijective_seq_output': True, + 'fixed_input_seq_length': False + } + return encoder_properties + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + rnn_kwargs = {'hidden_size': self.config['hidden_size'], 'num_layers': self.config['num_layers'], 'bidirectional': self.config['bidirectional'], 'cell_type': self.config['cell_type']} # used for initialize - encoder_properties.update({"arch_kwargs": arch_kwargs}) - return encoder_properties - - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - # the setting are utilized to build decoder - return super().fit(X, y) - - @property - def only_return_final_stage(self): - return self.backbone.only_return_final_stage - - @only_return_final_stage.setter - def only_return_final_stage(self, only_return_final_stage): - self.backbone.only_return_final_stage = only_return_final_stage + X.update({'rnn_kwargs': rnn_kwargs}) + return super().transform(X) @staticmethod def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]: @@ -120,8 +114,9 @@ def get_hyperparameter_search_space( value_range=(1, 3), default_value=1), hidden_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='hidden_size', - value_range=(64, 512), - default_value=256), + value_range=(32, 512), + default_value=256, + log=True), use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='use_dropout', value_range=(True, False), default_value=False), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TCNBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TCNBackbone.py index 4d17cb8ca..dc5ce5850 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TCNBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TCNBackbone.py @@ -14,7 +14,7 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone\ - import BaseForecastingNetworkBackbone + import BaseForecastingNetworkBackbone, EncoderNetwork from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter @@ -71,7 +71,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return self.relu(out + res) -class _TemporalConvNet(nn.Module): +class _TemporalConvNet(EncoderNetwork): def __init__(self, num_inputs: int, num_channels: List[int], kernel_size: int = 2, dropout: float = 0.2): super(_TemporalConvNet, self).__init__() layers: List[Any] = [] @@ -89,12 +89,17 @@ def 
__init__(self, num_inputs: int, num_channels: List[int], kernel_size: int = dropout=dropout)] self.network = nn.Sequential(*layers) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor, output_seq=False) -> torch.Tensor: + import pdb + pdb.set_trace() # swap sequence and feature dimensions for use with convolutional nets x = x.transpose(1, 2).contiguous() x = self.network(x) x = x.transpose(1, 2).contiguous() - return x + if output_seq: + return x + else: + return x[:, -1, :] class TCNBackbone(BaseForecastingNetworkBackbone): @@ -104,10 +109,11 @@ class TCNBackbone(BaseForecastingNetworkBackbone): @property def encoder_properties(self): - # TODO - backbone_properties = {} - return backbone_properties - + encoder_properties = {'has_hidden_states': False, + 'bijective_seq_output': True, + 'fixed_input_seq_length': False + } + return encoder_properties def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: num_channels = [self.config["num_filters_0"]] @@ -140,7 +146,8 @@ def get_hyperparameter_search_space( default_value=5), num_filters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_filters", value_range=(4, 64), - default_value=32), + default_value=32, + log=True), kernel_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="kernel_size", value_range=(4, 64), default_value=32), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TimeSeriesMLPBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TimeSeriesMLPBackbone.py index b412a9622..d48910342 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TimeSeriesMLPBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TimeSeriesMLPBackbone.py @@ -1,21 +1,21 @@ -from typing import Any, Dict, List, Optional, Union - - -from typing import Tuple -from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone \ - import BaseForecastingNetworkBackbone +from typing import Any, Dict, List, Optional, Union, Tuple import torch from torch import nn +from ConfigSpace import ConfigurationSpace + +from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone \ + import BaseForecastingNetworkBackbone, EncoderNetwork from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.utils.common import FitRequirement +from autoPyTorch.pipeline.components.setup.network_backbone.utils import _activations +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace -class _TimeSeriesMLP(nn.Module): +class _TimeSeriesMLP(EncoderNetwork): def __init__(self, window_size: int, module_layers: nn.Module, @@ -24,19 +24,29 @@ def __init__(self, self.window_size = window_size self.module_layers = module_layers - def forward(self, x: torch.Tensor): + def forward(self, x: torch.Tensor, output_seq: bool = False): """ Args: - x: torch.Tensor(batch_size, window_size, num_features) + x: torch.Tensor(B, L_in, N) + output_seq (bool), if the MLP outputs a squence, in which case, the input will be rolled to fit 
the size of + the network. For Instance if self.window_size = 3, and we obtain a squence with [1, 2, 3, 4, 5] + the input of this mlp is rolled as : + [[1, 2, 3] + [2, 3, 4] + [3, 4 ,5]] Returns: """ - if x.shape[1] > self.window_size: - # we need to ensure that the input size fits the - x = x[:, -self.window_size:] - x = x.view(x.shape[0], -1) + if output_seq: + x = x.unfold((1, self.window_size, 1)).transpose(-1, -2) + # x.shape = [B, L_in - self.window + 1, self.window, N] + else: + if x.shape[1] > self.window_size: + # we need to ensure that the input size fits the network shape + x = x[:, -self.window_size:] # x.shape = (B, self.window, N) + x = x.flatten(-2) return self.module_layers(x) @@ -46,10 +56,12 @@ class TimeSeriesMLPBackbone(BaseForecastingNetworkBackbone, MLPBackbone): @property def encoder_properties(self): - backbone_properties = { - 'fixed_input_shape': True, # the network has a fixed input shape, this is used to indicate output shape - } - return backbone_properties + encoder_properties = { + 'has_hidden_states': False, + 'bijective_seq_output': False, + 'fixed_input_seq_length': True, + } + return encoder_properties @property def _required_fit_arguments(self) -> List[FitRequirement]: @@ -75,3 +87,36 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT 'handles_image': False, 'handles_time_series': True, } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + num_groups: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_groups", + value_range=(1, 15), + default_value=5, + ), + activation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="activation", + value_range=tuple(_activations.keys()), + default_value=list(_activations.keys())[ + 0], + ), + use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="use_dropout", + value_range=(True, False), + default_value=False, + ), + num_units: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_units", + value_range=(16, 1024), + default_value=256, + log=True + ), + dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dropout", + value_range=(0, 0.8), + default_value=0.5, + ), + ) -> ConfigurationSpace: + return MLPBackbone.get_hyperparameter_search_space(dataset_properties=dataset_properties, + num_groups=num_groups, + activation=activation, + use_dropout=use_dropout, + num_units=num_units, + dropout=dropout) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/base_forecasting_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/base_forecasting_backbone.py index 920fe7946..5bd411020 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/base_forecasting_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/base_forecasting_backbone.py @@ -1,6 +1,8 @@ from abc import abstractmethod from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import NetworkBackboneComponent +import torch +from torch import nn from abc import abstractmethod from typing import Any, Dict, Iterable, Optional, Tuple, List @@ -8,10 +10,28 @@ from autoPyTorch.pipeline.components.base_component import BaseEstimator +class EncoderNetwork(nn.Module): + def forward(self, x: torch.Tensor, output_seq: bool = False): + """ + Base 
forecasting network, its output needs to be a 2-d or 3-d Tensor: + When the decoder is an auto-regressive model, then it needs to output a 3-d Tensor, in which case, output_seq + needs to be set as True + When the decoder is a seq2seq model, the network needs to output a 2-d Tensor (B, N), in which case, + output_seq needs to be set as False + + Args: + x: torch.Tensor(B, L_in, N) + output_seq (bool), if the network outputs a sequence tensor. If it is set True, + output will be a 3-d Tensor (B, L_out, N). L_out = L_in if encoder_properties['recurrent'] is True. + If this value is set as False, the network only returns the last item of the sequence. + Returns: + net_output: torch.Tensor with shape either (B, N) or (B, L_out, N) + + """ + raise NotImplementedError + + class BaseForecastingNetworkBackbone(NetworkBackboneComponent): - """ - Base forecasting network, its output needs to be a 3-d Tensor: - """ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: return super().fit(X, y) @@ -21,6 +41,20 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: return X @property - @abstractmethod def encoder_properties(self): - raise NotImplementedError + """ + Encoder properties, this determines how the data flows over the forecasting networks + + has_hidden_states, it determines if the network contains hidden states and thus return or accept the hidden + states + bijective_seq_output, determines if the network returns a sequence with the same sequence length as the input + sequence when output_seq is set True + fix_input_shape if the input shape is fixed, this is useful for building network head + """ + # TODO make use of bijective_seq_output in trainer!!! + encoder_properties = {'has_hidden_states': False, + 'bijective_seq_output': False, + 'fixed_input_seq_length': False + } + return encoder_properties + diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index 1f9530a6c..b7d4ba2af 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -16,20 +16,20 @@ } -def get_output_shape(network: torch.nn.Module, input_shape: Tuple[int, ...], network_output_tuple: bool = False +def get_output_shape(network: torch.nn.Module, input_shape: Tuple[int, ...], has_hidden_states: bool = False ) -> Tuple[int, ...]: """ Run a dummy forward pass to get the output shape of the backbone. Can and should be overridden by subclasses that know the output shape without running a dummy forward pass. :param input_shape: shape of the input - : network_output_tuple: bool, if the network backbone output a tuple. if yes, the shape of the first output is - returned + :param has_hidden_states: bool, if the network backbone contains a hidden_states. 
if yes, the network will return a Tuple, + we will then only consider the first item :return: output_shape """ placeholder = torch.randn((2, *input_shape), dtype=torch.float) with torch.no_grad(): - if network_output_tuple: + if has_hidden_states: output = network(placeholder)[0] else: output = network(placeholder) diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingFullyConnectedHeader.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingFullyConnectedHeader.py deleted file mode 100644 index 24bd5eebe..000000000 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingFullyConnectedHeader.py +++ /dev/null @@ -1,79 +0,0 @@ -from abc import ABC -from typing import Dict, Optional, Tuple, Union, List - -from torch import nn - -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter - -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_head.utils import _activations -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter - -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import \ - ForecastingHead -from autoPyTorch.pipeline.components.setup.network_head.fully_connected import FullyConnectedHead -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS - - -class ForecastingFullyConnectedHeader(ForecastingHead, FullyConnectedHead): - @property - def decoder_properties(self): - decoder_properties = {} - return decoder_properties - - def _build_head(self, input_shape: Tuple[int, ...], **arch_kwargs) -> Tuple[List[nn.Module], int]: - layers = [] - in_features = input_shape[-1] - for i in range(1, self.config["num_layers"]): - layers.append(nn.Linear(in_features=in_features, - out_features=self.config[f"units_layer_{i}"])) - layers.append(_activations[self.config["activation"]]()) - in_features = self.config[f"units_layer_{i}"] - head_base_output_features = in_features - - return layers, head_base_output_features - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'ForecastingFullyConnectedHead', - 'name': 'ForecastingFullyConnectedHead', - 'handles_tabular': False, - 'handles_image': False, - 'handles_time_series': True, - } - - @staticmethod - def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - num_layers: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_layers", - value_range=(1, 4), - default_value=2), - units_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="units_layer", - value_range=(64, 512), - default_value=128), - activation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="activation", - value_range=tuple(_activations.keys()), - default_value=list(_activations.keys())[ - 0]), - dist_cls: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dist_cls", - value_range=tuple(ALL_DISTRIBUTIONS.keys()), - default_value= - list(ALL_DISTRIBUTIONS.keys())[0]), - auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="auto_regressive", - value_range=(True, False), - default_value=False) - ) -> 
ConfigurationSpace: - cs = FullyConnectedHead.get_hyperparameter_search_space(dataset_properties=dataset_properties, - num_layers=num_layers, - units_layer=units_layer, - activation=activation) - - # These two HPs are inactivate if loss type is regression - # TODO add that in the pipeline part - add_hyperparameter(cs, dist_cls, CategoricalHyperparameter) - # TODO let dataset_properties decide if autoregressive models is appliable - add_hyperparameter(cs, auto_regressive, CategoricalHyperparameter) - return cs diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingMLPHead.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingMLPHead.py new file mode 100644 index 000000000..0f82ee8a8 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingMLPHead.py @@ -0,0 +1,146 @@ +from abc import ABC +from typing import Dict, Optional, Tuple, Union, List + +from torch import nn + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter +from ConfigSpace.conditions import GreaterThanCondition + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.network_head.utils import _activations +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import \ + ForecastingHead +from autoPyTorch.pipeline.components.setup.network_head.fully_connected import FullyConnectedHead +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS + + +class ForecastingMLPHeader(ForecastingHead, FullyConnectedHead): + @property + def decoder_properties(self): + decoder_properties = {'has_hidden_states': False, + 'recurrent': False, + } + return decoder_properties + + def _build_head(self, input_shape: Tuple[int, ...], **arch_kwargs) -> Tuple[List[nn.Module], int]: + layers = [] + in_features = input_shape[-1] + n_prediction_steps = arch_kwargs['n_prediction_heads'] + if self.config["num_layers"] > 0: + for i in range(1, self.config["num_layers"]): + layers.append(nn.Linear(in_features=in_features, + out_features=self.config[f"units_layer_{i}"])) + layers.append(_activations[self.config["activation"]]()) + in_features = self.config[f"units_layer_{i}"] + layers.append(nn.Linear(in_features=in_features, + out_features=self.config['units_final_layer'] * n_prediction_steps)) + if 'activation' in self.config: + layers.append(_activations[self.config["activation"]]()) + head_base_output_features = self.config['units_final_layer'] * n_prediction_steps + + return layers, head_base_output_features + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'ForecastingMLPHead', + 'name': 'ForecastingMLPHead', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + num_layers: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_layers", + value_range=(0, 3), + default_value=1), + units_layer: HyperparameterSearchSpace = 
HyperparameterSearchSpace(hyperparameter="units_layer", + value_range=(64, 512), + default_value=128, + log=True), + activation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="activation", + value_range=tuple(_activations.keys()), + default_value=list(_activations.keys())[ + 0]), + units_final_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="units_final_layer", + value_range=(16, 128), + default_value=32, + log=True), + auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="auto_regressive", + value_range=(True, False), + default_value=False) + ) -> ConfigurationSpace: + """ + Builds the mlp head layer. The decoder implementation follows the idea from: + + Wen et al, A Multi-Horizon Quantile Recurrent Forecaster, NeurIPS 2017, Time Series Workshop + https://arxiv.org/abs/1711.11053 + + This model acts as the global MLP, local MLP is implemented under forecasting_head, that maps the output + features to the final output + + Additionally, this model also determines if DeepAR is applied to do prediction + + Salinas et al. DeepAR: Probabilistic Forecasting with Autoregressive Recurrent Networks + https://arxiv.org/abs/1704.04110 + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): Dataset Properties + num_layers (HyperparameterSearchSpace): number of decoder layers (the last layer is not included, thus it + could start from 0) + units_layer (HyperparameterSearchSpace): number of units of each layer (except for the last layer) + activation (HyperparameterSearchSpace): activation function + units_final_layer (HyperparameterSearchSpace): number of units of final layer. The size of this layer is + smaller as it needs to be expanded to adapt to the number of predictions + dist_cls (HyperparameterSearchSpace): only activate when required_output_tpe is distribution, the sorts of + distribution that the network could output + auto_regressive (HyperparameterSearchSpace): if the model acts as a DeepAR model + Returns: + cs (ConfigurationSpace): ConfigurationSpace + """ + cs = ConfigurationSpace() + + min_num_layers: int = num_layers.value_range[0] # type: ignore + max_num_layers: int = num_layers.value_range[-1] # type: ignore + num_layers_is_constant = (min_num_layers == max_num_layers) + + num_layers_hp = get_hyperparameter(num_layers, UniformIntegerHyperparameter) + activation_hp = get_hyperparameter(activation, CategoricalHyperparameter) + cs.add_hyperparameter(num_layers_hp) + + if not num_layers_is_constant: + cs.add_hyperparameter(activation_hp) + # HERE WE replace 1 with 0 to be compatible with our modification + cs.add_condition(GreaterThanCondition(activation_hp, num_layers_hp, 0)) + elif max_num_layers > 1: + # only add activation if we have more than 1 layer + cs.add_hyperparameter(activation_hp) + + for i in range(1, max_num_layers + 1): + num_units_search_space = HyperparameterSearchSpace( + hyperparameter=f"units_layer_{i}", + value_range=units_layer.value_range, + default_value=units_layer.default_value, + log=units_layer.log, + ) + num_units_hp = get_hyperparameter(num_units_search_space, UniformIntegerHyperparameter) + cs.add_hyperparameter(num_units_hp) + + if i >= min_num_layers and not num_layers_is_constant: + # In the case of a constant, the max and min number of layers are the same. + # So no condition is needed. If it is not a constant but a hyperparameter, + # then a condition has to be made so that it accounts for the value of the + # hyperparameter. 
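                # Example: if num_layers is sampled as 2, units_layer_1 stays active (2 > 1) while
                # units_layer_2 is deactivated (2 > 2 is False). This mirrors _build_head above,
                # which only instantiates layers 1 .. num_layers - 1 before the final
                # units_final_layer * n_prediction_heads projection.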
+ cs.add_condition(GreaterThanCondition(num_units_hp, num_layers_hp, i)) + + add_hyperparameter(cs, units_final_layer, UniformIntegerHyperparameter) + + # TODO let dataset_properties decide if auto_regressive models is applicable + add_hyperparameter(cs, auto_regressive, CategoricalHyperparameter) + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingRNNHeader.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingRNNHead.py similarity index 79% rename from autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingRNNHeader.py rename to autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingRNNHead.py index 7cb3e3865..0678768d0 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingRNNHeader.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingRNNHead.py @@ -1,5 +1,5 @@ from abc import ABC -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, List import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace @@ -18,7 +18,7 @@ ForecastingHead from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter, FitRequirement class _RNN_Decoder(nn.Module): @@ -51,27 +51,41 @@ class ForecastingRNNHeader(ForecastingHead): """ Standard searchable RNN decoder for time series data, only works when the encoder is """ + def __init__(self, **kwargs: Dict): super().__init__(**kwargs) # RNN is naturally auto-regressive. However, we will not consider it as a decoder for deep AR model self.auto_regressive = True + self.rnn_kwargs = None + + @property + def _required_fit_requirements(self) -> List[FitRequirement]: + fit_requirement = super(ForecastingRNNHeader, self)._required_fit_requirements + fit_requirement.append(FitRequirement('rnn_kwargs', (Dict,), user_defined=False, dataset_property=False)) + return fit_requirement def _build_head(self, input_shape: Tuple[int, ...], **arch_kwargs) -> nn.Module: + # RNN decoder only allows RNN encoder, these parameters need to exists. 
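        # 'rnn_kwargs' is written into the fit dictionary by RNNBackbone.transform(), so the decoder
        # is built with the same hidden size and cell type as the encoder; for a bidirectional
        # encoder the number of decoder layers is doubled so that its hidden-state shape matches the
        # (num_layers * 2, batch, hidden_size) states handed over by the encoder.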
+ hidden_size = self.rnn_kwargs['hidden_size'] + num_layers = 2 * self.rnn_kwargs['num_layers'] if self.rnn_kwargs['bidirectional'] else self.rnn_kwargs['num_layers'] + cell_type = self.rnn_kwargs['cell_type'] head = _RNN_Decoder(in_features=input_shape[-1], config=self.config, - **arch_kwargs) + hidden_size=hidden_size, + num_layers=num_layers, + cell_type=cell_type) self.head = head return head @property def decoder_properties(self): - decoder_properties = {'network_output_tuple': True, - 'accept_additional_input': True, - 'recurrent': True} + decoder_properties = {'has_hidden_states': True, + 'recurrent': True, + } return decoder_properties def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - X['network_output_tuple'] = True + self.rnn_kwargs = X['rnn_kwargs'] return super().fit(X, y) @property @@ -101,10 +115,6 @@ def get_hyperparameter_search_space( dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='dropout', value_range=(0., 0.5), default_value=0.2), - dist_cls: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dist_cls", - value_range=tuple(ALL_DISTRIBUTIONS.keys()), - default_value= - list(ALL_DISTRIBUTIONS.keys())[0]), ) -> ConfigurationSpace: cs = CS.ConfigurationSpace() @@ -117,7 +127,4 @@ def get_hyperparameter_search_space( # Hidden size is given by the encoder architecture cs.add_condition(CS.EqualsCondition(dropout, use_dropout, True)) - add_hyperparameter(cs, dist_cls, CategoricalHyperparameter) - return cs - diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py index 8ac7623d6..404f51b2f 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py @@ -38,12 +38,16 @@ class ProjectionLayer(nn.Module): """ - A projection layer that + A projection layer that project features to a torch distribution """ + value_in_support = 0.0 + # https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/torch/modules/distribution_output.py + def __init__(self, num_in_features: int, output_shape: Tuple[int, ...], + auto_regressive: bool, **kwargs, ): super().__init__(**kwargs) @@ -60,8 +64,12 @@ def build_single_proj_layer(arg_dim): Returns: """ - return nn.Sequential(nn.Linear(num_in_features, np.prod(output_shape).item() * arg_dim), - nn.Unflatten(-1, (*output_shape, arg_dim))) + if not auto_regressive: + return nn.Sequential(nn.Linear(num_in_features, np.prod(output_shape).item() * arg_dim), + nn.Unflatten(-1, (*output_shape, arg_dim))) + else: + return nn.Sequential(nn.Unflatten(-1, *output_shape), + nn.Linear(num_in_features, arg_dim)) self.proj = nn.ModuleList( [build_single_proj_layer(dim) for dim in self.arg_dims.values()] @@ -127,6 +135,8 @@ def dist_cls(self) -> type(Distribution): class BetaOutput(ProjectionLayer): + value_in_support = 0.5 + @property def arg_dims(self) -> Dict[str, int]: return {"concentration1": 1, "concentration0": 1} @@ -137,14 +147,16 @@ def domain_map(self, concentration1: torch.Tensor, concentration0: torch.Tensor) epsilon = 1e-10 concentration1 = F.softplus(concentration1) + epsilon concentration0 = F.softplus(concentration0) + epsilon - return concentration1.squeeze(-1), concentration0.squeeze(-1).squeeze(-1) + return concentration1.squeeze(-1), concentration0.squeeze(-1) @property def dist_cls(self) -> 
type(Distribution): + # TODO there is a bug with Beta implementation!!! return Beta class GammaOutput(ProjectionLayer): + value_in_support = 0.5 @property def arg_dims(self) -> Dict[str, int]: return {"concentration": 1, "rate": 1} @@ -178,9 +190,12 @@ def dist_cls(self) -> type(Distribution): ALL_DISTRIBUTIONS = {'studentT': StudentTOutput, 'normal': NormalOutput, - 'beta': BetaOutput, - 'gamma': GammaOutput, - 'poisson': PoissonOutput} # type: Dict[str, type(ProjectionLayer)] + #'beta': BetaOutput, + #'gamma': GammaOutput, + #'poisson': PoissonOutput + } # type: Dict[str, ProjectionLayer] + +# TODO find components that are compatible with beta, gamma and poisson distrubtion! # TODO consider how to implement NegativeBinomialOutput without scale information # class NegativeBinomialOutput(ProjectionLayer): diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index c10dcbecb..3e7cc8fe8 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -9,7 +9,7 @@ from autoPyTorch.pipeline.components.setup.network_head.base_network_head import NetworkHeadComponent from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import\ +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import \ ALL_DISTRIBUTIONS, ProjectionLayer @@ -24,7 +24,17 @@ class ForecastingHead(NetworkHeadComponent): def __init__(self, **kwargs: Any): super().__init__() - self.add_fit_requirements([ + self.add_fit_requirements(self._required_fit_requirements) + self.head: Optional[nn.Module] = None + self.required_net_out_put_type: Optional[str] = None + self.auto_regressive = kwargs.get('auto_regressive', False) + + self.config = kwargs + + + @property + def _required_fit_requirements(self) -> List[FitRequirement]: + return [ FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), FitRequirement('encoder_properties', (str,), user_defined=False, dataset_property=False), @@ -32,17 +42,15 @@ def __init__(self, FitRequirement('output_shape', (Iterable, int), user_defined=True, dataset_property=True), FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), FitRequirement('loss_type', (str,), user_defined=False, dataset_property=False) - # TODO add loss type - ]) - self.head: Optional[nn.Module] = None - self.required_net_out_put_type: Optional[str] = None - self.auto_regressive = kwargs.get('auto_regressive', False) - - self.config = kwargs + ] @property def decoder_properties(self): - decoder_property = {} + decoder_property = {'additional_output': False, + 'additional_input': False, + 'fixed_input_seq_length': False, + 'recurrent': False, + } return decoder_property def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: @@ -60,22 +68,41 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.required_net_out_put_type = X['required_net_out_put_type'] - auto_regressive = self.config.get("auto_regressive", False) + if self.required_net_out_put_type == 
'distribution': + if 'dist_cls' not in X: + raise ValueError('Distribution output type must contain dist_cls!!') + + dist_cls = X.get('dist_cls', None) + + auto_regressive = self.auto_regressive X.update({"auto_regressive": auto_regressive}) encoder_properties = X['encoder_properties'] + # for information about encoder_properties, please check + # autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone + # TODO create a separate module so that users could know what is contained in encoder_properties + # TODO consider Auto-regressive model on vanilla network head if auto_regressive: - output_shape[0] = 1 - fixed_input_shape = encoder_properties.get("fixed_input_shape", False) - network_output_tuple = encoder_properties.get("network_output_tuple", False) - arch_kwargs = encoder_properties.get("arch_kwargs", {}) - if fixed_input_shape: + output_shape = output_shape[1:] + n_prediction_heads = 1 + else: + n_prediction_heads = output_shape[0] + + fixed_input_seq_length = encoder_properties.get("fixed_input_seq_length", False) + has_hidden_states = encoder_properties.get("has_hidden_states", False) + + if fixed_input_seq_length: input_shape = (X["window_size"], input_shape[-1]) + + arch_kwargs = {'n_prediction_heads': n_prediction_heads} + self.head = self.build_head( input_shape=get_output_shape(X['network_backbone'], input_shape=input_shape, - network_output_tuple=network_output_tuple), + has_hidden_states=has_hidden_states), output_shape=output_shape, + auto_regressive=auto_regressive, + dist_cls=dist_cls, **arch_kwargs, ) return self @@ -89,13 +116,14 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: (Dict[str, Any]): the updated 'X' dictionary """ - X = super().transform(X) X.update({'decoder_properties': self.decoder_properties}) - return X + return super().transform(X) def build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], + auto_regressive: bool = False, + dist_cls: Optional[str] = None, **arch_kwargs: Dict) -> nn.Module: """ Builds the head module and returns it @@ -103,6 +131,8 @@ def build_head(self, Args: input_shape (Tuple[int, ...]): shape of the input to the head (usually the shape of the backbone output) output_shape (Tuple[int, ...]): shape of the output of the head + auto_regressive (bool): if the network is auto-regressive + dist_cls (Optional[str]): output distribution, only works if required_net_out_put_type is 'distribution' arch_kwargs (Dict): additional paramter for initializing architectures. 
Returns: @@ -110,13 +140,14 @@ def build_head(self, """ base_header_layer, num_head_base_output_features = self._build_head(input_shape, **arch_kwargs) proj_layer = [] - # TODO consider local output layer introduced in Wen et al, A Multi-Horizon Quantile Recurrent Forecaster - output_layer = self.build_proj_layer(num_head_base_output_features=num_head_base_output_features, - output_shape=output_shape, - net_out_put_type=self.required_net_out_put_type, - dist_cls=self.config.get('dist_cls', None) - ) + output_layer = self.build_proj_layer( + num_head_base_output_features=num_head_base_output_features, + output_shape=output_shape, + auto_regressive=auto_regressive, + net_out_put_type=self.required_net_out_put_type, + dist_cls=dist_cls, + ) proj_layer.append(output_layer) return nn.Sequential(*base_header_layer, *proj_layer) @@ -138,6 +169,7 @@ def _build_head(self, input_shape: Tuple[int, ...], **arch_kwargs) -> Tuple[List @staticmethod def build_proj_layer(num_head_base_output_features: int, output_shape: Tuple[int, ...], + auto_regressive: bool, net_out_put_type: str, dist_cls: Optional[str] = None) -> torch.nn.Module: """ @@ -146,18 +178,23 @@ def build_proj_layer(num_head_base_output_features: int, num_head_base_output_features (int): output feature of head base, is used to initialize size of the linear layer output_shape (Tuple[int, ..]): deserved output shape + auto_regressive (bool): if the network is auto-regressive net_out_put_type (str), type of the loss, it determines the output of the network dist_cls (str), distribution class, only activate if output is a distribution Returns: proj_layer: nn.Module projection layer that maps the features to the final output + required_padding_value: float, + which values need to be padded when loadding the data + """ if net_out_put_type == 'distribution': if dist_cls not in ALL_DISTRIBUTIONS.keys(): raise ValueError(f'Unsupported distribution class type: {dist_cls}') proj_layer = ALL_DISTRIBUTIONS[dist_cls](num_in_features=num_head_base_output_features, - output_shape=output_shape, ) + output_shape=output_shape, + auto_regressive=auto_regressive) return proj_layer elif net_out_put_type == 'regression': proj_layer = nn.Sequential(nn.Linear(num_head_base_output_features, np.product(output_shape)), diff --git a/autoPyTorch/pipeline/components/setup/network_head/fully_connected.py b/autoPyTorch/pipeline/components/setup/network_head/fully_connected.py index b16ed01d5..3a853648f 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/fully_connected.py +++ b/autoPyTorch/pipeline/components/setup/network_head/fully_connected.py @@ -21,13 +21,6 @@ class FullyConnectedHead(NetworkHeadComponent): """ def build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...]) -> nn.Module: - layers, head_base_output_features = self._build_head(input_shape) - out_features = np.prod(output_shape).item() - layers.append(nn.Linear(in_features=head_base_output_features, - out_features=out_features)) - return nn.Sequential(*layers) - - def _build_head(self, input_shape: Tuple[int, ...]) -> Tuple[List[nn.Module], int]: layers = [nn.Flatten()] in_features = np.prod(input_shape).item() for i in range(1, self.config["num_layers"]): @@ -35,8 +28,10 @@ def _build_head(self, input_shape: Tuple[int, ...]) -> Tuple[List[nn.Module], in out_features=self.config[f"units_layer_{i}"])) layers.append(_activations[self.config["activation"]]()) in_features = self.config[f"units_layer_{i}"] - head_base_output_features = in_features - return layers, 
head_base_output_features + out_features = np.prod(output_shape).item() + layers.append(nn.Linear(in_features=in_features, + out_features=out_features)) + return nn.Sequential(*layers) @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 2f5c5832a..a557c05f6 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -193,6 +193,7 @@ def __init__(self, # self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 self.subseq_length = self.window_size self.num_batches_per_epoch = num_batches_per_epoch if num_batches_per_epoch is not None else np.inf + self.padding_value = 0.0 def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: """ @@ -209,6 +210,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: sample_interval = X.get('sample_interval', 1) self.sample_interval = sample_interval + self.padding_value = X.get('required_padding_value', 0.0) + # self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 # we want models with different sample_interval to have similar length scale self.subseq_length = self.window_size @@ -331,7 +334,8 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform candidate_transformations.append((SequenceBuilder(sample_interval=self.sample_interval, window_size=self.window_size, - subseq_length=self.subseq_length))) + subseq_length=self.subseq_length, + padding_value=self.padding_value))) candidate_transformations.append((ExpandTransformTimeSeries())) if "test" in mode or not X['dataset_properties']['is_small_preprocess']: candidate_transformations.extend(X['preprocess_transforms']) diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 6bda6332a..3acc0afc8 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -21,7 +21,8 @@ from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. 
\ TargetNoScaler import TargetNoScaler from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit -from autoPyTorch.pipeline.components.training.metrics.metrics import MASE_LOSSES +from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNet + from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker @@ -31,7 +32,7 @@ class ForecastingBaseTrainerComponent(BaseTrainerComponent, ABC): def prepare( self, metrics: List[Any], - model: torch.nn.Module, + model: ForecastingNet, criterion: Type[torch.nn.Module], budget_tracker: BudgetTracker, optimizer: Optimizer, @@ -158,6 +159,7 @@ def train_step(self, data: Dict[str, torch.Tensor], targets: Dict[str, Union[tor loss_func = self.criterion_preparation(**criterion_kwargs) loss = loss_func(self.criterion, outputs) + loss.backward() self.optimizer.step() self._scheduler_step(step_interval=StepIntervalUnit.batch, loss=loss.item()) @@ -208,20 +210,22 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, outputs = self.model(X) - outputs = self.rescale_output_distribution(outputs, loc=loc, scale=scale) + outputs_rescaled = self.rescale_output_distribution(outputs, loc=loc, scale=scale) - loss = self.criterion(outputs, targets) + loss = self.criterion(outputs_rescaled, targets) loss_sum += loss.item() * batch_size N += batch_size + + outputs = self.model.pred_from_net_output(outputs).detach().cpu() if loc is None and scale is None: - outputs_data.append(outputs.mean.detach().cpu()) + outputs_data.append(outputs) else: if loc is None: loc = 0. if scale is None: scale = 1. - outputs_data.append(outputs.base_dist.mean.detach().cpu() * scale + loc) + outputs_data.append(outputs * scale + loc) targets_data.append(targets.detach().cpu()) if writer: @@ -230,6 +234,7 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, loss.item(), epoch * len(test_loader) + step, ) + # mase_coefficent has the shape [B, 1, 1] # to be compatible with outputs_data with shape [B, n_prediction_steps, num_output] mase_coefficients = np.expand_dims(torch.cat(mase_coefficients, dim=0).numpy(), axis=[1]) diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index e1c5d1041..4132cca55 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -6,7 +6,7 @@ from ConfigSpace.configuration_space import Configuration, ConfigurationSpace from ConfigSpace.hyperparameters import Constant from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause, ForbiddenInClause -from ConfigSpace.conditions import EqualsCondition, NotEqualsCondition, AndConjunction +from ConfigSpace.conditions import EqualsCondition, NotEqualsCondition import numpy as np @@ -37,7 +37,7 @@ from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling import \ TargetScalerChoice from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice -from autoPyTorch.pipeline.components.setup.forecasting_training_losses import ForecastingLossChoices +from autoPyTorch.pipeline.components.setup.forecasting_training_loss import ForecastingLossChoices from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import \ TimeSeriesForecastingDataLoader from 
autoPyTorch.pipeline.components.training.trainer.forecasting_trainer import ForecastingTrainerChoice @@ -185,43 +185,53 @@ def _get_hyperparameter_search_space(self, except IndexError: raise ValueError("Cannot find a legal default configuration") cs.get_hyperparameter('network_embedding:__choice__').default_value = default + """ + # in this case we cannot deactivate the hps, we might need to think about this + if 'RegressionLoss' in hp_loss.choices: + forbidden_hp_regression_loss = ForbiddenEqualsClause(hp_loss, 'RegressionLoss') + for hp_dist in hp_distribution_children: + forbidden_hp_dist = ForbiddenEqualsClause(hp_dist, True) + forbidden_hp_dist = AndConjunction(forbidden_hp_dist, forbidden_hp_regression_loss) + forbidden_regression_losses_all.append(forbidden_hp_dist) + else: + for hp_dist in hp_distribution_children: + forbidden_hp_dist = ForbiddenEqualsClause(hp_dist, True) + forbidden_regression_losses_all.append(forbidden_hp_dist) + """ # dist_cls and auto_regressive are only activate if the network outputs distribution if 'loss' in self.named_steps.keys() and 'network_head' in self.named_steps.keys(): hp_loss = cs.get_hyperparameter('loss:__choice__') - hp_distribution_heads = [] + + hp_auto_regressive = [] for hp_name in cs.get_hyperparameter_names(): if hp_name.startswith('network_head:'): - if hp_name.endswith(':dist_cls') or hp_name.endswith(':auto_regressive'): - hp_distribution_heads.append(cs.get_hyperparameter(hp_name)) + if hp_name.endswith(':auto_regressive'): + hp_auto_regressive.append(cs.get_hyperparameter(hp_name)) + + # Auto-Regressive is incompatible with regression losses + forbidden_regression_losses_all = [] + if 'RegressionLoss' in hp_loss.choices: + forbidden_hp_regression_loss = ForbiddenEqualsClause(hp_loss, 'RegressionLoss') + for hp_ar in hp_auto_regressive: + forbidden_hp_dist = ForbiddenEqualsClause(hp_ar, True) + forbidden_hp_dist = ForbiddenAndConjunction(forbidden_hp_dist, forbidden_hp_regression_loss) + forbidden_regression_losses_all.append(forbidden_hp_dist) + + hp_distribution_children = [] + if 'network' in self.named_steps.keys(): + hp_distribution_children.append(cs.get_hyperparameter('network:forecast_strategy')) # in this case we cannot deactivate the hps, we might need to think about this - if 'distribution_losses' in hp_loss.choices: - for hp_dist in hp_distribution_heads: - hp_dist_parent_condition = cs.get_parent_conditions_of(hp_dist.name) - new_cond = AndConjunction(EqualsCondition(hp_dist, hp_loss, 'distribution_losses'), - *hp_dist_parent_condition) - - # TODO: this is only a temporal solution, we need to create a PR for ConfigSpace to allow replacing or - # deleting conditions!!! 
- - # delete the old condition - cs._parents[hp_dist.name] = OrderedDict() - cs._parents[hp_dist.name]['__HPOlib_configuration_space_root__'] = None - cs.add_condition(new_cond) + if 'DistributionLoss' in hp_loss.choices: + for hp_dist in hp_distribution_children: + cs.add_condition(EqualsCondition(hp_dist, hp_loss, 'DistributionLoss')) else: # we set a placeholder and use it to inactivate the related values placeholder = Constant("loss_place_holder", 0) cs.add_hyperparameter(placeholder) - for hp_dist in hp_distribution_heads: - hp_dist_parent_condition = cs.get_parent_conditions_of(hp_dist.name) - new_cond = AndConjunction(NotEqualsCondition(hp_dist, placeholder, 0), - *hp_dist_parent_condition) - - # delete the old condition - cs._parents[hp_dist.name] = OrderedDict() - cs._parents[hp_dist.name]['__HPOlib_configuration_space_root__'] = None - cs.add_condition(new_cond) + for hp_dist in hp_distribution_children: + cs.add_condition(NotEqualsCondition(hp_dist, placeholder, 0)) # rnn head only allow rnn backbone if 'network_backbone' in self.named_steps.keys() and 'network_head' in self.named_steps.keys(): @@ -270,9 +280,9 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties, random_state=self.random_state)), ("network_backbone", ForecastingNetworkBackboneChoice(default_dataset_properties, - random_state=self.random_state)), + random_state=self.random_state)), ("network_head", ForecastingNetworkHeadChoice(default_dataset_properties, - random_state=self.random_state)), + random_state=self.random_state)), ("network", ForecastingNetworkComponent(random_state=self.random_state)), ("network_init", NetworkInitializerChoice(default_dataset_properties, random_state=self.random_state)), From 846c05bfb0e8e559fc4d1161f72719a67b0fd240 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 12 Dec 2021 19:26:16 +0100 Subject: [PATCH 082/347] maint --- .../ForecastingMLPHead.py | 2 +- .../forecasting_network_head/distribution.py | 15 ++++++++++----- .../forecasting_network_head/forecasting_head.py | 9 +++++++-- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingMLPHead.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingMLPHead.py index 0f82ee8a8..d6f1e3b67 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingMLPHead.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingMLPHead.py @@ -39,7 +39,7 @@ def _build_head(self, input_shape: Tuple[int, ...], **arch_kwargs) -> Tuple[List out_features=self.config['units_final_layer'] * n_prediction_steps)) if 'activation' in self.config: layers.append(_activations[self.config["activation"]]()) - head_base_output_features = self.config['units_final_layer'] * n_prediction_steps + head_base_output_features = self.config['units_final_layer'] return layers, head_base_output_features diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py index 404f51b2f..b2fef6abf 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py @@ -47,6 +47,7 @@ class 
ProjectionLayer(nn.Module): def __init__(self, num_in_features: int, output_shape: Tuple[int, ...], + n_prediction_heads: int, auto_regressive: bool, **kwargs, ): super().__init__(**kwargs) @@ -58,18 +59,22 @@ def build_single_proj_layer(arg_dim): """ build a single proj layer given the input dims, the output is unflattened to fit the required output_shape and n_prediction_steps. + we note that output_shape's first dimensions is always n_prediction_steps Args: arg_dim: dimension of the target distribution Returns: """ - if not auto_regressive: - return nn.Sequential(nn.Linear(num_in_features, np.prod(output_shape).item() * arg_dim), - nn.Unflatten(-1, (*output_shape, arg_dim))) + if auto_regressive: + unflatten_layer = [] else: - return nn.Sequential(nn.Unflatten(-1, *output_shape), - nn.Linear(num_in_features, arg_dim)) + # we need to unflatten the input from 2D to 3D such that local MLP can be applied to each prediction + # separately + unflatten_layer = [nn.Unflatten(-1, (n_prediction_heads, num_in_features))] + return nn.Sequential(*unflatten_layer, + nn.Linear(num_in_features, np.prod(output_shape).item() * arg_dim), + nn.Unflatten(-1, (*output_shape, arg_dim))) self.proj = nn.ModuleList( [build_single_proj_layer(dim) for dim in self.arg_dims.values()] diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 3e7cc8fe8..95653968a 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -84,10 +84,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: # TODO consider Auto-regressive model on vanilla network head if auto_regressive: - output_shape = output_shape[1:] n_prediction_heads = 1 else: n_prediction_heads = output_shape[0] + # output shape now doe not contain information about n_prediction_steps + output_shape = output_shape[1:] fixed_input_seq_length = encoder_properties.get("fixed_input_seq_length", False) has_hidden_states = encoder_properties.get("has_hidden_states", False) @@ -147,7 +148,8 @@ def build_head(self, auto_regressive=auto_regressive, net_out_put_type=self.required_net_out_put_type, dist_cls=dist_cls, - ) + n_prediction_heads=arch_kwargs['n_prediction_heads'] + ) proj_layer.append(output_layer) return nn.Sequential(*base_header_layer, *proj_layer) @@ -169,6 +171,7 @@ def _build_head(self, input_shape: Tuple[int, ...], **arch_kwargs) -> Tuple[List @staticmethod def build_proj_layer(num_head_base_output_features: int, output_shape: Tuple[int, ...], + n_prediction_heads: int, auto_regressive: bool, net_out_put_type: str, dist_cls: Optional[str] = None) -> torch.nn.Module: @@ -178,6 +181,7 @@ def build_proj_layer(num_head_base_output_features: int, num_head_base_output_features (int): output feature of head base, is used to initialize size of the linear layer output_shape (Tuple[int, ..]): deserved output shape + n_prediction_heads: int, how many steps the head want to predict auto_regressive (bool): if the network is auto-regressive net_out_put_type (str), type of the loss, it determines the output of the network dist_cls (str), distribution class, only activate if output is a distribution @@ -194,6 +198,7 @@ def build_proj_layer(num_head_base_output_features: int, raise ValueError(f'Unsupported distribution class type: {dist_cls}') proj_layer 
= ALL_DISTRIBUTIONS[dist_cls](num_in_features=num_head_base_output_features, output_shape=output_shape, + n_prediction_heads=n_prediction_heads, auto_regressive=auto_regressive) return proj_layer elif net_out_put_type == 'regression': From efceca1d521ace2a649157626d2307d4fc0c4fca Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 13 Dec 2021 18:25:08 +0100 Subject: [PATCH 083/347] maint --- autoPyTorch/api/time_series_forecasting.py | 2 - autoPyTorch/pipeline/base_pipeline.py | 1 - .../setup/network/forecasting_network.py | 42 ++++++++++++++----- .../ForecastingMLPHead.py | 2 +- .../forecasting_head.py | 16 +++++-- .../forecasting_base_trainer.py | 29 ++++++++----- .../pipeline/time_series_forecasting.py | 35 ++++++++++------ 7 files changed, 85 insertions(+), 42 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 2784ca67e..7655d9711 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -140,7 +140,6 @@ def search( portfolio_selection: Optional[str] = None, shift_input_data: bool = True, normalize_y: bool = True, - train_with_log_prob: bool = True ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -277,7 +276,6 @@ def search( n_prediction_steps=n_prediction_steps, shift_input_data=shift_input_data, normalize_y=normalize_y, - train_with_log_prob=train_with_log_prob, ) if self.dataset.freq_value is not None or not self.customized_window_size: diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index 70d3fa897..90c0f6362 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -231,7 +231,6 @@ def set_hyperparameters( new_name = param.replace('%s:' % node_name, '', 1) sub_config_dict[new_name] = value - sub_configuration = Configuration(sub_configuration_space, values=sub_config_dict) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 36db59ea1..ff39b4874 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -29,9 +29,9 @@ def __init__(self, encoder_properties: Dict = {}, decoder_properties: Dict = {}, output_type: str = 'regression', - forecast_strategy: str = 'mean', - num_samples: int = 100, - aggregation: str = 'mean' + forecast_strategy: Optional[str] = 'mean', + num_samples: Optional[int] = 100, + aggregation: Optional[str] = 'mean' ): super(ForecastingNet, self).__init__() self.embedding = network_embedding @@ -70,11 +70,11 @@ def pred_from_net_output(self, net_output): if self.forecast_strategy == 'mean': return net_output.mean elif self.forecast_strategy == 'sample': - samples = net_output.sample(self.num_samples) + samples = net_output.sample((self.num_samples, )) if self.aggregation == 'mean': return torch.mean(samples, dim=0) elif self.aggregation == 'median': - return torch.median(samples, 0) + return torch.median(samples, 0)[0] else: raise ValueError(f'Unknown aggregation: {self.aggregation}') else: @@ -92,15 +92,17 @@ def __init__( self, network: Optional[torch.nn.Module] = None, random_state: Optional[np.random.RandomState] = None, - forecast_strategy: str = 'mean', + net_out_type: str = 'regression', + forecast_strategy: Optional[str] = 'mean', num_samples: Optional[int] = None, - aggregation: Optional[str] = None + 
aggregation: Optional[str] = None, + ) -> None: super(ForecastingNetworkComponent, self).__init__(network=network, random_state=random_state) + self.net_out_type = net_out_type self.forecast_strategy = forecast_strategy self.num_samples = num_samples self.aggregation = aggregation - self.output_type = None @property def _required_fit_requirements(self): @@ -119,12 +121,17 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: self.check_requirements(X, y) self.network = torch.nn.Sequential(X['network_embedding'], X['network_backbone'], X['network_head']) + if self.net_out_type != X['required_net_out_put_type']: + raise ValueError(f"network output type must be the same as required_net_out_put_type defiend by " + f"loss function. However, net_out_type is {self.net_out_type} and " + f"required_net_out_put_type is {X['required_net_out_put_type']}") + self.network = ForecastingNet(network_embedding=X['network_embedding'], network_backbone=X['network_backbone'], network_head=X['network_head'], encoder_properties=X['encoder_properties'], decoder_properties=X['decoder_properties'], - output_type=X['required_net_out_put_type'], + output_type=self.net_out_type, forecast_strategy=self.forecast_strategy, num_samples=self.num_samples, aggregation=self.aggregation, @@ -183,6 +190,11 @@ def predict(self, loader: torch.utils.data.DataLoader, @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + net_out_type: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='net_out_type', + value_range=('regression', + 'distribution'), + default_value='distribution' + ), forecast_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='forecast_strategy', value_range=('sample', 'mean'), default_value='mean'), @@ -193,15 +205,23 @@ def get_hyperparameter_search_space( value_range=('mean', 'median'), default_value='mean') ) -> ConfigurationSpace: + """ + prediction steagy + """ cs = ConfigurationSpace() + + net_out_type = get_hyperparameter(net_out_type, CategoricalHyperparameter) + forecast_strategy = get_hyperparameter(forecast_strategy, CategoricalHyperparameter) num_samples = get_hyperparameter(num_samples, UniformIntegerHyperparameter) aggregation = get_hyperparameter(aggregation, CategoricalHyperparameter) + cond_net_out_type = EqualsCondition(forecast_strategy, net_out_type, 'distribution') + cond_num_sample = EqualsCondition(num_samples, forecast_strategy, 'sample') cond_aggregation = EqualsCondition(aggregation, forecast_strategy, 'sample') - cs.add_hyperparameters([forecast_strategy, num_samples, aggregation]) - cs.add_conditions([cond_aggregation, cond_num_sample]) + cs.add_hyperparameters([net_out_type, forecast_strategy, num_samples, aggregation]) + cs.add_conditions([cond_net_out_type, cond_aggregation, cond_num_sample]) return cs diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingMLPHead.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingMLPHead.py index d6f1e3b67..09822cbb9 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingMLPHead.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingMLPHead.py @@ -59,7 +59,7 @@ def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_layers: HyperparameterSearchSpace = 
HyperparameterSearchSpace(hyperparameter="num_layers", value_range=(0, 3), - default_value=1), + default_value=2), units_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="units_layer", value_range=(64, 512), default_value=128, diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 95653968a..2c2df11ef 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -74,7 +74,14 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: dist_cls = X.get('dist_cls', None) + + auto_regressive = self.auto_regressive + + auto_regressive = False # TODO implement auto_regressive mdoels!! + + + X.update({"auto_regressive": auto_regressive}) encoder_properties = X['encoder_properties'] @@ -88,7 +95,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: else: n_prediction_heads = output_shape[0] # output shape now doe not contain information about n_prediction_steps - output_shape = output_shape[1:] fixed_input_seq_length = encoder_properties.get("fixed_input_seq_length", False) has_hidden_states = encoder_properties.get("has_hidden_states", False) @@ -197,13 +203,15 @@ def build_proj_layer(num_head_base_output_features: int, if dist_cls not in ALL_DISTRIBUTIONS.keys(): raise ValueError(f'Unsupported distribution class type: {dist_cls}') proj_layer = ALL_DISTRIBUTIONS[dist_cls](num_in_features=num_head_base_output_features, - output_shape=output_shape, + output_shape=output_shape[1:], n_prediction_heads=n_prediction_heads, auto_regressive=auto_regressive) return proj_layer elif net_out_put_type == 'regression': - proj_layer = nn.Sequential(nn.Linear(num_head_base_output_features, np.product(output_shape)), - nn.Unflatten(-1, *output_shape)) + proj_layer = nn.Sequential(nn.Unflatten(-1, (n_prediction_heads, num_head_base_output_features)), + nn.Linear(num_head_base_output_features, np.product(output_shape[1:])), + # nn.Unflatten(-1, tuple(output_shape)), + ) return proj_layer else: raise ValueError(f"Unsupported network type " diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 3acc0afc8..36fd9f1c5 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -113,16 +113,24 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, else: return loss_sum / N, {} - def rescale_output_distribution(self, - outputs: torch.distributions.Distribution, - loc: Optional[torch.Tensor], - scale: Optional[torch.Tensor]): + def rescale_output(self, + outputs: Union[torch.distributions.Distribution, torch.Tensor], + loc: Optional[torch.Tensor], + scale: Optional[torch.Tensor]): # https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/torch/modules/distribution_output.py if loc is not None or scale is not None: - transfomr = AffineTransform(loc=0.0 if loc is None else loc.to(self.device), - scale=1.0 if scale is None else scale.to(self.device), - ) - outputs = TransformedDistribution(outputs, [transfomr]) + if 
isinstance(outputs, torch.distributions.Distribution): + transform = AffineTransform(loc=0.0 if loc is None else loc.to(self.device), + scale=1.0 if scale is None else scale.to(self.device), + ) + outputs = TransformedDistribution(outputs, [transform]) + else: + if loc is None: + outputs = outputs * scale.to(self.device) + elif scale is None: + outputs = outputs + loc.to(self.device) + else: + outputs = outputs * scale.to(self.device) + loc.to(self.device) return outputs def train_step(self, data: Dict[str, torch.Tensor], targets: Dict[str, Union[torch.Tensor, np.ndarray]]) \ @@ -155,7 +163,7 @@ def train_step(self, data: Dict[str, torch.Tensor], targets: Dict[str, Union[tor self.optimizer.zero_grad() outputs = self.model(X) - outputs = self.rescale_output_distribution(outputs, loc=loc, scale=scale) + outputs = self.rescale_output(outputs, loc=loc, scale=scale) loss_func = self.criterion_preparation(**criterion_kwargs) loss = loss_func(self.criterion, outputs) @@ -210,7 +218,7 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, outputs = self.model(X) - outputs_rescaled = self.rescale_output_distribution(outputs, loc=loc, scale=scale) + outputs_rescaled = self.rescale_output(outputs, loc=loc, scale=scale) loss = self.criterion(outputs_rescaled, targets) @@ -218,6 +226,7 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, N += batch_size outputs = self.model.pred_from_net_output(outputs).detach().cpu() + if loc is None and scale is None: outputs_data.append(outputs) else: diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 4132cca55..3326e4840 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -210,28 +210,37 @@ def _get_hyperparameter_search_space(self, hp_auto_regressive.append(cs.get_hyperparameter(hp_name)) # Auto-Regressive is incompatible with regression losses - forbidden_regression_losses_all = [] + forbidden_losses_all = [] if 'RegressionLoss' in hp_loss.choices: forbidden_hp_regression_loss = ForbiddenEqualsClause(hp_loss, 'RegressionLoss') for hp_ar in hp_auto_regressive: forbidden_hp_dist = ForbiddenEqualsClause(hp_ar, True) forbidden_hp_dist = ForbiddenAndConjunction(forbidden_hp_dist, forbidden_hp_regression_loss) - forbidden_regression_losses_all.append(forbidden_hp_dist) + forbidden_losses_all.append(forbidden_hp_dist) - hp_distribution_children = [] + hp_net_output_type = [] if 'network' in self.named_steps.keys(): - hp_distribution_children.append(cs.get_hyperparameter('network:forecast_strategy')) + hp_net_output_type.append(cs.get_hyperparameter('network:net_out_type')) + + if 'RegressionLoss' in hp_loss.choices: + # TODO Quantile loses need to be added here + forbidden_hp_loss = ForbiddenInClause(hp_loss, ['RegressionLoss']) + # RegressionLos only allow regression hp_net_out + for hp_net_out in hp_net_output_type: + forbidden_hp_dist = ForbiddenInClause(hp_net_out, ['distribution']) + forbidden_hp_dist = ForbiddenAndConjunction(forbidden_hp_dist, forbidden_hp_loss) + forbidden_losses_all.append(forbidden_hp_dist) - # in this case we cannot deactivate the hps, we might need to think about this if 'DistributionLoss' in hp_loss.choices: - for hp_dist in hp_distribution_children: - cs.add_condition(EqualsCondition(hp_dist, hp_loss, 'DistributionLoss')) - else: - # we set a placeholder and use it to inactivate the related values - placeholder = Constant("loss_place_holder", 0) - 
cs.add_hyperparameter(placeholder) - for hp_dist in hp_distribution_children: - cs.add_condition(NotEqualsCondition(hp_dist, placeholder, 0)) + # TODO Quantile loses need to be added here + forbidden_hp_loss = ForbiddenInClause(hp_loss, ['DistributionLoss']) + # DistributionLoss only allow distribution hp_net_out + for hp_net_out in hp_net_output_type: + forbidden_hp_dist = ForbiddenInClause(hp_net_out, ['regression']) + forbidden_hp_dist = ForbiddenAndConjunction(forbidden_hp_dist, forbidden_hp_loss) + forbidden_losses_all.append(forbidden_hp_dist) + + cs.add_forbidden_clauses(forbidden_losses_all) # rnn head only allow rnn backbone if 'network_backbone' in self.named_steps.keys() and 'network_head' in self.named_steps.keys(): From 63a83c0c87f240b771a34680bbc5b373b9dc467f Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Dec 2021 18:51:58 +0100 Subject: [PATCH 084/347] move decoder to network backbone --- .../setup/network/forecasting_network.py | 30 +-- .../network_backbone/base_network_backbone.py | 58 ++--- .../forecasting_decoder/MLPDecoder.py} | 23 +- .../forecasting_decoder/RNNDecoder.py} | 34 ++- .../forecasting_decoder/__init__.py | 198 ++++++++++++++++++ .../base_forecasting_decoder.py | 155 ++++++++++++++ .../InceptionTimeEncoder.py} | 14 +- .../MLPEncoder.py} | 8 +- .../RNNEncoder.py} | 22 +- .../TCNEncoder.py} | 21 +- .../__init__.py | 15 +- .../base_forecasting_encoder.py | 127 +++++++++++ .../base_forecasting_backbone.py | 60 ------ .../forecasting_head.py | 132 ++++-------- .../pipeline/time_series_forecasting.py | 18 +- 15 files changed, 627 insertions(+), 288 deletions(-) rename autoPyTorch/pipeline/components/setup/{network_head/forecasting_network_head/ForecastingMLPHead.py => network_backbone/forecasting_decoder/MLPDecoder.py} (88%) rename autoPyTorch/pipeline/components/setup/{network_head/forecasting_network_head/ForecastingRNNHead.py => network_backbone/forecasting_decoder/RNNDecoder.py} (84%) create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py rename autoPyTorch/pipeline/components/setup/network_backbone/{forecasting_network_backbone/InceptionTimeBackbone.py => forecasting_encoder/InceptionTimeEncoder.py} (95%) rename autoPyTorch/pipeline/components/setup/network_backbone/{forecasting_network_backbone/TimeSeriesMLPBackbone.py => forecasting_encoder/MLPEncoder.py} (95%) rename autoPyTorch/pipeline/components/setup/network_backbone/{forecasting_network_backbone/RNNBackbone.py => forecasting_encoder/RNNEncoder.py} (91%) rename autoPyTorch/pipeline/components/setup/network_backbone/{forecasting_network_backbone/TCNBackbone.py => forecasting_encoder/TCNEncoder.py} (93%) rename autoPyTorch/pipeline/components/setup/network_backbone/{forecasting_network_backbone => forecasting_encoder}/__init__.py (77%) create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py delete mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/base_forecasting_backbone.py diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index ff39b4874..d1df21e3e 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -13,7 
+13,7 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. \ base_target_scaler import BaseTargetScaler -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone \ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_encoder \ import EncoderNetwork from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent @@ -24,7 +24,8 @@ class ForecastingNet(nn.Module): def __init__(self, network_embedding: nn.Module, - network_backbone: EncoderNetwork, + network_encoder: EncoderNetwork, + network_decoder: nn.Module, network_head: nn.Module, encoder_properties: Dict = {}, decoder_properties: Dict = {}, @@ -35,8 +36,9 @@ def __init__(self, ): super(ForecastingNet, self).__init__() self.embedding = network_embedding - self.backbone = network_backbone - self.network_head = network_head + self.encoder = network_encoder + self.decoder = network_decoder + self.head = network_head self.encoder_has_hidden_states = encoder_properties['has_hidden_states'] self.decoder_has_hidden_states = decoder_properties['has_hidden_states'] @@ -56,11 +58,14 @@ def __init__(self, def forward(self, X: torch.Tensor, hx: Optional[Tuple[torch.Tensor]] = None): X = self.embedding(X) if self.encoder_has_hidden_states: - X, hidden_state = self.backbone(X) + X, hidden_state_encoder = self.encoder(X) else: - X = self.backbone(X) - - X = self.network_head(X) + X = self.encoder(X) + if self.decoder_has_hidden_states: + X, hidden_state_decoder = self.decoder(X, hidden_state_encoder) + else: + X = self.decoder(X) + X = self.head(X) return X def pred_from_net_output(self, net_output): @@ -107,9 +112,10 @@ def __init__( @property def _required_fit_requirements(self): return [ - FitRequirement("network_head", (torch.nn.Module,), user_defined=False, dataset_property=False), - FitRequirement("network_backbone", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), + FitRequirement("network_encoder", (torch.nn.Module,), user_defined=False, dataset_property=False), + FitRequirement("network_decoder", (torch.nn.Module,), user_defined=False, dataset_property=False), + FitRequirement("network_head", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("required_net_out_put_type", (str,), user_defined=False, dataset_property=False), FitRequirement("encoder_properties", (Dict,), user_defined=False, dataset_property=False), FitRequirement("decoder_properties", (Dict,), user_defined=False, dataset_property=False), @@ -119,7 +125,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: # Make sure that input dictionary X has the required # information to fit this stage self.check_requirements(X, y) - self.network = torch.nn.Sequential(X['network_embedding'], X['network_backbone'], X['network_head']) if self.net_out_type != X['required_net_out_put_type']: raise ValueError(f"network output type must be the same as required_net_out_put_type defiend by " @@ -127,7 +132,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: f"required_net_out_put_type is {X['required_net_out_put_type']}") self.network = 
ForecastingNet(network_embedding=X['network_embedding'], - network_backbone=X['network_backbone'], + network_encoder=X['network_encoder'], + network_decoder=X['network_decoder'], network_head=X['network_head'], encoder_properties=X['encoder_properties'], decoder_properties=X['decoder_properties'], diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index a159003be..857b812da 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -30,46 +30,21 @@ class NetworkBackboneComponent(autoPyTorchComponent): def __init__(self, **kwargs: Any): super().__init__() - self.add_fit_requirements( - self._required_fit_arguments - ) + self.add_fit_requirements([ + FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), + FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, + dataset_property=False), + FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), + FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False), + FitRequirement('network_embedding', (nn.Module,), user_defined=False, dataset_property=False) + ]) self.backbone: nn.Module = None self.config = kwargs self.input_shape: Optional[Iterable] = None - @property - def _required_fit_arguments(self) -> List[FitRequirement]: - if self.get_properties()['handles_tabular']: - return [ - FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), - FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, - dataset_property=False), - FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), - FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False), - FitRequirement('network_embedding', (nn.Module,), user_defined=False, dataset_property=False) - ] - elif self.get_properties()['handles_time_series']: - return [ - FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), - FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, - dataset_property=False), - FitRequirement('time_series_transformer', (BaseEstimator,), user_defined=False, dataset_property=False), - FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), - ] - elif self.get_properties()['handles_image']: - return [ - FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), - FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, - dataset_property=False), - FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), - ] - else: - raise ValueError('Unsupported task type!') - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: """ Builds the backbone component and assigns it to self.backbone - Args: X (X: Dict[str, Any]): Dependencies needed by current component to perform fit y (Any): not used. 
To comply with sklearn API @@ -78,18 +53,15 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: """ self.check_requirements(X, y) X_train = X['X_train'] - if X["dataset_properties"]["task_type"] == TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING]: - input_shape = X["dataset_properties"]['input_shape'] + + if X["dataset_properties"]["is_small_preprocess"]: + input_shape = X_train.shape[1:] else: + # get input shape by transforming first two elements of the training set + column_transformer = X['tabular_transformer'].preprocessor + input_shape = column_transformer.transform(X_train[:1]).shape[1:] - if X["dataset_properties"]["is_small_preprocess"]: - input_shape = X_train.shape[1:] - else: - # get input shape by transforming first two elements of the training set - transforms = torchvision.transforms.Compose(X['preprocess_transforms']) - input_shape = transforms(X_train[:1, ...]).shape[1:] - if 'network_embedding' in X.keys(): - input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) + input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) self.input_shape = input_shape self.backbone = self.build_backbone( diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingMLPHead.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py similarity index 88% rename from autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingMLPHead.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py index 09822cbb9..9bc132e27 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingMLPHead.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py @@ -11,13 +11,11 @@ from autoPyTorch.pipeline.components.setup.network_head.utils import _activations from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import \ - ForecastingHead -from autoPyTorch.pipeline.components.setup.network_head.fully_connected import FullyConnectedHead -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_decoder.base_forecasting_decoder import \ + BaseForecastingDecoder -class ForecastingMLPHeader(ForecastingHead, FullyConnectedHead): +class ForecastingMLPHeader(BaseForecastingDecoder): @property def decoder_properties(self): decoder_properties = {'has_hidden_states': False, @@ -25,10 +23,9 @@ def decoder_properties(self): } return decoder_properties - def _build_head(self, input_shape: Tuple[int, ...], **arch_kwargs) -> Tuple[List[nn.Module], int]: + def _build_decoder(self, input_shape: Tuple[int, ...], n_prediction_heads: int) -> Tuple[nn.Module, int]: layers = [] in_features = input_shape[-1] - n_prediction_steps = arch_kwargs['n_prediction_heads'] if self.config["num_layers"] > 0: for i in range(1, self.config["num_layers"]): layers.append(nn.Linear(in_features=in_features, @@ -36,19 +33,19 @@ def _build_head(self, input_shape: Tuple[int, ...], **arch_kwargs) -> Tuple[List layers.append(_activations[self.config["activation"]]()) in_features = self.config[f"units_layer_{i}"] layers.append(nn.Linear(in_features=in_features, - 
out_features=self.config['units_final_layer'] * n_prediction_steps)) + out_features=self.config['units_final_layer'] * n_prediction_heads)) if 'activation' in self.config: layers.append(_activations[self.config["activation"]]()) - head_base_output_features = self.config['units_final_layer'] + num_decoder_output_features = self.config['units_final_layer'] - return layers, head_base_output_features + return nn.Sequential(*layers), num_decoder_output_features @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None ) -> Dict[str, Union[str, bool]]: return { - 'shortname': 'ForecastingMLPHead', - 'name': 'ForecastingMLPHead', + 'shortname': 'MLPDecoder', + 'name': 'MLPDecoder', 'handles_tabular': False, 'handles_image': False, 'handles_time_series': True, @@ -98,8 +95,6 @@ def get_hyperparameter_search_space( activation (HyperparameterSearchSpace): activation function units_final_layer (HyperparameterSearchSpace): number of units of final layer. The size of this layer is smaller as it needs to be expanded to adapt to the number of predictions - dist_cls (HyperparameterSearchSpace): only activate when required_output_tpe is distribution, the sorts of - distribution that the network could output auto_regressive (HyperparameterSearchSpace): if the model acts as a DeepAR model Returns: cs (ConfigurationSpace): ConfigurationSpace diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingRNNHead.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py similarity index 84% rename from autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingRNNHead.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py index 0678768d0..30fd6d71a 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/ForecastingRNNHead.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py @@ -14,8 +14,8 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import \ - ForecastingHead +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_decoder.base_forecasting_decoder import \ + BaseForecastingDecoder from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter, FitRequirement @@ -43,11 +43,13 @@ def __init__(self, def forward(self, x: torch.Tensor, hx: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> Tuple[torch.Tensor, ...]: + if x.ndim == 2: + x = x.unsqueeze(1) outputs, hidden_state, = self.lstm(x, hx) - return outputs, hidden_state + return outputs[:, -1, :], hidden_state -class ForecastingRNNHeader(ForecastingHead): +class ForecastingRNNHeader(BaseForecastingDecoder): """ Standard searchable RNN decoder for time series data, only works when the encoder is """ @@ -64,18 +66,18 @@ def _required_fit_requirements(self) -> List[FitRequirement]: fit_requirement.append(FitRequirement('rnn_kwargs', (Dict,), user_defined=False, dataset_property=False)) return fit_requirement - def _build_head(self, input_shape: Tuple[int, ...], **arch_kwargs) -> nn.Module: + def _build_decoder(self, 
input_shape: Tuple[int, ...], n_prediction_heads: int) -> Tuple[List[nn.Module], int]: # RNN decoder only allows RNN encoder, these parameters need to exists. hidden_size = self.rnn_kwargs['hidden_size'] num_layers = 2 * self.rnn_kwargs['num_layers'] if self.rnn_kwargs['bidirectional'] else self.rnn_kwargs['num_layers'] cell_type = self.rnn_kwargs['cell_type'] - head = _RNN_Decoder(in_features=input_shape[-1], - config=self.config, + decoder = _RNN_Decoder(in_features=input_shape[-1], hidden_size=hidden_size, num_layers=num_layers, - cell_type=cell_type) - self.head = head - return head + cell_type=cell_type, + config=self.config, + ) + return decoder, hidden_size @property def decoder_properties(self): @@ -88,19 +90,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.rnn_kwargs = X['rnn_kwargs'] return super().fit(X, y) - @property - def only_return_final_stage(self): - return self.backbone.only_return_final_stage - - @only_return_final_stage.setter - def only_return_final_stage(self, only_return_final_stage): - self.backbone.only_return_final_stage = only_return_final_stage - @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: return { - 'shortname': 'ForecastingRNNHead', - 'name': 'ForecastingRNNHead', + 'shortname': 'RNNDecoder', + 'name': 'RNNDecoder', 'handles_tabular': False, 'handles_image': False, 'handles_time_series': True, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/__init__.py new file mode 100644 index 000000000..1e1498fdd --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/__init__.py @@ -0,0 +1,198 @@ +import os +from collections import OrderedDict +from typing import Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.setup.network_head.base_network_head import ( + NetworkHeadComponent, +) + +from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_decoder.base_forecasting_decoder import ( + BaseForecastingDecoder, +) + +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + find_components, +) + +directory = os.path.split(__file__)[0] +_decoders = find_components(__package__, + directory, + BaseForecastingDecoder) + +_addons = ThirdPartyComponents(BaseForecastingDecoder) + + +class ForecastingDecoderChoice(NetworkBackboneChoice): + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available head components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all NetworkHeadComponents available + as choices for learning rate scheduling + """ + components = OrderedDict() + + components.update(_decoders) + components.update(_addons.components) + + return components + + def get_available_components( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + include: List[str] = None, + 
exclude: List[str] = None, + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): Caracteristics + of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of learning + rate heads + + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + available_comp = self.get_components() + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == ForecastingDecoderChoice or hasattr(entry, 'get_components'): + continue + + task_type = str(dataset_properties['task_type']) + properties = entry.get_properties() + if 'tabular' in task_type and not bool(properties['handles_tabular']): + continue + elif 'image' in task_type and not bool(properties['handles_image']): + continue + elif 'time_series' in task_type and not bool(properties['handles_time_series']): + continue + + # target_type = dataset_properties['target_type'] + # Apply some automatic filtering here for + # heads based on the dataset! + # TODO: Think if there is any case where a head + # is not recommended for a certain dataset + + components_dict[name] = entry + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): Describes the dataset to work on + default (Optional[str]): Default head to use + include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive + list, and will exclusively use this components. 
+ exclude: Optional[Dict[str, Any]]: which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal preprocessors for this problem + available_heads = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + if len(available_heads) == 0: + raise ValueError("No head found") + + if default is None: + defaults = [ + 'MLPDecoder', + 'RNNDecoder', + ] + for default_ in defaults: + if default_ in available_heads: + default = default_ + break + + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_heads): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_heads, + choice_hyperparameter.value_range)) + head = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + head = CSH.CategoricalHyperparameter( + '__choice__', + list(available_heads.keys()), + default_value=default) + cs.add_hyperparameter(head) + for name in head.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_heads[name].get_hyperparameter_search_space(dataset_properties, # type: ignore + **updates) + parent_hyperparameter = {'parent': head, 'value': name} + cs.add_configuration_space( + name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py new file mode 100644 index 000000000..783737b55 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -0,0 +1,155 @@ +from abc import abstractmethod, ABC +from typing import Any, Dict, Iterable, Tuple, List, Optional + +import numpy as np +import torch +from torch import nn + +from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape +from autoPyTorch.utils.common import FitRequirement +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import \ + ALL_DISTRIBUTIONS, ProjectionLayer +from autoPyTorch.pipeline.components.base_component import BaseEstimator, autoPyTorchComponent + + +class BaseForecastingDecoder(autoPyTorchComponent): + """ + Base class for network heads used for forecasting. + Holds the head module and the config which was used to create it. 
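# Illustrative sketch, not part of the patch: it shows, assuming this patch is applied, how a
# concrete decoder plugs into the BaseForecastingDecoder contract introduced here. _build_decoder()
# receives the encoder output shape plus n_prediction_heads and must return the decoder module
# together with the number of features it emits per prediction head (the same convention the
# MLPDecoder above follows). The class name and the layer width are hypothetical; a third-party
# decoder like this could be registered through the module's ThirdPartyComponents instance
# (_addons.add_component(...)), after which ForecastingDecoderChoice.get_components() would list it.
from typing import Any, Dict, Optional, Tuple

from torch import nn

from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_decoder.base_forecasting_decoder import (
    BaseForecastingDecoder,
)


class TinyLinearDecoder(BaseForecastingDecoder):
    """Hypothetical decoder: a single linear layer shared across all prediction heads."""

    def _build_decoder(self, input_shape: Tuple[int, ...], n_prediction_heads: int) -> Tuple[nn.Module, int]:
        n_out = 32  # arbitrary width for this sketch
        # The forecasting head later splits the last dimension into (n_prediction_heads, n_out),
        # so the decoder emits n_prediction_heads * n_out features per sample.
        decoder = nn.Linear(input_shape[-1], n_prediction_heads * n_out)
        return decoder, n_out

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        return {'shortname': 'TinyLinearDecoder',
                'name': 'TinyLinearDecoder',
                'handles_tabular': False,
                'handles_image': False,
                'handles_time_series': True}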
+ """ + _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"] + + def __init__(self, + **kwargs: Any): + super().__init__() + self.add_fit_requirements(self._required_fit_requirements) + self.head: Optional[nn.Module] = None + self.auto_regressive = kwargs.get('auto_regressive', False) + + self.config = kwargs + self.decoder = None + self.n_decoder_output_features = None + self.n_prediction_heads = 1 + + @property + def _required_fit_requirements(self) -> List[FitRequirement]: + return [ + FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), + FitRequirement('network_encoder', (nn.Module,), user_defined=False, dataset_property=False), + FitRequirement('encoder_properties', (Dict,), user_defined=False, dataset_property=False), + FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), + ] + + @property + def decoder_properties(self): + decoder_property = {'additional_output': False, + 'additional_input': False, + 'fixed_input_seq_length': False, + 'recurrent': False, + } + return decoder_property + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + """ + Builds the head component and assigns it to self.head + + Args: + X (X: Dict[str, Any]): Dependencies needed by current component to perform fit + y (Any): not used. To comply with sklearn API + Returns: + Self + """ + self.check_requirements(X, y) + input_shape = X['dataset_properties']['input_shape'] + output_shape = X['dataset_properties']['output_shape'] + + auto_regressive = self.auto_regressive + + X.update({"auto_regressive": auto_regressive}) + + # for information about encoder_properties, please check + # autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_backbone + # TODO create a separate module so that users could know what is contained in encoder_properties + + # TODO consider Auto-regressive model on vanilla network head + + if auto_regressive: + self.n_prediction_heads = 1 + else: + self.n_prediction_heads = output_shape[0] + + encoder_properties = X['encoder_properties'] + fixed_input_seq_length = encoder_properties.get("fixed_input_seq_length", False) + has_hidden_states = encoder_properties.get("has_hidden_states", False) + + if fixed_input_seq_length: + input_shape = (X["window_size"], input_shape[-1]) + + self.decoder, self.n_decoder_output_features = self.build_decoder( + input_shape=get_output_shape(X['network_encoder'], input_shape=input_shape, + has_hidden_states=has_hidden_states), + n_prediction_heads=self.n_prediction_heads + ) + + X['n_decoder_output_features'] = self.n_decoder_output_features + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the network head into the fit dictionary 'X' and returns it. 
+ + Args: + X (Dict[str, Any]): 'X' dictionary + Returns: + (Dict[str, Any]): the updated 'X' dictionary + """ + X.update({'decoder_properties': self.decoder_properties, + 'network_decoder': self.decoder, + 'n_prediction_heads': self.n_prediction_heads, + 'n_decoder_output_features': self.n_decoder_output_features}) + + return X + + def build_decoder(self, + input_shape: Tuple[int, ...], + n_prediction_heads: int) -> Tuple[nn.Module, int]: + """ + Builds the head module and returns it + + Args: + input_shape (Tuple[int, ...]): shape of the input to the head (usually the shape of the backbone output) + n_prediction_heads (int): how many prediction heads the network has, used for final forecasting heads + Returns: + nn.Module: head module + """ + decoder, n_decoder_features = self._build_decoder(input_shape, n_prediction_heads) + return decoder, n_decoder_features + + @abstractmethod + def _build_decoder(self, input_shape: Tuple[int, ...], n_prediction_heads) -> Tuple[List[nn.Module], int]: + """ + Builds the head module and returns it + + Args: + input_shape (Tuple[int, ...]): shape of the input to the head (usually the shape of the backbone output) + n_prediction_heads (int): how many prediction heads will be generated after the encoder + + Returns: + decoder (nn.Module): decoder module + n_decoder_features (int): output of decoder features, used for initialize network head. + """ + raise NotImplementedError() + + @classmethod + def get_name(cls) -> str: + """ + Get the name of the head + + Args: + None + + Returns: + str: Name of the head + """ + return str(cls.get_properties()["shortname"]) + diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/InceptionTimeBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py similarity index 95% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/InceptionTimeBackbone.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py index ad400223f..9e128531f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/InceptionTimeBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py @@ -9,8 +9,8 @@ from torch import nn from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone\ - import BaseForecastingNetworkBackbone +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_encoder\ + import BaseForecastingEncoder from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter @@ -123,23 +123,21 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -class InceptionTimeBackbone(BaseForecastingNetworkBackbone): +class InceptionTimeEncoder(BaseForecastingEncoder): """ InceptionTime backbone for time series data (see https://arxiv.org/pdf/1909.04939.pdf). 
""" @property def encoder_properties(self): - # TODO consider property for the network encoder_properties = {'has_hidden_states': False, 'bijective_seq_output': True, 'fixed_input_seq_length': False} return encoder_properties - def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: - backbone = _InceptionTime(in_features=input_shape[-1], + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: + encoder = _InceptionTime(in_features=input_shape[-1], config=self.config) - self.backbone = backbone - return backbone + return encoder @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TimeSeriesMLPBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py similarity index 95% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TimeSeriesMLPBackbone.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py index d48910342..05f4d6b08 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TimeSeriesMLPBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py @@ -7,8 +7,8 @@ from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone \ - import BaseForecastingNetworkBackbone, EncoderNetwork +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_encoder \ + import BaseForecastingEncoder, EncoderNetwork from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_backbone.utils import _activations @@ -50,7 +50,7 @@ def forward(self, x: torch.Tensor, output_seq: bool = False): return self.module_layers(x) -class TimeSeriesMLPBackbone(BaseForecastingNetworkBackbone, MLPBackbone): +class MLPEncoder(BaseForecastingEncoder, MLPBackbone): _fixed_seq_length = True window_size = 1 @@ -73,7 +73,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.window_size = X["window_size"] return super().fit(X, y) - def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: in_features = input_shape[-1] * self.window_size return _TimeSeriesMLP(self.window_size, self._build_backbone(in_features)) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/RNNBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py similarity index 91% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/RNNBackbone.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py index aabbce9f8..adb366b6c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/RNNBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py @@ -12,8 +12,8 @@ from torch import nn from autoPyTorch.pipeline.components.base_component import BaseEstimator -from 
autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone \ - import BaseForecastingNetworkBackbone, EncoderNetwork +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_encoder \ + import BaseForecastingEncoder, EncoderNetwork from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter @@ -62,8 +62,7 @@ def forward(self, return out, hidden_state - -class RNNBackbone(BaseForecastingNetworkBackbone): +class RNNEncoder(BaseForecastingEncoder): """ Standard searchable LSTM backbone for time series data """ @@ -72,11 +71,10 @@ class RNNBackbone(BaseForecastingNetworkBackbone): def __init__(self, **kwargs: Dict): super().__init__(**kwargs) - def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: - backbone = _RNN(in_features=input_shape[-1], - config=self.config) - self.backbone = backbone - return backbone + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: + encoder = _RNN(in_features=input_shape[-1], + config=self.config) + return encoder @property def encoder_properties(self): @@ -88,9 +86,9 @@ def encoder_properties(self): def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: rnn_kwargs = {'hidden_size': self.config['hidden_size'], - 'num_layers': self.config['num_layers'], - 'bidirectional': self.config['bidirectional'], - 'cell_type': self.config['cell_type']} # used for initialize + 'num_layers': self.config['num_layers'], + 'bidirectional': self.config['bidirectional'], + 'cell_type': self.config['cell_type']} # used for initialize X.update({'rnn_kwargs': rnn_kwargs}) return super().transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TCNBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py similarity index 93% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TCNBackbone.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py index dc5ce5850..298ffdf95 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/TCNBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py @@ -13,8 +13,8 @@ from torch.nn.utils import weight_norm from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone\ - import BaseForecastingNetworkBackbone, EncoderNetwork +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_encoder \ + import BaseForecastingEncoder, EncoderNetwork from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter @@ -102,7 +102,7 @@ def forward(self, x: torch.Tensor, output_seq=False) -> torch.Tensor: return x[:, -1, :] -class TCNBackbone(BaseForecastingNetworkBackbone): +class TCNEncoder(BaseForecastingEncoder): """ Temporal Convolutional Network backbone for time series data (see https://arxiv.org/pdf/1803.01271.pdf). 
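# Illustrative sketch, not part of the patch: it shows the output_seq convention that the
# forecasting encoders in this patch (RNN/TCN/MLP/InceptionTime) follow through the
# EncoderNetwork interface. Given x of shape (B, L, N), an encoder returns either the full
# encoded sequence (B, L, H) or only the last time step (B, H). The class name and layer
# width below are hypothetical.
import torch
from torch import nn


class TinySeqEncoder(nn.Module):
    """Hypothetical encoder: a linear projection applied independently at every time step."""

    def __init__(self, in_features: int, hidden: int = 8):
        super().__init__()
        self.proj = nn.Linear(in_features, hidden)

    def forward(self, x: torch.Tensor, output_seq: bool = False) -> torch.Tensor:
        encoded = self.proj(x)          # (B, L, hidden)
        if output_seq:
            return encoded              # full sequence, e.g. for a recurrent decoder
        return encoded[:, -1, :]        # last step only, e.g. for an MLP decoder


if __name__ == '__main__':
    x = torch.randn(4, 20, 3)                     # (B=4, L=20, N=3)
    encoder = TinySeqEncoder(in_features=3)
    print(encoder(x).shape)                       # torch.Size([4, 8])
    print(encoder(x, output_seq=True).shape)      # torch.Size([4, 20, 8])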
""" @@ -115,17 +115,16 @@ def encoder_properties(self): } return encoder_properties - def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: num_channels = [self.config["num_filters_0"]] for i in range(1, self.config["num_blocks"]): num_channels.append(self.config[f"num_filters_{i}"]) - backbone = _TemporalConvNet(input_shape[-1], - num_channels, - kernel_size=self.config["kernel_size"], - dropout=self.config["dropout"] if self.config["use_dropout"] else 0.0 - ) - self.backbone = backbone - return backbone + encoder = _TemporalConvNet(input_shape[-1], + num_channels, + kernel_size=self.config["kernel_size"], + dropout=self.config["dropout"] if self.config["use_dropout"] else 0.0 + ) + return encoder @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/__init__.py similarity index 77% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/__init__.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/__init__.py index 8406ebd5f..ed2d0e8d0 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/__init__.py @@ -15,23 +15,22 @@ find_components, ) from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone\ - import ( - BaseForecastingNetworkBackbone, +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_encoder import ( + BaseForecastingEncoder, ) directory = os.path.split(__file__)[0] _backbones = find_components(__package__, directory, - BaseForecastingNetworkBackbone) -_addons = ThirdPartyComponents(BaseForecastingNetworkBackbone) + BaseForecastingEncoder) +_addons = ThirdPartyComponents(BaseForecastingEncoder) -def add_backbone(backbone: BaseForecastingNetworkBackbone) -> None: - _addons.add_component(backbone) +def add_encoder(encoder: BaseForecastingEncoder) -> None: + _addons.add_component(encoder) -class ForecastingNetworkBackboneChoice(NetworkBackboneChoice): +class ForecastingEncoderChoice(NetworkBackboneChoice): def get_components(self) -> Dict[str, autoPyTorchComponent]: """Returns the available backbone components diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py new file mode 100644 index 000000000..39be09858 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -0,0 +1,127 @@ +import numpy as np + +import pandas as pd + +from scipy.sparse import csr_matrix + +import torch +import torchvision +from autoPyTorch.utils.common import FitRequirement +from torch import nn +from abc import abstractmethod +from typing import Any, Dict, Iterable, Optional, Tuple, List + + +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape +from 
autoPyTorch.pipeline.components.base_component import ( + autoPyTorchComponent, +) + + +class EncoderNetwork(nn.Module): + def forward(self, x: torch.Tensor, output_seq: bool = False): + """ + Base forecasting network, its output needs to be a 2-d or 3-d Tensor: + When the decoder is an auto-regressive model, then it needs to output a 3-d Tensor, in which case, output_seq + needs to be set as True + When the decoder is a seq2seq model, the network needs to output a 2-d Tensor (B, N), in which case, + output_seq needs to be set as False + + Args: + x: torch.Tensor(B, L_in, N) + output_seq (bool), if the network outputs a sequence tensor. If it is set True, + output will be a 3-d Tensor (B, L_out, N). L_out = L_in if encoder_properties['recurrent'] is True. + If this value is set as False, the network only returns the last item of the sequence. + Returns: + net_output: torch.Tensor with shape either (B, N) or (B, L_out, N) + + """ + raise NotImplementedError + + +class BaseForecastingEncoder(autoPyTorchComponent): + """ + Base class for network backbones. Holds the backbone module and the config which was used to create it. + """ + _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"] + + def __init__(self, + **kwargs: Any): + super().__init__() + self.add_fit_requirements( + self._required_fit_arguments + ) + self.encoder: nn.Module = None + self.config = kwargs + self.input_shape: Optional[Iterable] = None + + @property + def _required_fit_arguments(self) -> List[FitRequirement]: + return [ + FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), + FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, + dataset_property=False), + FitRequirement('time_series_transformer', (BaseEstimator,), user_defined=False, dataset_property=False), + FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), + ] + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + self.check_requirements(X, y) + X_train = X['X_train'] + + input_shape = X["dataset_properties"]['input_shape'] + + if not X["dataset_properties"]["is_small_preprocess"]: + # get input shape by transforming first two elements of the training set + transforms = torchvision.transforms.Compose(X['preprocess_transforms']) + X_train = X_train[:1, np.newaxis, ...] 
+ input_shape = transforms(X_train).shape[1:] + + if 'network_embedding' in X.keys(): + input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) + self.input_shape = input_shape + + self.encoder = self.build_encoder( + input_shape=input_shape, + ) + + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X['dataset_properties'].update({'input_shape': self.input_shape}) + X.update({'network_encoder': self.encoder}) + X.update({'encoder_properties': self.encoder_properties}) + return X + + @abstractmethod + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: + """ + Builds the backbone module and returns it + + Args: + input_shape (Tuple[int, ...]): shape of the input to the backbone + + Returns: + nn.Module: backbone module + """ + raise NotImplementedError() + + @property + def encoder_properties(self): + """ + Encoder properties, this determines how the data flows over the forecasting networks + + has_hidden_states, it determines if the network contains hidden states and thus return or accept the hidden + states + bijective_seq_output, determines if the network returns a sequence with the same sequence length as the input + sequence when output_seq is set True + fix_input_shape if the input shape is fixed, this is useful for building network head + """ + # TODO make use of bijective_seq_output in trainer!!! + encoder_properties = {'has_hidden_states': False, + 'bijective_seq_output': False, + 'fixed_input_seq_length': False + } + return encoder_properties + diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/base_forecasting_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/base_forecasting_backbone.py deleted file mode 100644 index 5bd411020..000000000 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_network_backbone/base_forecasting_backbone.py +++ /dev/null @@ -1,60 +0,0 @@ -from abc import abstractmethod -from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone import NetworkBackboneComponent - -import torch -from torch import nn -from abc import abstractmethod -from typing import Any, Dict, Iterable, Optional, Tuple, List - - -from autoPyTorch.pipeline.components.base_component import BaseEstimator - - -class EncoderNetwork(nn.Module): - def forward(self, x: torch.Tensor, output_seq: bool = False): - """ - Base forecasting network, its output needs to be a 2-d or 3-d Tensor: - When the decoder is an auto-regressive model, then it needs to output a 3-d Tensor, in which case, output_seq - needs to be set as True - When the decoder is a seq2seq model, the network needs to output a 2-d Tensor (B, N), in which case, - output_seq needs to be set as False - - Args: - x: torch.Tensor(B, L_in, N) - output_seq (bool), if the network outputs a sequence tensor. If it is set True, - output will be a 3-d Tensor (B, L_out, N). L_out = L_in if encoder_properties['recurrent'] is True. - If this value is set as False, the network only returns the last item of the sequence. 
- Returns: - net_output: torch.Tensor with shape either (B, N) or (B, L_out, N) - - """ - raise NotImplementedError - - -class BaseForecastingNetworkBackbone(NetworkBackboneComponent): - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - return super().fit(X, y) - - def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: - X = super().transform(X) - X.update({'encoder_properties': self.encoder_properties}) - return X - - @property - def encoder_properties(self): - """ - Encoder properties, this determines how the data flows over the forecasting networks - - has_hidden_states, it determines if the network contains hidden states and thus return or accept the hidden - states - bijective_seq_output, determines if the network returns a sequence with the same sequence length as the input - sequence when output_seq is set True - fix_input_shape if the input shape is fixed, this is useful for building network head - """ - # TODO make use of bijective_seq_output in trainer!!! - encoder_properties = {'has_hidden_states': False, - 'bijective_seq_output': False, - 'fixed_input_seq_length': False - } - return encoder_properties - diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 2c2df11ef..98dcdf801 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -1,16 +1,16 @@ -from abc import abstractmethod, ABC from typing import Any, Dict, Iterable, Tuple, List, Optional import numpy as np import torch from torch import nn +from ConfigSpace import ConfigurationSpace +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_head.base_network_head import NetworkHeadComponent -from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.utils.common import FitRequirement from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import \ - ALL_DISTRIBUTIONS, ProjectionLayer + ALL_DISTRIBUTIONS class ForecastingHead(NetworkHeadComponent): @@ -18,18 +18,15 @@ class ForecastingHead(NetworkHeadComponent): Base class for network heads used for forecasting. Holds the head module and the config which was used to create it. 
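# Illustrative sketch, not part of the patch: when required_net_out_put_type is 'distribution',
# the head's projection layer (ALL_DISTRIBUTIONS / ProjectionLayer in distribution.py) maps the
# decoder features to distribution parameters and returns a torch.distributions object. The
# sketch below mimics that idea for a Gaussian; the class name and the softplus domain map are
# assumptions, not the exact autoPyTorch implementation. A trainer could then minimise
# -dist.log_prob(targets).mean(), and a point forecast is obtained from dist.mean or by sampling,
# matching the forecast_strategy options used elsewhere in this patch.
import torch
import torch.nn.functional as F
from torch import nn


class GaussianProjection(nn.Module):
    """Hypothetical projection: decoder features -> Normal(mean, scale)."""

    def __init__(self, num_in_features: int, output_dim: int):
        super().__init__()
        self.mean_proj = nn.Linear(num_in_features, output_dim)
        self.scale_proj = nn.Linear(num_in_features, output_dim)

    def forward(self, x: torch.Tensor) -> torch.distributions.Normal:
        mean = self.mean_proj(x)
        scale = F.softplus(self.scale_proj(x)) + 1e-6  # keep the scale strictly positive
        return torch.distributions.Normal(mean, scale)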
""" - _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series", - "n_prediction_steps"] + _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"] def __init__(self, - **kwargs: Any): - super().__init__() + random_state: Optional[np.random.RandomState] = None): + super(NetworkHeadComponent, self).__init__(random_state=random_state) + self.add_fit_requirements(self._required_fit_requirements) self.head: Optional[nn.Module] = None self.required_net_out_put_type: Optional[str] = None - self.auto_regressive = kwargs.get('auto_regressive', False) - - self.config = kwargs @property @@ -37,11 +34,10 @@ def _required_fit_requirements(self) -> List[FitRequirement]: return [ FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), - FitRequirement('encoder_properties', (str,), user_defined=False, dataset_property=False), - FitRequirement('n_prediction_steps', (int,), user_defined=True, dataset_property=True), + FitRequirement('auto_regressive', (bool,), user_defined=False, dataset_property=False), + FitRequirement('n_decoder_output_features', (int, ), user_defined=False, dataset_property=False), + FitRequirement('n_prediction_heads', (int,), user_defined=False, dataset_property=False), FitRequirement('output_shape', (Iterable, int), user_defined=True, dataset_property=True), - FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), - FitRequirement('loss_type', (str,), user_defined=False, dataset_property=False) ] @property @@ -63,7 +59,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: Returns: Self """ - input_shape = X['dataset_properties']['input_shape'] + self.check_requirements(X, y) output_shape = X['dataset_properties']['output_shape'] self.required_net_out_put_type = X['required_net_out_put_type'] @@ -74,64 +70,26 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: dist_cls = X.get('dist_cls', None) + auto_regressive = X.get('auto_regressive', False) - - auto_regressive = self.auto_regressive - - auto_regressive = False # TODO implement auto_regressive mdoels!! 
- - - - X.update({"auto_regressive": auto_regressive}) - encoder_properties = X['encoder_properties'] - - # for information about encoder_properties, please check - # autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone.base_forecasting_backbone - # TODO create a separate module so that users could know what is contained in encoder_properties - - # TODO consider Auto-regressive model on vanilla network head - if auto_regressive: - n_prediction_heads = 1 - else: - n_prediction_heads = output_shape[0] - # output shape now doe not contain information about n_prediction_steps - - fixed_input_seq_length = encoder_properties.get("fixed_input_seq_length", False) - has_hidden_states = encoder_properties.get("has_hidden_states", False) - - if fixed_input_seq_length: - input_shape = (X["window_size"], input_shape[-1]) - - arch_kwargs = {'n_prediction_heads': n_prediction_heads} + head_input_shape = X["n_decoder_output_features"] + n_prediction_heads = X["n_prediction_heads"] self.head = self.build_head( - input_shape=get_output_shape(X['network_backbone'], input_shape=input_shape, - has_hidden_states=has_hidden_states), + input_shape=head_input_shape, output_shape=output_shape, auto_regressive=auto_regressive, dist_cls=dist_cls, - **arch_kwargs, + n_prediction_heads=n_prediction_heads, ) return self - def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: - """ - Adds the network head into the fit dictionary 'X' and returns it. - - Args: - X (Dict[str, Any]): 'X' dictionary - Returns: - (Dict[str, Any]): the updated 'X' dictionary - """ - X.update({'decoder_properties': self.decoder_properties}) - return super().transform(X) - def build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], auto_regressive: bool = False, dist_cls: Optional[str] = None, - **arch_kwargs: Dict) -> nn.Module: + n_prediction_heads: int = 1) -> nn.Module: """ Builds the head module and returns it @@ -140,42 +98,23 @@ def build_head(self, output_shape (Tuple[int, ...]): shape of the output of the head auto_regressive (bool): if the network is auto-regressive dist_cls (Optional[str]): output distribution, only works if required_net_out_put_type is 'distribution' - arch_kwargs (Dict): additional paramter for initializing architectures. + n_prediction_heads (Dict): additional paramter for initializing architectures. 
How many heads to predict Returns: nn.Module: head module """ - base_header_layer, num_head_base_output_features = self._build_head(input_shape, **arch_kwargs) - proj_layer = [] - - output_layer = self.build_proj_layer( - num_head_base_output_features=num_head_base_output_features, + head_layer = self.build_proj_layer( + input_shape=input_shape, output_shape=output_shape, auto_regressive=auto_regressive, net_out_put_type=self.required_net_out_put_type, dist_cls=dist_cls, - n_prediction_heads=arch_kwargs['n_prediction_heads'] + n_prediction_heads=n_prediction_heads ) - proj_layer.append(output_layer) - return nn.Sequential(*base_header_layer, *proj_layer) - - @abstractmethod - def _build_head(self, input_shape: Tuple[int, ...], **arch_kwargs) -> Tuple[List[nn.Module], int]: - """ - Builds the head module and returns it - - Args: - input_shape (Tuple[int, ...]): shape of the input to the head (usually the shape of the backbone output) - output_shape (Tuple[int, ...]): shape of the output of the head - n_prediction_steps (int): how many steps need to be predicted in advance - - Returns: - nn.Module: head module - """ - raise NotImplementedError() + return head_layer @staticmethod - def build_proj_layer(num_head_base_output_features: int, + def build_proj_layer(input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], n_prediction_heads: int, auto_regressive: bool, @@ -184,7 +123,7 @@ def build_proj_layer(num_head_base_output_features: int, """ a final layer that project the head output to the final distribution Args: - num_head_base_output_features (int): output feature of head base, + input_shape (int): input shape to build the header, is used to initialize size of the linear layer output_shape (Tuple[int, ..]): deserved output shape n_prediction_heads: int, how many steps the head want to predict @@ -202,17 +141,34 @@ def build_proj_layer(num_head_base_output_features: int, if net_out_put_type == 'distribution': if dist_cls not in ALL_DISTRIBUTIONS.keys(): raise ValueError(f'Unsupported distribution class type: {dist_cls}') - proj_layer = ALL_DISTRIBUTIONS[dist_cls](num_in_features=num_head_base_output_features, + proj_layer = ALL_DISTRIBUTIONS[dist_cls](num_in_features=input_shape, output_shape=output_shape[1:], n_prediction_heads=n_prediction_heads, auto_regressive=auto_regressive) return proj_layer elif net_out_put_type == 'regression': - proj_layer = nn.Sequential(nn.Unflatten(-1, (n_prediction_heads, num_head_base_output_features)), - nn.Linear(num_head_base_output_features, np.product(output_shape[1:])), + proj_layer = nn.Sequential(nn.Unflatten(-1, (n_prediction_heads, input_shape)), + nn.Linear(input_shape, np.product(output_shape[1:])), # nn.Unflatten(-1, tuple(output_shape)), ) return proj_layer else: raise ValueError(f"Unsupported network type " f"{net_out_put_type} (should be regression or distribution)") + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> ConfigurationSpace: + """Return the configuration space of this classification algorithm. + + Args: + dataset_properties (Optional[Dict[str, Union[str, int]]): + Describes the dataset to work on + + Returns: + ConfigurationSpace: + The configuration space of this algorithm. 
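# Illustrative shape walk-through, not part of the patch, for the regression branch of
# build_proj_layer above. Assuming the decoder emits n_prediction_heads * n_features values per
# sample (as the MLPDecoder does), the head unflattens them into one block per prediction step
# and projects each block to the target dimension. All sizes below are arbitrary example values.
import torch
from torch import nn

batch_size, n_prediction_heads, n_decoder_output_features, n_targets = 16, 5, 24, 2

proj_layer = nn.Sequential(
    nn.Unflatten(-1, (n_prediction_heads, n_decoder_output_features)),
    nn.Linear(n_decoder_output_features, n_targets),
)

decoder_output = torch.randn(batch_size, n_prediction_heads * n_decoder_output_features)
print(proj_layer(decoder_output).shape)   # torch.Size([16, 5, 2]): one forecast per prediction step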
+ """ + cs = ConfigurationSpace() + return cs diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 3326e4840..932261a61 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -28,9 +28,10 @@ from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent from autoPyTorch.pipeline.components.setup.network_embedding import NetworkEmbeddingChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_network_backbone import \ - ForecastingNetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head import ForecastingNetworkHeadChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder import \ + ForecastingEncoderChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_decoder import ForecastingDecoderChoice +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead from autoPyTorch.pipeline.components.setup.network_initializer import ( NetworkInitializerChoice ) @@ -205,7 +206,7 @@ def _get_hyperparameter_search_space(self, hp_auto_regressive = [] for hp_name in cs.get_hyperparameter_names(): - if hp_name.startswith('network_head:'): + if hp_name.startswith('network_decoder:'): if hp_name.endswith(':auto_regressive'): hp_auto_regressive.append(cs.get_hyperparameter(hp_name)) @@ -288,10 +289,11 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L ("data_loader", TimeSeriesForecastingDataLoader(random_state=self.random_state)), ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties, random_state=self.random_state)), - ("network_backbone", ForecastingNetworkBackboneChoice(default_dataset_properties, - random_state=self.random_state)), - ("network_head", ForecastingNetworkHeadChoice(default_dataset_properties, - random_state=self.random_state)), + ("network_encoder", ForecastingEncoderChoice(default_dataset_properties, + random_state=self.random_state)), + ("network_decoder", ForecastingDecoderChoice(default_dataset_properties, + random_state=self.random_state)), + ("network_head", ForecastingHead(random_state=self.random_state)), ("network", ForecastingNetworkComponent(random_state=self.random_state)), ("network_init", NetworkInitializerChoice(default_dataset_properties, random_state=self.random_state)), From d147254e7dc52b0c5f56094e820220b3b7a457c5 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 15 Dec 2021 00:17:57 +0100 Subject: [PATCH 085/347] rnn decoder --- .../base_target_scaler.py | 13 +- .../forecasting_target_scaling/utils.py | 27 ++- .../setup/network/forecasting_network.py | 175 ++++++++++++++---- .../forecasting_decoder/MLPDecoder.py | 5 +- .../forecasting_decoder/RNNDecoder.py | 24 ++- .../base_forecasting_decoder.py | 14 +- .../forecasting_network_head/distribution.py | 5 +- .../forecasting_base_trainer.py | 44 +++-- 8 files changed, 216 insertions(+), 91 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py index b7f2d8349..a50e6f5a0 100644 --- 
a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py @@ -56,14 +56,17 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'target_scaler': self}) return X - def __call__(self, X: Union[np.ndarray, torch.tensor]) -> Union[np.ndarray, torch.tensor]: + def __call__(self, + past_target: Union[np.ndarray, torch.tensor], + future_targets: Optional[Union[np.ndarray, torch.Tensor]]=None, + ) -> Union[np.ndarray, torch.tensor]: if self.scaler is None: raise ValueError("cant call {} without fitting the column transformer first." .format(self.__class__.__name__)) - if len(X.shape) == 2: + if len(past_target.shape) == 2: # expand batch dimension when called on a single record - X = X[np.newaxis, ...] - X[:, :, self.target_columns], loc, scale = self.scaler.transform(X[:, :, self.target_columns]) - return X, loc, scale + past_target = past_target[np.newaxis, ...] + past_target, future_targets, loc, scale = self.scaler.transform(past_target, future_targets) + return past_target, future_targets, loc, scale diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py index 8f4ede183..8230e5f0b 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py @@ -17,31 +17,38 @@ def __init__(self, mode: str): def fit(self, X: Dict, y: Any = None) -> "TimeSeriesScalerBatch": return self - def transform(self, X: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + def transform(self, past_targets: torch.Tensor, future_targets: Optional[torch.Tensor]=None) -> \ + Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: if self.mode == "standard": - loc = torch.mean(X, dim=-2, keepdim=True) - scale = torch.std(X, dim=-2, keepdim=True) + loc = torch.mean(past_targets, dim=-2, keepdim=True) + scale = torch.std(past_targets, dim=-2, keepdim=True) scale[scale == 0.0] = 1.0 - return (X - loc) / scale, loc, scale + if future_targets is not None: + future_targets = (future_targets - loc) / scale + return (past_targets - loc) / scale, future_targets,loc, scale elif self.mode == "min_max": - min_ = torch.min(X, dim=-2, keepdim=True)[0] - max_ = torch.max(X, dim=-2, keepdim=True)[0] + min_ = torch.min(past_targets, dim=-2, keepdim=True)[0] + max_ = torch.max(past_targets, dim=-2, keepdim=True)[0] diff_ = max_ - min_ loc = min_ - 1e-10 scale = diff_ scale[scale == 0.0] = 1.0 - return (X - loc) / scale, loc, scale + if future_targets is not None: + future_targets = (future_targets - loc) / scale + return (past_targets - loc) / scale, future_targets, loc, scale elif self.mode == "max_abs": - max_abs_ = torch.max(torch.abs(X), dim=-2, keepdim=True)[0] + max_abs_ = torch.max(torch.abs(past_targets), dim=-2, keepdim=True)[0] max_abs_[max_abs_ == 0.0] = 1.0 scale = max_abs_ - return X / scale, None, scale + if future_targets is not None: + future_targets = future_targets / scale + return past_targets / scale, future_targets, None, scale elif self.mode == "none": - return X, None, None + return past_targets, 
future_targets, None, None else: raise ValueError(f"Unknown mode {self.mode} for Forecasting scaler") \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index d1df21e3e..cd69b324c 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -22,51 +22,68 @@ class ForecastingNet(nn.Module): + future_target_required = False + def __init__(self, - network_embedding: nn.Module, + network_embedding: nn.Module, # TODO consider embedding for past, future and static features network_encoder: EncoderNetwork, network_decoder: nn.Module, network_head: nn.Module, - encoder_properties: Dict = {}, - decoder_properties: Dict = {}, + n_prediction_steps: int, output_type: str = 'regression', forecast_strategy: Optional[str] = 'mean', num_samples: Optional[int] = 100, aggregation: Optional[str] = 'mean' ): + """ + This is a basic forecasting network. It is only composed of a embedding net, an encoder and a head (including + MLP decoder and the final head). + + This structure is active when the decoder is a MLP with auto_regressive set as false + + Args: + network_embedding (nn.Module): network embedding + network_encoder (EncoderNetwork): Encoder network, could be selected to return a sequence or a + network_decoder (nn.Module): network decoder + network_head (nn.Module): network head, maps the output of decoder to the final output + n_prediction_steps (int): how many steps the network want to predict + encoder_properties (Dict): encoder properties + decoder_properties: (Dict): decoder properties + output_type (str): the form that the network outputs. It could be regression, distribution and + (TODO) quantile + forecast_strategy (str): only valid if output_type is distribution or quantile, how the network transforms + its output to predicted values, could be mean or sample + num_samples (int): only valid if output_type is not regression and forecast_strategy is sample. this + indicates the number of the points to sample when doing prediction + aggregation (str): how the samples are aggregated. We could take their mean or median values. 
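# Worked illustration, not part of the patch, of the forecast_strategy / num_samples / aggregation
# options described above for distribution outputs: either take the distribution mean directly,
# or draw num_samples sample paths and aggregate them. The Normal distribution and tensor sizes
# are arbitrary example values.
import torch

net_output = torch.distributions.Normal(torch.zeros(16, 5, 2), torch.ones(16, 5, 2))

point_forecast_mean = net_output.mean                      # forecast_strategy == 'mean'

samples = net_output.sample((100,))                        # forecast_strategy == 'sample'
point_forecast_sampled = torch.mean(samples, dim=0)        # aggregation == 'mean'
point_forecast_median = torch.median(samples, dim=0)[0]    # aggregation == 'median'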
+ """ super(ForecastingNet, self).__init__() self.embedding = network_embedding - self.encoder = network_encoder + self.encoder = network_encoder # type:EncoderNetwork self.decoder = network_decoder self.head = network_head - self.encoder_has_hidden_states = encoder_properties['has_hidden_states'] - self.decoder_has_hidden_states = decoder_properties['has_hidden_states'] - - if self.decoder_has_hidden_states: - if not self.encoder_has_hidden_states: - raise ValueError('when decoder contains hidden states, encoder must provide the hidden states ' - 'for decoder!') - - self.recurrent_decoder = decoder_properties['recurrent'] - + self.n_prediction_steps = n_prediction_steps self.output_type = output_type self.forecast_strategy = forecast_strategy self.num_samples = num_samples self.aggregation = aggregation - def forward(self, X: torch.Tensor, hx: Optional[Tuple[torch.Tensor]] = None): - X = self.embedding(X) - if self.encoder_has_hidden_states: - X, hidden_state_encoder = self.encoder(X) + def forward(self, + targets_past: torch.Tensor, + targets_future: Optional[torch.Tensor] = None, + features_past: Optional[torch.Tensor] = None, + features_future: Optional[torch.Tensor] = None, + features_static: Optional[torch.Tensor] = None, + hidden_states: Optional[Tuple[torch.Tensor]] = None): + if features_past is not None: + x_past = torch.cat([targets_past, features_past], dim=1) else: - X = self.encoder(X) - if self.decoder_has_hidden_states: - X, hidden_state_decoder = self.decoder(X, hidden_state_encoder) - else: - X = self.decoder(X) - X = self.head(X) - return X + x_past = targets_past + x_past = self.embedding(x_past) + x_past = self.decoder(x_past) + output = self.head(x_past) + return output def pred_from_net_output(self, net_output): if self.output_type == 'regression': @@ -75,7 +92,7 @@ def pred_from_net_output(self, net_output): if self.forecast_strategy == 'mean': return net_output.mean elif self.forecast_strategy == 'sample': - samples = net_output.sample((self.num_samples, )) + samples = net_output.sample((self.num_samples,)) if self.aggregation == 'mean': return torch.mean(samples, dim=0) elif self.aggregation == 'median': @@ -87,8 +104,76 @@ def pred_from_net_output(self, net_output): else: raise ValueError(f'Unknown output_type: {self.output_type}') - def predict(self, X: torch.Tensor): - net_output = self(X) + def predict(self, + targets_past: torch.Tensor, + features_past: Optional[torch.Tensor] = None, + features_future: Optional[torch.Tensor] = None, + features_static: Optional[torch.Tensor] = None + ): + net_output = self(targets_past, features_past) + return self.pred_from_net_output(net_output) + + +class ForecastingSeq2SeqNet(ForecastingNet): + future_target_required = True + """ + Forecasting network with Seq2Seq structure. + + This structure is activate when the decoder is recurrent (RNN). We train the network with teacher forcing, thus + targets_future is required for the network. 
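# Small illustration, not part of the patch, of the teacher-forcing shift used in
# ForecastingSeq2SeqNet.forward below: at training time the decoder input at step t is the
# ground-truth target of step t-1, starting from the last observed value. The numbers are
# arbitrary example values.
import torch

targets_past = torch.tensor([[[0.], [1.], [2.]]])     # (B=1, L_past=3, 1), last observation is 2.
targets_future = torch.tensor([[[3.], [4.], [5.]]])   # (B=1, n_prediction_steps=3, 1)

decoder_input = torch.cat([targets_past[:, [-1], :], targets_future[:, :-1, :]], dim=1)
print(decoder_input.squeeze(-1))   # tensor([[2., 3., 4.]]) -> used to predict [3., 4., 5.]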
To train the network, past targets and past features are fed to the + encoder to obtain the hidden states whereas future targets and future features + """ + + def forward(self, + targets_past: torch.Tensor, + targets_future: Optional[torch.Tensor] = None, + features_past: Optional[torch.Tensor] = None, + features_future: Optional[torch.Tensor] = None, + features_static: Optional[torch.Tensor] = None, + hidden_states: Optional[Tuple[torch.Tensor]] = None): + x_past = targets_past if features_past is None else torch.cat([targets_past, features_past], dim=-1) + + x_past = self.embedding(x_past) + + if self.training: + # we do one step ahead forecasting + targets_future = torch.cat([targets_past[:, [-1], :], targets_future[:, :-1, :]], dim=1) + + x_future = targets_future if features_future is None else torch.cat([targets_future, features_future], + dim=-1) + + _, hidden_states = self.encoder(x_past) + x_future, _ = self.decoder(x_future, hidden_states) + net_output = self.head(x_future) + + return net_output + else: + all_predictions = [] + predicted_target = targets_past[:, [-1]] + + dist_keys = None + + _, hidden_states = self.encoder(x_past) + for idx_pred in range(self.n_prediction_steps): + x_future = predicted_target if features_future is None else torch.cat( + [predicted_target, features_future[:, [idx_pred], :]], + dim=-1) + + x_future, hidden_states = self.decoder(x_future, hidden_states) + net_output = self.head(x_future[:, -1:, ]) + predicted_target = self.pred_from_net_output(net_output) + + all_predictions.append(net_output) + + return all_predictions + + def predict(self, + targets_past: torch.Tensor, + features_past: Optional[torch.Tensor] = None, + features_future: Optional[torch.Tensor] = None, + features_static: Optional[torch.Tensor] = None + ): + net_output = self(targets_past, features_past, features_future) return self.pred_from_net_output(net_output) @@ -131,17 +216,29 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: f"loss function. 
However, net_out_type is {self.net_out_type} and " f"required_net_out_put_type is {X['required_net_out_put_type']}") - self.network = ForecastingNet(network_embedding=X['network_embedding'], - network_encoder=X['network_encoder'], - network_decoder=X['network_decoder'], - network_head=X['network_head'], - encoder_properties=X['encoder_properties'], - decoder_properties=X['decoder_properties'], - output_type=self.net_out_type, - forecast_strategy=self.forecast_strategy, - num_samples=self.num_samples, - aggregation=self.aggregation, - ) + if X['decoder_properties']['has_hidden_states']: + if not X['encoder_properties']['has_hidden_states']: + raise ValueError('when decoder contains hidden states, encoder must provide the hidden states ' + 'for decoder!') + + network_init_kwargs = dict(network_embedding=X['network_embedding'], + network_encoder=X['network_encoder'], + network_decoder=X['network_decoder'], + network_head=X['network_head'], + n_prediction_steps=X['dataset_properties']['n_prediction_steps'], + output_type=self.net_out_type, + forecast_strategy=self.forecast_strategy, + num_samples=self.num_samples, + aggregation=self.aggregation, ) + + if X['decoder_properties']['has_hidden_states']: + # decoder is RNN + self.network = ForecastingSeq2SeqNet(**network_init_kwargs) + elif X['auto_regressive']: + # decoder is MLP and auto_regressive, we have deep AR model + raise NotImplementedError + else: + self.network = ForecastingNet(**network_init_kwargs) # Properly set the network training device if self.device is None: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py index 9bc132e27..9bb669910 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py @@ -23,7 +23,10 @@ def decoder_properties(self): } return decoder_properties - def _build_decoder(self, input_shape: Tuple[int, ...], n_prediction_heads: int) -> Tuple[nn.Module, int]: + def _build_decoder(self, + input_shape: Tuple[int, ...], + n_prediction_heads: int, + dataset_properties: Dict) -> Tuple[nn.Module, int]: layers = [] in_features = input_shape[-1] if self.config["num_layers"] > 0: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py index 30fd6d71a..6344ac63e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py @@ -21,7 +21,7 @@ from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter, FitRequirement -class _RNN_Decoder(nn.Module): +class RNN_Module(nn.Module): def __init__(self, in_features: int, hidden_size: int, @@ -46,7 +46,7 @@ def forward(self, x: torch.Tensor, if x.ndim == 2: x = x.unsqueeze(1) outputs, hidden_state, = self.lstm(x, hx) - return outputs[:, -1, :], hidden_state + return outputs, hidden_state class ForecastingRNNHeader(BaseForecastingDecoder): @@ -66,17 +66,21 @@ def _required_fit_requirements(self) -> List[FitRequirement]: fit_requirement.append(FitRequirement('rnn_kwargs', (Dict,), user_defined=False, dataset_property=False)) return fit_requirement - def _build_decoder(self, input_shape: Tuple[int, 
...], n_prediction_heads: int) -> Tuple[List[nn.Module], int]: + def _build_decoder(self, + input_shape: Tuple[int, ...], + n_prediction_heads: int, + dataset_properties: Dict) -> Tuple[List[nn.Module], int]: # RNN decoder only allows RNN encoder, these parameters need to exists. hidden_size = self.rnn_kwargs['hidden_size'] - num_layers = 2 * self.rnn_kwargs['num_layers'] if self.rnn_kwargs['bidirectional'] else self.rnn_kwargs['num_layers'] + num_layers = 2 * self.rnn_kwargs['num_layers'] if self.rnn_kwargs['bidirectional'] else self.rnn_kwargs[ + 'num_layers'] cell_type = self.rnn_kwargs['cell_type'] - decoder = _RNN_Decoder(in_features=input_shape[-1], - hidden_size=hidden_size, - num_layers=num_layers, - cell_type=cell_type, - config=self.config, - ) + decoder = RNN_Module(in_features=dataset_properties['output_shape'][-1], + hidden_size=hidden_size, + num_layers=num_layers, + cell_type=cell_type, + config=self.config, + ) return decoder, hidden_size @property diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py index 783737b55..bdb81601e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -88,7 +88,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.decoder, self.n_decoder_output_features = self.build_decoder( input_shape=get_output_shape(X['network_encoder'], input_shape=input_shape, has_hidden_states=has_hidden_states), - n_prediction_heads=self.n_prediction_heads + n_prediction_heads=self.n_prediction_heads, + dataset_properties=X['dataset_properties'] ) X['n_decoder_output_features'] = self.n_decoder_output_features @@ -111,22 +112,25 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: return X def build_decoder(self, - input_shape: Tuple[int, ...], - n_prediction_heads: int) -> Tuple[nn.Module, int]: + input_shape: Tuple[int, ...], + n_prediction_heads: int, + dataset_properties: Dict) -> Tuple[nn.Module, int]: """ Builds the head module and returns it Args: input_shape (Tuple[int, ...]): shape of the input to the head (usually the shape of the backbone output) n_prediction_heads (int): how many prediction heads the network has, used for final forecasting heads + dataset_properties (Dict): dataset properties Returns: nn.Module: head module """ - decoder, n_decoder_features = self._build_decoder(input_shape, n_prediction_heads) + decoder, n_decoder_features = self._build_decoder(input_shape, n_prediction_heads, dataset_properties) return decoder, n_decoder_features @abstractmethod - def _build_decoder(self, input_shape: Tuple[int, ...], n_prediction_heads) -> Tuple[List[nn.Module], int]: + def _build_decoder(self, input_shape: Tuple[int, ...], n_prediction_heads: int, + dataset_properties:Dict) -> Tuple[nn.Module, int]: """ Builds the head module and returns it diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py index b2fef6abf..a964d83c0 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py @@ -92,7 +92,10 @@ def 
forward(self, x: torch.Tensor) -> torch.distributions: with shape (batch_size, n_prediction_steps, output_shape) """ params_unbounded = [proj(x) for proj in self.proj] - return self.dist_cls(*self.domain_map(*params_unbounded)) + + # TODO consider how to handle network parameter issues (or register a hook) + parameters_bounded = self.domain_map(*params_unbounded) + return self.dist_cls(*parameters_bounded) @property @abstractmethod diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 36fd9f1c5..c85d37a18 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -133,7 +133,7 @@ def rescale_output(self, outputs = outputs * scale.to(self.device) + loc.to(self.device) return outputs - def train_step(self, data: Dict[str, torch.Tensor], targets: Dict[str, Union[torch.Tensor, np.ndarray]]) \ + def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor) \ -> Tuple[float, torch.Tensor]: """ Allows to train 1 step of gradient descent, given a batch of train/labels @@ -146,22 +146,26 @@ def train_step(self, data: Dict[str, torch.Tensor], targets: Dict[str, Union[tor torch.Tensor: The predictions of the network float: the loss incurred in the prediction """ - X = data['past_target'] + past_target = data['past_target'] # prepare - X = X.float() - - X, loc, scale = self.target_scaler(X) + past_target = past_target.float() + if self.model.future_target_required: + past_target, scaled_future_targets, loc, scale = self.target_scaler(past_target, future_targets) + else: + past_target, _, loc, scale = self.target_scaler(past_target) + scaled_future_targets = future_targets - X = X.to(self.device) + past_target = past_target.to(self.device) - targets = self.cast_targets(targets) + future_targets = self.cast_targets(future_targets) - X, criterion_kwargs = self.data_preparation(X, targets) + past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) # training self.optimizer.zero_grad() - outputs = self.model(X) + + outputs = self.model(past_target, scaled_future_targets.float().to(self.device)) outputs = self.rescale_output(outputs, loc=loc, scale=scale) @@ -198,29 +202,29 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, mase_coefficients = list() with torch.no_grad(): - for step, (data, targets) in enumerate(test_loader): - X = data['past_target'] + for step, (data, future_targets) in enumerate(test_loader): + past_target = data['past_target'] mase_coefficients.append(data['mase_coefficient']) - batch_size = X.shape[0] + batch_size = past_target.shape[0] # prepare - X = X.float() + past_target = past_target.float() - X, loc, scale = self.target_scaler(X) + past_target, _, loc, scale = self.target_scaler(past_target) - X = X.to(self.device) + past_target = past_target.to(self.device) - targets = self.cast_targets(targets) + future_targets = self.cast_targets(future_targets) - X, criterion_kwargs = self.data_preparation(X, targets) + past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) - outputs = self.model(X) + outputs = self.model(past_target) outputs_rescaled = self.rescale_output(outputs, loc=loc, scale=scale) - loss = self.criterion(outputs_rescaled, targets) + loss = 
self.criterion(outputs_rescaled, future_targets) loss_sum += loss.item() * batch_size N += batch_size @@ -235,7 +239,7 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, if scale is None: scale = 1. outputs_data.append(outputs * scale + loc) - targets_data.append(targets.detach().cpu()) + targets_data.append(future_targets.detach().cpu()) if writer: writer.add_scalar( From e7e4225af52815d4d49ed3d3c4eaf8f546f95449 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 15 Dec 2021 11:41:30 +0100 Subject: [PATCH 086/347] maint, add LN --- .../setup/network/forecasting_network.py | 3 +- .../forecasting_encoder/MLPEncoder.py | 37 ++++++++++++++++--- .../base_forecasting_encoder.py | 2 +- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index cd69b324c..4edfcfef4 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -81,6 +81,7 @@ def forward(self, else: x_past = targets_past x_past = self.embedding(x_past) + x_past = self.encoder(x_past) x_past = self.decoder(x_past) output = self.head(x_past) return output @@ -274,7 +275,7 @@ def predict(self, loader: torch.utils.data.DataLoader, loc = 0. scale = 1. else: - X, loc, scale = target_scaler(X) + X, _, loc, scale = target_scaler(X) X = X.to(self.device) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py index 05f4d6b08..e28544c3a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py @@ -4,7 +4,7 @@ from torch import nn from ConfigSpace import ConfigurationSpace - +from ConfigSpace.hyperparameters import CategoricalHyperparameter from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_encoder \ @@ -12,17 +12,17 @@ from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_backbone.utils import _activations -from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter class _TimeSeriesMLP(EncoderNetwork): def __init__(self, window_size: int, - module_layers: nn.Module, + mlp_layers: nn.Module, ): super().__init__() self.window_size = window_size - self.module_layers = module_layers + self.mlp_layers = mlp_layers def forward(self, x: torch.Tensor, output_seq: bool = False): """ @@ -47,7 +47,7 @@ def forward(self, x: torch.Tensor, output_seq: bool = False): # we need to ensure that the input size fits the network shape x = x[:, -self.window_size:] # x.shape = (B, self.window, N) x = x.flatten(-2) - return self.module_layers(x) + return self.mlp_layers(x) class MLPEncoder(BaseForecastingEncoder, MLPBackbone): @@ -77,6 +77,26 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: in_features = input_shape[-1] * self.window_size return _TimeSeriesMLP(self.window_size, self._build_backbone(in_features)) + 
def _add_layer(self, layers: List[nn.Module], in_features: int, out_features: int, + layer_id: int) -> None: + """ + Dynamically add a layer given the in->out specification + + Args: + layers (List[nn.Module]): The list where all modules are added + in_features (int): input dimensionality of the new layer + out_features (int): output dimensionality of the new layer + + """ + layers.append(nn.Linear(in_features, out_features)) + if self.config['normalization'] == 'BN': + layers.append(nn.BatchNorm1d(out_features)) + elif self.config['normalization'] == 'LN': + layers.append(nn.LayerNorm(out_features)) + layers.append(_activations[self.config["activation"]]()) + if self.config['use_dropout']: + layers.append(nn.Dropout(self.config["dropout_%d" % layer_id])) + @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None ) -> Dict[str, Union[str, bool]]: @@ -109,14 +129,19 @@ def get_hyperparameter_search_space( default_value=256, log=True ), + normalization: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='normalization', + value_range=('BN', 'LN'), + default_value='BN'), dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dropout", value_range=(0, 0.8), default_value=0.5, ), ) -> ConfigurationSpace: - return MLPBackbone.get_hyperparameter_search_space(dataset_properties=dataset_properties, + cs = MLPBackbone.get_hyperparameter_search_space(dataset_properties=dataset_properties, num_groups=num_groups, activation=activation, use_dropout=use_dropout, num_units=num_units, dropout=dropout) + add_hyperparameter(cs, normalization, CategoricalHyperparameter) + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py index 39be09858..714bc7e77 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -48,7 +48,7 @@ class BaseForecastingEncoder(autoPyTorchComponent): def __init__(self, **kwargs: Any): - super().__init__() + autoPyTorchComponent.__init__(self) self.add_fit_requirements( self._required_fit_arguments ) From 46e4b4008a4dad6884d586b0699da7580fadd203 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 15 Dec 2021 16:03:08 +0100 Subject: [PATCH 087/347] faster forecasting metric computation --- .../setup/network/forecasting_network.py | 20 +++++--- .../components/training/metrics/base.py | 48 +++++++++++++++---- .../components/training/metrics/utils.py | 4 +- .../forecasting_base_trainer.py | 7 +-- 4 files changed, 58 insertions(+), 21 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 4edfcfef4..3c7899681 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -30,6 +30,8 @@ def __init__(self, network_decoder: nn.Module, network_head: nn.Module, n_prediction_steps: int, + encoder_properties: Dict, + decoder_properties: Dict, output_type: str = 'regression', forecast_strategy: Optional[str] = 'mean', num_samples: Optional[int] = 100, @@ -69,6 +71,12 @@ def __init__(self, self.num_samples = num_samples self.aggregation = aggregation + if 
decoder_properties['has_hidden_states']: + if not encoder_properties['has_hidden_states']: + raise ValueError('when decoder contains hidden states, encoder must provide the hidden states ' + 'for decoder!') + self.encoder_has_hidden_states = encoder_properties['has_hidden_states'] + def forward(self, targets_past: torch.Tensor, targets_future: Optional[torch.Tensor] = None, @@ -81,7 +89,10 @@ def forward(self, else: x_past = targets_past x_past = self.embedding(x_past) - x_past = self.encoder(x_past) + if self.encoder_has_hidden_states: + x_past, _ = self.encoder(x_past) + else: + x_past = self.encoder(x_past) x_past = self.decoder(x_past) output = self.head(x_past) return output @@ -217,16 +228,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: f"loss function. However, net_out_type is {self.net_out_type} and " f"required_net_out_put_type is {X['required_net_out_put_type']}") - if X['decoder_properties']['has_hidden_states']: - if not X['encoder_properties']['has_hidden_states']: - raise ValueError('when decoder contains hidden states, encoder must provide the hidden states ' - 'for decoder!') - network_init_kwargs = dict(network_embedding=X['network_embedding'], network_encoder=X['network_encoder'], network_decoder=X['network_decoder'], network_head=X['network_head'], n_prediction_steps=X['dataset_properties']['n_prediction_steps'], + encoder_properties=X['encoder_properties'], + decoder_properties=X['decoder_properties'], output_type=self.net_out_type, forecast_strategy=self.forecast_strategy, num_samples=self.num_samples, diff --git a/autoPyTorch/pipeline/components/training/metrics/base.py b/autoPyTorch/pipeline/components/training/metrics/base.py index 0857244d1..157e9d4c5 100644 --- a/autoPyTorch/pipeline/components/training/metrics/base.py +++ b/autoPyTorch/pipeline/components/training/metrics/base.py @@ -1,5 +1,5 @@ from abc import ABCMeta -from typing import Any, Callable, List, Optional, Dict +from typing import Any, Callable, List, Optional, Dict, Union import numpy as np @@ -7,12 +7,11 @@ from sklearn.utils.multiclass import type_of_target - class autoPyTorchMetric(object, metaclass=ABCMeta): def __init__(self, name: str, - score_func: Callable[..., float], + score_func: Callable[..., Union[float, np.ndarray]], optimum: float, worst_possible_result: float, sign: float, @@ -199,6 +198,7 @@ def __call__( sp: int, n_prediction_steps: int, horizon_weight: Optional[List[float]] = None, + sample_weight: Optional[List[float]] = None, **kwarg: Dict, ) -> float: """Evaluate time series forecastin losses given input data @@ -231,13 +231,41 @@ def __call__( raise ValueError(f"The length of y_true, y_pred and y_train must equal, however, they are " f"{len(y_pred)} and {len(y_true)} respectively") - losses_all = np.ones([len(y_true)]) - for seq_idx in range(len(y_true)): - losses_all[seq_idx] = self._sign * self._metric_func(y_true=y_true[seq_idx], - y_pred=y_pred[seq_idx], - sp=sp, - horizon_weight=horizon_weight, - **self._kwargs) + # we want to compute loss w.r.t. 
each sequence, so the first dimension needs to be n_prediction_steps + + n_outputs = y_true.shape[-1] + + if sample_weight is not None: + if n_outputs != len(sample_weight): + raise ValueError(("There must be equally many custom weights " + "(%d) as outputs (%d).") % + (len(sample_weight), n_outputs)) + + # shape is [n_prediction_steps, n_sequence, n_outputs] + y_true = np.transpose(y_true.reshape((-1, n_prediction_steps, n_outputs)), + (1, 0, 2)) + y_pred = np.transpose(y_pred.reshape((-1, n_prediction_steps, n_outputs)), + (1, 0, 2)) + + # shape is [n_prediction_steps, n_sequence * n_outputs] + y_true = y_true.reshape((n_prediction_steps, -1)) + y_pred = y_pred.reshape((n_prediction_steps, -1)) + + losses_all = self._metric_func(y_true=y_true, + y_pred=y_pred, + sp=sp, + horizon_weight=horizon_weight, + multioutput='raw_values', + **self._kwargs) + + losses_all = losses_all.reshape([-1, n_outputs]) + + # multi output aggregation + if sample_weight is not None: + losses_all = np.sum(losses_all * sample_weight, axis=-1) + else: + losses_all = np.mean(losses_all, -1) + if agg == 'mean': return self._sign * np.mean(losses_all) elif agg == 'median': diff --git a/autoPyTorch/pipeline/components/training/metrics/utils.py b/autoPyTorch/pipeline/components/training/metrics/utils.py index 03cf4cca6..0b1d07a66 100644 --- a/autoPyTorch/pipeline/components/training/metrics/utils.py +++ b/autoPyTorch/pipeline/components/training/metrics/utils.py @@ -125,9 +125,9 @@ def calculate_score( if metric_ in MASE_LOSSES and 'mase_cofficient' in score_kwargs: target_scaled = target * score_kwargs['mase_cofficient'] cprediction_scaled = cprediction * score_kwargs['mase_cofficient'] - score_dict[metric_.name] = metric_(target_scaled, cprediction_scaled, **score_kwargs) + score_dict[metric_.name] = metric_._sign * metric_(target_scaled, cprediction_scaled, **score_kwargs) else: - score_dict[metric_.name] = metric_(target, cprediction, **score_kwargs) + score_dict[metric_.name] = metric_._sign * metric_(target, cprediction, **score_kwargs) elif task_type in REGRESSION_TASKS: cprediction = sanitize_array(prediction) for metric_ in metrics: diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index c85d37a18..8766d87cf 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -154,7 +154,6 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor past_target, scaled_future_targets, loc, scale = self.target_scaler(past_target, future_targets) else: past_target, _, loc, scale = self.target_scaler(past_target) - scaled_future_targets = future_targets past_target = past_target.to(self.device) @@ -164,8 +163,10 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor # training self.optimizer.zero_grad() - - outputs = self.model(past_target, scaled_future_targets.float().to(self.device)) + if self.model.future_target_required: + outputs = self.model(past_target, scaled_future_targets.float().to(self.device)) + else: + outputs = self.model(past_target) outputs = self.rescale_output(outputs, loc=loc, scale=scale) From b741a4f1f7eb3d6c5bc9233d1f3c048d7b5c3cf5 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 15 Dec 2021 17:39:48 +0100 Subject: [PATCH 088/347] 
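A small numpy sketch of the reshaping that makes the metric computation above vectorized: the flat arrays are regrouped so that a single call scores every sequence and output column at once (a plain mean absolute error stands in for the sktime-style metric function used by the class; all names below are illustrative):

import numpy as np

n_seq, n_prediction_steps, n_outputs = 3, 4, 2

# flat arrays as they arrive: one block of n_prediction_steps rows per sequence
y_true = np.arange(n_seq * n_prediction_steps * n_outputs, dtype=float).reshape(-1, n_outputs)
y_pred = y_true + 0.5

# [n_seq * n_prediction_steps, n_outputs] -> [n_prediction_steps, n_seq, n_outputs]
y_true_h = np.transpose(y_true.reshape(-1, n_prediction_steps, n_outputs), (1, 0, 2))
y_pred_h = np.transpose(y_pred.reshape(-1, n_prediction_steps, n_outputs), (1, 0, 2))

# flatten sequences and outputs so one vectorized call scores every column at once
y_true_h = y_true_h.reshape(n_prediction_steps, -1)
y_pred_h = y_pred_h.reshape(n_prediction_steps, -1)

# per-column error over the horizon axis, then fold back to per-sequence losses
per_column = np.abs(y_true_h - y_pred_h).mean(axis=0)          # shape [n_seq * n_outputs]
per_sequence = per_column.reshape(-1, n_outputs).mean(axis=-1)  # mean over outputs
print(per_sequence.mean())  # 0.5, the 'mean' aggregation over sequences

The per-sequence losses are then aggregated with mean or median, exactly as in the __call__ above, instead of looping over every sequence separately.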
allow RNN as decoder --- .../setup/network/forecasting_network.py | 18 +++++++++---- .../forecasting_decoder/MLPDecoder.py | 2 +- .../base_forecasting_decoder.py | 5 ---- .../forecasting_network_head/distribution.py | 7 ++--- .../forecasting_head.py | 5 +++- .../forecasting_base_trainer.py | 27 ++++++++++++------- 6 files changed, 38 insertions(+), 26 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 3c7899681..4211a65d2 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -102,9 +102,15 @@ def pred_from_net_output(self, net_output): return net_output elif self.output_type == 'distribution': if self.forecast_strategy == 'mean': - return net_output.mean + if isinstance(net_output, list): + return torch.cat([dist.mean for dist in net_output], dim=-2) + else: + return net_output.mean elif self.forecast_strategy == 'sample': - samples = net_output.sample((self.num_samples,)) + if isinstance(net_output, list): + samples = torch.cat([dist.sample((self.num_samples,)) for dist in net_output], dim=-2) + else: + samples = net_output.sample((self.num_samples,)) if self.aggregation == 'mean': return torch.mean(samples, dim=0) elif self.aggregation == 'median': @@ -158,13 +164,12 @@ def forward(self, x_future, _ = self.decoder(x_future, hidden_states) net_output = self.head(x_future) + return net_output else: all_predictions = [] predicted_target = targets_past[:, [-1]] - dist_keys = None - _, hidden_states = self.encoder(x_past) for idx_pred in range(self.n_prediction_steps): x_future = predicted_target if features_future is None else torch.cat( @@ -173,10 +178,13 @@ def forward(self, x_future, hidden_states = self.decoder(x_future, hidden_states) net_output = self.head(x_future[:, -1:, ]) - predicted_target = self.pred_from_net_output(net_output) + predicted_target = self.pred_from_net_output(net_output).to(targets_past.device) all_predictions.append(net_output) + if self.output_type != 'distribution': + all_predictions = torch.cat(all_predictions, dim=1) + return all_predictions def predict(self, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py index 9bb669910..027202cd2 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py @@ -140,5 +140,5 @@ def get_hyperparameter_search_space( add_hyperparameter(cs, units_final_layer, UniformIntegerHyperparameter) # TODO let dataset_properties decide if auto_regressive models is applicable - add_hyperparameter(cs, auto_regressive, CategoricalHyperparameter) + # add_hyperparameter(cs, auto_regressive, CategoricalHyperparameter) return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py index bdb81601e..7535dc7d3 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -67,11 +67,6 @@ def fit(self, X: Dict[str, Any], y: 
Any = None) -> BaseEstimator: X.update({"auto_regressive": auto_regressive}) - # for information about encoder_properties, please check - # autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_backbone - # TODO create a separate module so that users could know what is contained in encoder_properties - - # TODO consider Auto-regressive model on vanilla network head if auto_regressive: self.n_prediction_heads = 1 diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py index a964d83c0..f303ea7d0 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py @@ -92,10 +92,7 @@ def forward(self, x: torch.Tensor) -> torch.distributions: with shape (batch_size, n_prediction_steps, output_shape) """ params_unbounded = [proj(x) for proj in self.proj] - - # TODO consider how to handle network parameter issues (or register a hook) - parameters_bounded = self.domain_map(*params_unbounded) - return self.dist_cls(*parameters_bounded) + return self.dist_cls(*self.domain_map(*params_unbounded)) @property @abstractmethod @@ -159,7 +156,7 @@ def domain_map(self, concentration1: torch.Tensor, concentration0: torch.Tensor) @property def dist_cls(self) -> type(Distribution): - # TODO there is a bug with Beta implementation!!! + # TODO consider constraints on Beta!!! return Beta diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 98dcdf801..301bb9c7c 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -147,7 +147,10 @@ def build_proj_layer(input_shape: Tuple[int, ...], auto_regressive=auto_regressive) return proj_layer elif net_out_put_type == 'regression': - proj_layer = nn.Sequential(nn.Unflatten(-1, (n_prediction_heads, input_shape)), + if auto_regressive: + proj_layer = nn.Sequential(nn.Linear(input_shape, np.product(output_shape[1:]))) + else: + proj_layer = nn.Sequential(nn.Unflatten(-1, (n_prediction_heads, input_shape)), nn.Linear(input_shape, np.product(output_shape[1:])), # nn.Unflatten(-1, tuple(output_shape)), ) diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 8766d87cf..61e63a4cd 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -116,21 +116,22 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, def rescale_output(self, outputs: Union[torch.distributions.Distribution, torch.Tensor], loc: Optional[torch.Tensor], - scale: Optional[torch.Tensor]): + scale: Optional[torch.Tensor], + device: torch.device = torch.device('cpu')): # https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/torch/modules/distribution_output.py if loc is not None or scale is not None: if isinstance(outputs, 
torch.distributions.Distribution): - transform = AffineTransform(loc=0.0 if loc is None else loc.to(self.device), - scale=1.0 if scale is None else scale.to(self.device), + transform = AffineTransform(loc=0.0 if loc is None else loc.to(device), + scale=1.0 if scale is None else scale.to(device), ) outputs = TransformedDistribution(outputs, [transform]) else: if loc is None: - outputs = outputs * scale.to(self.device) + outputs = outputs * scale.to(device) elif scale is None: - outputs = outputs + loc.to(self.device) + outputs = outputs + loc.to(device) else: - outputs = outputs * scale.to(self.device) + loc.to(self.device) + outputs = outputs * scale.to(device) + loc.to(device) return outputs def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor) \ @@ -168,7 +169,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor else: outputs = self.model(past_target) - outputs = self.rescale_output(outputs, loc=loc, scale=scale) + outputs = self.rescale_output(outputs, loc=loc, scale=scale, device=self.device) loss_func = self.criterion_preparation(**criterion_kwargs) loss = loss_func(self.criterion, outputs) @@ -223,9 +224,17 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, outputs = self.model(past_target) - outputs_rescaled = self.rescale_output(outputs, loc=loc, scale=scale) + if isinstance(outputs, list): + outputs_rescaled = [self.rescale_output(output, + loc=loc, + scale=scale, + device=self.device) for output in outputs] - loss = self.criterion(outputs_rescaled, future_targets) + loss = [self.criterion(output_rescaled, future_targets) for output_rescaled in outputs_rescaled] + loss = torch.mean(torch.Tensor(loss)) + else: + outputs_rescaled = self.rescale_output(outputs, loc=loc, scale=scale, device=self.device) + loss = self.criterion(outputs_rescaled, future_targets) loss_sum += loss.item() * batch_size N += batch_size From ca7ef909642f77da79fc3df8c8584699ad54179e Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 16 Dec 2021 12:07:31 +0100 Subject: [PATCH 089/347] remvoe pdb --- .../setup/network_backbone/forecasting_encoder/TCNEncoder.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py index 298ffdf95..3259e97a8 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py @@ -90,8 +90,6 @@ def __init__(self, num_inputs: int, num_channels: List[int], kernel_size: int = self.network = nn.Sequential(*layers) def forward(self, x: torch.Tensor, output_seq=False) -> torch.Tensor: - import pdb - pdb.set_trace() # swap sequence and feature dimensions for use with convolutional nets x = x.transpose(1, 2).contiguous() x = self.network(x) From d80d705c60ef897a44142ea3fabfc3a09cebd73c Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 16 Dec 2021 12:23:44 +0100 Subject: [PATCH 090/347] inception net accept argument for seq output --- .../forecasting_encoder/InceptionTimeEncoder.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py index 9e128531f..c6324200a 100644 --- 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py @@ -110,7 +110,7 @@ def __init__(self, n_res_inputs = n_res_outputs n_inputs = block.get_n_outputs() - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor, output_seq=False) -> torch.Tensor: # swap sequence and feature dimensions for use with convolutional nets x = x.transpose(1, 2).contiguous() res = x @@ -120,7 +120,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.__getattr__(f"residual_block_{i}")(x, res) res = x x = x.transpose(1, 2).contiguous() - return x + if output_seq: + return x + else: + return x[:, -1, :] class InceptionTimeEncoder(BaseForecastingEncoder): From 23d4cc556fc9704d03e07d6ae62963be2a2a0f95 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 16 Dec 2021 12:32:12 +0100 Subject: [PATCH 091/347] fix forbidden clauses --- .../pipeline/time_series_forecasting.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 932261a61..cd070e8b8 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -244,19 +244,19 @@ def _get_hyperparameter_search_space(self, cs.add_forbidden_clauses(forbidden_losses_all) # rnn head only allow rnn backbone - if 'network_backbone' in self.named_steps.keys() and 'network_head' in self.named_steps.keys(): - hp_backbone_choice = cs.get_hyperparameter('network_backbone:__choice__') - hp_head_choice = cs.get_hyperparameter('network_head:__choice__') + if 'network_encoder' in self.named_steps.keys() and 'network_decoder' in self.named_steps.keys(): + hp_encoder_choice = cs.get_hyperparameter('network_encoder:__choice__') + hp_decoder_choice = cs.get_hyperparameter('network_decoder:__choice__') - if 'ForecastingRNNHeader' in hp_head_choice.choices: - if len(hp_head_choice.choices) == 1 and 'RNNBackbone' not in hp_backbone_choice.choices: + if 'RNNEncoder' in hp_encoder_choice.choices: + if len(hp_decoder_choice.choices) == 1 and 'RNNDecoder' not in hp_decoder_choice.choices: raise ValueError("RNN Header is only compatible with RNNBackbone, RNNHead is not allowed to be " "the only network head choice if the backbone choices do not contain RNN!") - backbone_choices = [choice for choice in hp_backbone_choice.choices if choice != 'RNNBackbone'] - forbidden_clause_backbone = ForbiddenInClause(hp_backbone_choice, backbone_choices) - forbidden_clause_head = ForbiddenEqualsClause(hp_head_choice, 'ForecastingRNNHeader') + encoder_choices = [choice for choice in hp_encoder_choice.choices if choice != 'RNNEncoder'] + forbidden_clause_encoder = ForbiddenInClause(hp_encoder_choice, encoder_choices) + forbidden_clause_decoder = ForbiddenEqualsClause(hp_decoder_choice, 'RNNDecoder') - cs.add_forbidden_clause(ForbiddenAndConjunction(forbidden_clause_backbone, forbidden_clause_head)) + cs.add_forbidden_clause(ForbiddenAndConjunction(forbidden_clause_encoder, forbidden_clause_decoder)) cs.get_hyperparameter_names() self.configuration_space = cs From 23806a9d33421c861f679c729d8ca4b70d8da606 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 17 Dec 2021 16:06:35 +0100 Subject: [PATCH 092/347] DeepAR --- .../setup/network/forecasting_network.py | 121 +++++++++++++++++- .../forecasting_decoder/MLPDecoder.py | 11 +- 
.../forecasting_encoder/MLPEncoder.py | 2 +- .../forecasting_base_trainer.py | 40 +++--- .../pipeline/time_series_forecasting.py | 22 +++- 5 files changed, 169 insertions(+), 27 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 4211a65d2..ceb089d2e 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Union, Tuple +from typing import Any, Dict, Optional, Union, Tuple, List from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter @@ -76,6 +76,7 @@ def __init__(self, raise ValueError('when decoder contains hidden states, encoder must provide the hidden states ' 'for decoder!') self.encoder_has_hidden_states = encoder_properties['has_hidden_states'] + self.decoder_has_hidden_states = decoder_properties['has_hidden_states'] def forward(self, targets_past: torch.Tensor, @@ -164,7 +165,6 @@ def forward(self, x_future, _ = self.decoder(x_future, hidden_states) net_output = self.head(x_future) - return net_output else: all_predictions = [] @@ -176,7 +176,7 @@ def forward(self, [predicted_target, features_future[:, [idx_pred], :]], dim=-1) - x_future, hidden_states = self.decoder(x_future, hidden_states) + x_future, hidden_states = self.decoder(x_future, hx=hidden_states) net_output = self.head(x_future[:, -1:, ]) predicted_target = self.pred_from_net_output(net_output).to(targets_past.device) @@ -197,6 +197,117 @@ def predict(self, return self.pred_from_net_output(net_output) +class ForecastingDeepARNet(ForecastingNet): + future_target_required = True + + def __init__(self, + **kwargs): + """ + Forecasting network with DeepAR structure. + + This structure is activate when the decoder is not recurrent (MLP) and its hyperparameter "auto_regressive" is + set as True. We train the network to let it do a one-step prediction. This structure is compatible with any + sorts of encoder (except MLP). 
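A minimal, self-contained sketch of the sample-then-aggregate idea behind this DeepAR-style network (plain torch.distributions rather than the class above; the num_samples and aggregation settings only mirror the hyperparameters of the same name for illustration):

import torch
from torch.distributions import Normal

# toy predictive distribution for one forecasting step:
# a batch of 4 series with a single target dimension
step_dist = Normal(torch.zeros(4, 1, 1), torch.ones(4, 1, 1))

num_samples = 100          # how many trajectories to draw
aggregation = 'median'     # or 'mean'

# draw sampled trajectories: shape [num_samples, 4, 1, 1]
samples = step_dist.sample((num_samples,))

# aggregate the sampled trajectories into a point forecast
if aggregation == 'mean':
    point_forecast = samples.mean(dim=0)
else:
    point_forecast = samples.median(dim=0)[0]

print(point_forecast.shape)  # torch.Size([4, 1, 1])

In the class that follows, the autoregressive loop additionally feeds each drawn sample back in as the next decoder input; the aggregation step at the end is the same.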
+ """ + super(ForecastingDeepARNet, self).__init__(**kwargs) + # this determines the training targets + self.encoder_bijective_seq_output = kwargs['encoder_properties']['bijective_seq_output'] + + def forward(self, + targets_past: torch.Tensor, + targets_future: Optional[torch.Tensor] = None, + features_past: Optional[torch.Tensor] = None, + features_future: Optional[torch.Tensor] = None, + features_static: Optional[torch.Tensor] = None, + hidden_states: Optional[Tuple[torch.Tensor]] = None): + x_past = targets_past if features_past is None else torch.cat([targets_past, features_past], dim=-1) + + # TODO consider static features + x_past = self.embedding(x_past) + + if self.training: + x_future = targets_future if features_future is None else torch.cat([targets_future, features_future], + dim=-1) + x_future = self.embedding(x_future) + + x_input = torch.cat([x_past, x_future[:, :-1]], dim=1) + + if self.encoder_has_hidden_states: + x_input, _ = self.encoder(x_input, output_seq=True) + else: + x_input = self.encoder(x_input, output_seq=True) + + net_output = self.head(self.decoder(x_input)) + return net_output + else: + all_predictions = [] + batch_size = targets_past.shape[0] + + if self.encoder_has_hidden_states: + # For RNN, we only feed the hidden state and generated future input to the netwrok + encoder_output, hidden_states = self.encoder(x_past) + repeated_state = [ + s.repeat_interleave(repeats=self.num_samples, dim=1) + for s in hidden_states + ] + + else: + # For other models, the full past targets are passed to the network. + encoder_output = self.encoder(x_past) + repeated_past_target = targets_past.repeat_interleave(repeats=self.num_samples, dim=0).squeeze(1) + + repeated_static_feat = features_static.repeat_interleave( + repeats=self.num_samples, dim=0 + ).unsqueeze(dim=1) if features_static is not None else None + + repeated_time_feat = features_future.repeat_interleave( + repeats=self.num_samples, dim=0 + ) if features_future is not None else None + + net_output = self.head(self.decoder(encoder_output)) + + next_sample = net_output.sample(sample_shape=(self.num_samples,)) + + next_sample = next_sample.transpose(0, 1).reshape( + (next_sample.shape[0] * next_sample.shape[1], 1, -1) + ) + + all_predictions.append(next_sample) + + for k in range(1, self.n_prediction_steps): + x_next = next_sample if repeated_time_feat is None else torch.cat([next_sample, + repeated_time_feat[:, k:k + 1]], + dim=-1) + if self.encoder_has_hidden_states: + encoder_output, repeated_state = self.encoder(x_next, hx=repeated_state) + else: + x_next = torch.cat([repeated_past_target, x_next], dim=1) + encoder_output = self.encoder(x_next) + # for training, the encoder output a sequence. 
Thus for prediction, the network should have the same + # format output + encoder_output = torch.unsqueeze(encoder_output, 1) + + net_output = self.head(self.decoder(encoder_output)) + + next_sample = net_output.sample() + all_predictions.append(next_sample) + + all_predictions = torch.cat(all_predictions, dim=1).unflatten(0, (batch_size, self.num_samples)) + + return all_predictions + + def pred_from_net_output(self, net_output: torch.Tensor): + if not self.output_type == 'distribution' and self.forecast_strategy == 'sample': + raise ValueError(f"A DeepAR network must have output type as Distribution and forecast_strategy as sample," + f"but this network has {self.output_type} and {self.forecast_strategy}") + if self.aggregation == 'mean': + return torch.mean(net_output, dim=1) + elif self.aggregation == 'median': + return torch.median(net_output, dim=1)[0] + else: + raise ValueError(f'Unknown aggregation: {self.aggregation}') + + class ForecastingNetworkComponent(NetworkComponent): def __init__( self, @@ -253,7 +364,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: self.network = ForecastingSeq2SeqNet(**network_init_kwargs) elif X['auto_regressive']: # decoder is MLP and auto_regressive, we have deep AR model - raise NotImplementedError + self.network = ForecastingDeepARNet(**network_init_kwargs) else: self.network = ForecastingNet(**network_init_kwargs) @@ -317,7 +428,7 @@ def get_hyperparameter_search_space( ), forecast_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='forecast_strategy', value_range=('sample', 'mean'), - default_value='mean'), + default_value='sample'), num_samples: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='num_samples', value_range=(50, 200), default_value=100), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py index 027202cd2..b2c4ad222 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py @@ -1,11 +1,11 @@ from abc import ABC -from typing import Dict, Optional, Tuple, Union, List +from typing import Dict, Optional, Tuple, Union, Any from torch import nn from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter -from ConfigSpace.conditions import GreaterThanCondition +from ConfigSpace.conditions import GreaterThanCondition, EqualsCondition from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_head.utils import _activations @@ -74,7 +74,7 @@ def get_hyperparameter_search_space( log=True), auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="auto_regressive", value_range=(True, False), - default_value=False) + default_value=True), ) -> ConfigurationSpace: """ Builds the mlp head layer. The decoder implementation follows the idea from: @@ -99,6 +99,9 @@ def get_hyperparameter_search_space( units_final_layer (HyperparameterSearchSpace): number of units of final layer. 
The size of this layer is smaller as it needs to be expanded to adapt to the number of predictions auto_regressive (HyperparameterSearchSpace): if the model acts as a DeepAR model + deepar_n_samples (HyperparameterSearchSpace) activate when auto_regressive is True, how many points to + sample when doing deepAR prediction (we note that this hyperparameters are only applied to generate new + future distribution in the future, but it does not influence the way that network makes prediction) Returns: cs (ConfigurationSpace): ConfigurationSpace """ @@ -140,5 +143,5 @@ def get_hyperparameter_search_space( add_hyperparameter(cs, units_final_layer, UniformIntegerHyperparameter) # TODO let dataset_properties decide if auto_regressive models is applicable - # add_hyperparameter(cs, auto_regressive, CategoricalHyperparameter) + add_hyperparameter(cs, auto_regressive, CategoricalHyperparameter) return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py index e28544c3a..f3955407b 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py @@ -40,7 +40,7 @@ def forward(self, x: torch.Tensor, output_seq: bool = False): """ if output_seq: - x = x.unfold((1, self.window_size, 1)).transpose(-1, -2) + x = x.unfold(1, self.window_size, 1).transpose(-1, -2) # x.shape = [B, L_in - self.window + 1, self.window, N] else: if x.shape[1] > self.window_size: diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 61e63a4cd..eb7fd752f 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -7,6 +7,7 @@ import pandas as pd import torch +import torch.nn.functional as F from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler from torch.utils.tensorboard.writer import SummaryWriter @@ -21,7 +22,7 @@ from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. 
\ TargetNoScaler import TargetNoScaler from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit -from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNet +from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNet, ForecastingDeepARNet from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score @@ -159,8 +160,11 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor past_target = past_target.to(self.device) future_targets = self.cast_targets(future_targets) - - past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) + if isinstance(self.model, ForecastingDeepARNet) and self.model.encoder_bijective_seq_output: + future_targets = torch.cat([past_target[:, 1:, ], future_targets], dim=1) + past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) + else: + past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) # training self.optimizer.zero_grad() @@ -224,23 +228,29 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, outputs = self.model(past_target) - if isinstance(outputs, list): - outputs_rescaled = [self.rescale_output(output, - loc=loc, - scale=scale, - device=self.device) for output in outputs] - - loss = [self.criterion(output_rescaled, future_targets) for output_rescaled in outputs_rescaled] - loss = torch.mean(torch.Tensor(loss)) - else: + if isinstance(self.model, ForecastingDeepARNet): + # DeepAR only generate sampled points, we replace log_prob loss with MSELoss + outputs = self.model.pred_from_net_output(outputs) outputs_rescaled = self.rescale_output(outputs, loc=loc, scale=scale, device=self.device) - loss = self.criterion(outputs_rescaled, future_targets) + loss = F.mse_loss(outputs_rescaled, future_targets) + outputs = outputs.detach().cpu() + else: + if isinstance(outputs, list): + outputs_rescaled = [self.rescale_output(output, + loc=loc, + scale=scale, + device=self.device) for output in outputs] + + loss = [self.criterion(output_rescaled, future_targets) for output_rescaled in outputs_rescaled] + loss = torch.mean(torch.Tensor(loss)) + else: + outputs_rescaled = self.rescale_output(outputs, loc=loc, scale=scale, device=self.device) + loss = self.criterion(outputs_rescaled, future_targets) + outputs = self.model.pred_from_net_output(outputs).detach().cpu() loss_sum += loss.item() * batch_size N += batch_size - outputs = self.model.pred_from_net_output(outputs).detach().cpu() - if loc is None and scale is None: outputs_data.append(outputs) else: diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index cd070e8b8..cc82c5649 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -241,15 +241,33 @@ def _get_hyperparameter_search_space(self, forbidden_hp_dist = ForbiddenAndConjunction(forbidden_hp_dist, forbidden_hp_loss) forbidden_losses_all.append(forbidden_hp_dist) + network_encoder_hp = cs.get_hyperparameter('network_encoder:__choice__') + if 'MLPEncoder' in network_encoder_hp.choices: + for hp_ar in hp_auto_regressive: + forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, True) + forbidden_hp_mlpencoder = ForbiddenEqualsClause(network_encoder_hp, 'MLPEncoder') + forbidden_hp_ar_mlp = ForbiddenAndConjunction(forbidden_hp_ar, forbidden_hp_mlpencoder) + forbidden_losses_all.append(forbidden_hp_ar_mlp) + + 
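The restriction added here follows a generic ConfigSpace pattern: two ForbiddenEqualsClause objects joined by a ForbiddenAndConjunction rule out one combination of values. A stripped-down sketch with made-up hyperparameter names:

from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter
from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause

cs = ConfigurationSpace()
encoder = CategoricalHyperparameter('encoder', ['MLPEncoder', 'RNNEncoder'])
auto_regressive = CategoricalHyperparameter('auto_regressive', [True, False])
cs.add_hyperparameters([encoder, auto_regressive])

# forbid the auto-regressive (DeepAR-style) decoder together with an MLP encoder
cs.add_forbidden_clause(
    ForbiddenAndConjunction(
        ForbiddenEqualsClause(auto_regressive, True),
        ForbiddenEqualsClause(encoder, 'MLPEncoder'),
    )
)

# sampled configurations never contain the forbidden combination
print(cs.sample_configuration())

Sampling from such a space is how the pipeline below keeps an auto-regressive MLP decoder from being paired with an MLP encoder or with the 'mean' forecast strategy.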
forecast_strategy = cs.get_hyperparameter('network:forecast_strategy') + if 'mean' in forecast_strategy.choices: + for hp_ar in hp_auto_regressive: + forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, True) + forbidden_hp_forecast_strategy = ForbiddenEqualsClause(forecast_strategy, 'mean') + forbidden_hp_ar_forecast_strategy = ForbiddenAndConjunction(forbidden_hp_ar, + forbidden_hp_forecast_strategy) + forbidden_losses_all.append(forbidden_hp_ar_forecast_strategy) + cs.add_forbidden_clauses(forbidden_losses_all) + # rnn head only allow rnn backbone if 'network_encoder' in self.named_steps.keys() and 'network_decoder' in self.named_steps.keys(): hp_encoder_choice = cs.get_hyperparameter('network_encoder:__choice__') hp_decoder_choice = cs.get_hyperparameter('network_decoder:__choice__') - if 'RNNEncoder' in hp_encoder_choice.choices: - if len(hp_decoder_choice.choices) == 1 and 'RNNDecoder' not in hp_decoder_choice.choices: + if 'RNNDecoder' in hp_decoder_choice.choices: + if len(hp_decoder_choice.choices) == 1 and 'RNNEncoder' not in hp_encoder_choice.choices: raise ValueError("RNN Header is only compatible with RNNBackbone, RNNHead is not allowed to be " "the only network head choice if the backbone choices do not contain RNN!") encoder_choices = [choice for choice in hp_encoder_choice.choices if choice != 'RNNEncoder'] From 732bfa94817755f25f5a845b7db72aaebfeed84c Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 17 Dec 2021 16:20:39 +0100 Subject: [PATCH 093/347] allow 100 epochs for forecasting tasks --- autoPyTorch/api/base_task.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index e4ab30f37..f4f989e63 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -939,6 +939,8 @@ def _search( raise ValueError("Budget type must be one ('epochs', 'runtime')" f" yet {budget_type} was provided") self.pipeline_options['budget_type'] = budget_type + if time_series_forecasting and budget_type is not 'epochs': + self.pipeline_options['epochs'] = 100 # Here the budget is set to max because the SMAC intensifier can be: # Hyperband: in this case the budget is determined on the fly and overwritten From 4a27c0b51a65b1416735dede678301d0acc39100 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sat, 18 Dec 2021 12:55:54 +0100 Subject: [PATCH 094/347] new fidelity and fix on resolution --- autoPyTorch/api/base_task.py | 4 +- autoPyTorch/constants_forecasting.py | 3 ++ autoPyTorch/evaluation/abstract_evaluator.py | 13 ++++++- autoPyTorch/evaluation/tae.py | 7 ++-- autoPyTorch/optimizer/smbo.py | 4 +- .../forecasting_decoder/MLPDecoder.py | 2 +- .../forecasting_encoder/MLPEncoder.py | 35 ++++++++++++++--- .../time_series_forecasting_data_loader.py | 39 ++++++++++++------- .../components/training/trainer/__init__.py | 13 ++++--- .../trainer/forecasting_trainer/__init__.py | 16 ++++++++ 10 files changed, 103 insertions(+), 33 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index f4f989e63..5e4317736 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -62,6 +62,7 @@ from autoPyTorch.utils.pipeline import get_configuration_space, get_dataset_requirements from autoPyTorch.utils.single_thread_client import SingleThreadedClient from autoPyTorch.utils.stopwatch import StopWatch +from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE def _pipeline_predict(pipeline: BasePipeline, @@ -935,7 +936,8 @@ def _search( # Incorporate budget to pipeline config - 
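A hedged sketch of how the three forecasting fidelities introduced in this commit can be read: each maps a fractional budget in (0, 1] to a single data-loading setting. The returned keys follow the data-loader arguments used further down in this patch (sample_interval, fraction_seq, fraction_samples_per_seq); the function itself is purely illustrative:

import numpy as np

def forecasting_fidelity(budget_type: str, budget: float) -> dict:
    """Map a fractional budget (0 < budget <= 1) to one data-loading fidelity."""
    if budget_type == 'resolution':
        # lower budget -> coarser time resolution, i.e. keep every k-th time step
        return {'sample_interval': int(np.ceil(1.0 / budget))}
    if budget_type == 'num_seq':
        # train on only a fraction of the sequences
        return {'fraction_seq': budget}
    if budget_type == 'num_sample_per_seq':
        # train on only a fraction of the windows inside each sequence
        return {'fraction_samples_per_seq': budget}
    raise ValueError(f'unknown budget type {budget_type}')

print(forecasting_fidelity('resolution', 0.25))  # {'sample_interval': 4}
print(forecasting_fidelity('num_seq', 0.5))      # {'fraction_seq': 0.5}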
if budget_type not in ('epochs', 'runtime') and (budget_type == 'resolution' and not time_series_forecasting): + if budget_type not in ('epochs', 'runtime') and (budget_type in FORECASTING_BUDGET_TYPE + and not time_series_forecasting): raise ValueError("Budget type must be one ('epochs', 'runtime')" f" yet {budget_type} was provided") self.pipeline_options['budget_type'] = budget_type diff --git a/autoPyTorch/constants_forecasting.py b/autoPyTorch/constants_forecasting.py index 2dfe4722a..3b5a355ca 100644 --- a/autoPyTorch/constants_forecasting.py +++ b/autoPyTorch/constants_forecasting.py @@ -1,6 +1,9 @@ # The cosntant values for time series forecasting comes from # https://github.com/rakshitha123/TSForecasting/blob/master/experiments/deep_learning_experiments.py # seasonality map, maps a frequency value to a number + +FORECASTING_BUDGET_TYPE = ['resolution', 'num_seq', 'num_sample_per_seq'] + SEASONALITY_MAP = { "minutely": [1440, 10080, 525960], "10_minutes": [144, 1008, 52596], diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index eb8f7698a..52e011593 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -52,6 +52,7 @@ from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger from autoPyTorch.utils.pipeline import get_dataset_requirements +from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE __all__ = [ 'AbstractEvaluator', @@ -636,9 +637,17 @@ def _init_fit_dictionary( self.fit_dictionary['sample_interval'] = int(np.ceil(1.0 / self.budget)) self.fit_dictionary.pop('epochs', None) self.fit_dictionary.pop('runtime', None) + elif self.budget_type == 'num_seq': + self.fit_dictionary['sample_interval'] = self.budget + self.fit_dictionary.pop('epochs', None) + self.fit_dictionary.pop('runtime', None) + elif self.budget_type == 'num_sample_per_seq': + self.fit_dictionary['fraction_samples_per_seq'] = self.budget + self.fit_dictionary.pop('epochs', None) + self.fit_dictionary.pop('runtime', None) else: - raise ValueError(f"budget type must be `epochs` or `runtime` or 'resolution' (Only used in forecasting " - f"taskss), but got {self.budget_type}") + raise ValueError(f"budget type must be `epochs` or `runtime` or {FORECASTING_BUDGET_TYPE} " + f"(Only used in forecasting taskss), but got {self.budget_type}") def _get_pipeline(self) -> BaseEstimator: diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 52dd1e229..dd5b96b15 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -30,6 +30,7 @@ from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger from autoPyTorch.utils.parallel import preload_modules +from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE def fit_predict_try_except_decorator( @@ -220,7 +221,7 @@ def run_wrapper( elif run_info.budget <= 0 or run_info.budget > 100: raise ValueError('Illegal value for budget, must be >0 and <=100, but is %f' % run_info.budget) - elif self.budget_type == 'resolution': + elif self.budget_type in FORECASTING_BUDGET_TYPE: if run_info.budget == 0: run_info = run_info._replace(budget=1.0) elif run_info.budget <= 0 or run_info.budget > 1.: @@ -228,8 +229,8 @@ def run_wrapper( run_info.budget) else: 
raise ValueError("Illegal value for budget type, must be one of " - "('epochs', 'runtime', 'resolution'), but is : %s" % - self.budget_type) + f"('epochs', 'runtime', or {FORECASTING_BUDGET_TYPE} (for forecasting tasks)), " + f"but is : {self.budget_type}") remaining_time = self.stats.get_remaing_time_budget() diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 6ffd023bb..20c5cffd9 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -34,6 +34,8 @@ from autoPyTorch.utils.logging_ import get_named_client_logger from autoPyTorch.utils.stopwatch import StopWatch +from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE + def get_smac_object( scenario_dict: Dict[str, Any], @@ -356,7 +358,7 @@ def run_smbo(self, func: Optional[Callable] = None scenario_dict.update(self.smac_scenario_args) budget_type = self.pipeline_config['budget_type'] - if budget_type == 'resolution': + if budget_type in FORECASTING_BUDGET_TYPE: if self.min_budget > 1. or self.max_budget > 1.: self.min_budget = self.min_budget / self.max_budget self.max_budget = 1.0 diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py index b2c4ad222..d4b0a2c05 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py @@ -74,7 +74,7 @@ def get_hyperparameter_search_space( log=True), auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="auto_regressive", value_range=(True, False), - default_value=True), + default_value=False), ) -> ConfigurationSpace: """ Builds the mlp head layer. 
The decoder implementation follows the idea from: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py index f3955407b..f55f3e780 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py @@ -2,6 +2,7 @@ import torch from torch import nn +import torch.nn.functional as F from ConfigSpace import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter @@ -19,10 +20,14 @@ class _TimeSeriesMLP(EncoderNetwork): def __init__(self, window_size: int, mlp_layers: nn.Module, + fill_lower_resolution_seq: bool = False, + fill_kwargs: Dict = {}, ): super().__init__() self.window_size = window_size self.mlp_layers = mlp_layers + self.fill_lower_resolution_seq = fill_lower_resolution_seq + self.fill_interval = fill_kwargs.get('loader_sample_interval', 1) def forward(self, x: torch.Tensor, output_seq: bool = False): """ @@ -46,6 +51,14 @@ def forward(self, x: torch.Tensor, output_seq: bool = False): if x.shape[1] > self.window_size: # we need to ensure that the input size fits the network shape x = x[:, -self.window_size:] # x.shape = (B, self.window, N) + if self.fill_lower_resolution_seq and x.shape[1] < self.window_size: + + x = F.conv_transpose1d(x.transpose(1, 2), + F.pad(torch.ones((1, 1, 1)), (1, 1)), + stride=self.fill_interval, + padding=1).transpose(1, 2) + if x.shape[1] < self.window_size: + x = torch.cat([torch.zeros(x.shape[0], self.window_size - x.shape[1], x.shape[2]), x], dim=1) x = x.flatten(-2) return self.mlp_layers(x) @@ -53,6 +66,8 @@ def forward(self, x: torch.Tensor, output_seq: bool = False): class MLPEncoder(BaseForecastingEncoder, MLPBackbone): _fixed_seq_length = True window_size = 1 + fill_lower_resolution_seq = False + fill_kwargs = {} @property def encoder_properties(self): @@ -71,11 +86,19 @@ def _required_fit_arguments(self) -> List[FitRequirement]: def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.window_size = X["window_size"] + # when resolution is smaller + if 'sample_interval' in X and X['sample_interval'] > 1.: + self.fill_lower_resolution_seq = True + self.fill_kwargs = {'loader_sample_interval': X['sample_interval']} + return super().fit(X, y) def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: in_features = input_shape[-1] * self.window_size - return _TimeSeriesMLP(self.window_size, self._build_backbone(in_features)) + return _TimeSeriesMLP(window_size=self.window_size, + mlp_layers=self._build_backbone(in_features), + fill_lower_resolution_seq=self.fill_lower_resolution_seq, + fill_kwargs=self.fill_kwargs) def _add_layer(self, layers: List[nn.Module], in_features: int, out_features: int, layer_id: int) -> None: @@ -138,10 +161,10 @@ def get_hyperparameter_search_space( ), ) -> ConfigurationSpace: cs = MLPBackbone.get_hyperparameter_search_space(dataset_properties=dataset_properties, - num_groups=num_groups, - activation=activation, - use_dropout=use_dropout, - num_units=num_units, - dropout=dropout) + num_groups=num_groups, + activation=activation, + use_dropout=use_dropout, + num_units=num_units, + dropout=dropout) add_hyperparameter(cs, normalization, CategoricalHyperparameter) return cs diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py 
b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index a557c05f6..952d87e2c 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -24,8 +24,6 @@ ) from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ - TimeSeriesTransformer class TimeSeriesSampler(SubsetRandomSampler): @@ -83,6 +81,9 @@ def __iter__(self): idx_samples_start = 0 idx_seq_tracker = 0 for idx_seq, (interval, seq_length) in enumerate(zip(self.seq_intervals, self.seq_lengths)): + if len(interval) == 1: + continue + num_samples = len(interval) - 1 idx_samples_end = idx_samples_start + num_samples @@ -207,7 +208,12 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: A instance of self """ X["window_size"] = self.window_size + # this value corresponds to budget type resolution sample_interval = X.get('sample_interval', 1) + # this value corresponds to budget type num_sequence + fraction_seq = X.get('fraction_seq', 1.0) + # this value corresponds to budget type num_sample_per_seq + fraction_samples_per_seq = X.get('fraction_samples_per_seq', 1.0) self.sample_interval = sample_interval self.padding_value = X.get('required_padding_value', 0.0) @@ -251,18 +257,23 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: num_instances_dataset = np.size(train_split) num_instances_train = self.num_batches_per_epoch * self.batch_size - if num_instances_train > np.sum(train_split): - num_instances_per_seqs = None - else: - # get the length of each sequence of training data (after split) - # as we already know that the elements in 'train_split' increases consecutively with a certain number of - # discontinuity where a new sequence is sampled: [0, 1, 2 ,3, 7 ,8 ]. - # A new sequence must start from the index 7. We could then split each unique values to represent the length - # of each split - _, seq_train_length = np.unique(train_split - np.arange(len(train_split)), return_counts=True) - num_instances_per_seqs = np.ceil(num_instances_train / num_instances_dataset * seq_train_length) - num_instances_per_seqs = num_instances_per_seqs.astype(seq_train_length.dtype) - # at least one element of each sequence should be selected + if num_instances_train > num_instances_dataset: + num_instances_train = num_instances_dataset + + # get the length of each sequence of training data (after split) + # as we already know that the elements in 'train_split' increases consecutively with a certain number of + # discontinuity where a new sequence is sampled: [0, 1, 2 ,3, 7 ,8 ]. + # A new sequence must start from the index 7. 
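# For example (illustrative numbers only): train_split = [0, 1, 2, 3, 7, 8] minus np.arange(6)
# gives [0, 0, 0, 0, 3, 3], whose value counts [4, 2] are exactly the lengths of the two
# training sub-sequences.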
We could then split each unique values to represent the length + # of each split + _, seq_train_length = np.unique(train_split - np.arange(len(train_split)), return_counts=True) + # create masks for masking + seq_idx_inactivate = np.where(self.random_state.rand(seq_train_length.size) > fraction_seq) + seq_train_length[seq_idx_inactivate] = 0 + # this budget will reduce the number of samples inside each sequence, e.g., the samples becomes more sparse + num_instances_per_seqs = np.round(np.ceil(num_instances_train / num_instances_dataset * seq_train_length) * + fraction_samples_per_seq) + num_instances_per_seqs = num_instances_per_seqs.astype(seq_train_length.dtype) + # at least one element of each sequence should be selected # TODO consider the case where num_instances_train is greater than num_instances_dataset, # In which case we simply iterate through all the datasets diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index ed77337ea..16ddf65a7 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -245,6 +245,13 @@ def prepare_trainer(self, X): step_interval=X['step_interval'], ) + def get_budget_tracker(self, X): + return BudgetTracker( + budget_type=X['budget_type'], + max_runtime=X['runtime'] if 'runtime' in X else None, + max_epochs=X['epochs'] if 'epochs' in X else None, + ) + def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoice': """ Fits a component by using an input dictionary with pre-requisites @@ -272,11 +279,7 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic if X["torch_num_threads"] > 0: torch.set_num_threads(X["torch_num_threads"]) - self.budget_tracker = BudgetTracker( - budget_type=X['budget_type'], - max_runtime=X['runtime'] if 'runtime' in X else None, - max_epochs=X['epochs'] if 'epochs' in X else None, - ) + self.budget_tracker = self.get_budget_tracker(X) self.prepare_trainer(X) total_parameter_count, trainable_parameter_count = self.count_parameters(X['network']) diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index 2225b879c..67283bc9d 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -14,6 +14,8 @@ autoPyTorchComponent, find_components, ) +from autoPyTorch.pipeline.components.training.trainer.base_trainer import BudgetTracker + from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary from autoPyTorch.pipeline.components.training.losses import get_loss from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics @@ -21,6 +23,7 @@ base_target_scaler import BaseTargetScaler from autoPyTorch.utils.common import get_device_from_fit_dictionary +from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE trainer_directory = os.path.split(__file__)[0] _trainers = find_components(__package__, @@ -44,6 +47,19 @@ def _fit_requirements(self) -> Optional[List[FitRequirement]]: user_defined=False, dataset_property=False)) return fit_requirements + def get_budget_tracker(self, X): + if 'epochs' in X: + max_epochs = X['max_epochs'] + elif X['budget_type'] in FORECASTING_BUDGET_TYPE: + max_epochs = 100 + else: + max_epochs = None + 
return BudgetTracker( + budget_type=X['budget_type'], + max_runtime=X['runtime'] if 'runtime' in X else None, + max_epochs=max_epochs, + ) + def prepare_trainer(self, X): # Support additional user metrics metrics = get_metrics(dataset_properties=X['dataset_properties']) From 52dcfb5e9ce4f32883b5bbc3ba451c7e04b3b648 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 19 Dec 2021 11:25:26 +0100 Subject: [PATCH 095/347] maint --- .../components/training/trainer/forecasting_trainer/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index 67283bc9d..ebf2bd18d 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -49,7 +49,7 @@ def _fit_requirements(self) -> Optional[List[FitRequirement]]: def get_budget_tracker(self, X): if 'epochs' in X: - max_epochs = X['max_epochs'] + max_epochs = X['epochs'] elif X['budget_type'] in FORECASTING_BUDGET_TYPE: max_epochs = 100 else: From 1154a23897f70ae29155d7d988d7fcb1872825b8 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 19 Dec 2021 23:56:04 +0100 Subject: [PATCH 096/347] maint --- autoPyTorch/api/time_series_forecasting.py | 6 +- autoPyTorch/evaluation/abstract_evaluator.py | 2 +- .../time_series_forecasting_data_loader.py | 66 ++++++++++++++++--- 3 files changed, 60 insertions(+), 14 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 7655d9711..5c1c71829 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -290,12 +290,12 @@ def search( if self.search_space_updates is None: self.search_space_updates = HyperparameterSearchSpaceUpdates() - window_size_scales = [1, 2] + window_size_scales = [1, 3] self.search_space_updates.append(node_name="data_loader", hyperparameter="window_size", - value_range=[window_size_scales[0] * base_window_size, - window_size_scales[1] * base_window_size], + value_range=[int(window_size_scales[0] * base_window_size), + int(window_size_scales[1] * base_window_size)], default_value=int(np.ceil(1.25 * base_window_size)), ) diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 52e011593..9a598626a 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -638,7 +638,7 @@ def _init_fit_dictionary( self.fit_dictionary.pop('epochs', None) self.fit_dictionary.pop('runtime', None) elif self.budget_type == 'num_seq': - self.fit_dictionary['sample_interval'] = self.budget + self.fit_dictionary['fraction_seq'] = self.budget self.fit_dictionary.pop('epochs', None) self.fit_dictionary.pop('runtime', None) elif self.budget_type == 'num_sample_per_seq': diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 952d87e2c..72d5cef68 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -3,9 +3,8 @@ from torch.utils.data.sampler import SubsetRandomSampler from ConfigSpace.configuration_space import 
ConfigurationSpace -from ConfigSpace.hyperparameters import ( - UniformIntegerHyperparameter, Constant -) +from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, CategoricalHyperparameter +from ConfigSpace.conditions import EqualsCondition import numpy as np @@ -20,7 +19,8 @@ from autoPyTorch.utils.common import ( HyperparameterSearchSpace, custom_collate_fn, - add_hyperparameter + add_hyperparameter, + get_hyperparameter ) from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader @@ -60,9 +60,6 @@ def __init__(self, if len(seq_lengths) != len(num_instances_per_seqs): raise ValueError(f'the lengths of seq_lengths must equal the lengths of num_instances_per_seqs.' f'However, they are {len(seq_lengths)} versus {len(num_instances_per_seqs)}') - if np.sum(seq_lengths) != len(indices): - raise ValueError(f'the sum of sequence length must correspond to the number of indices. ' - f'However, they are {np.sum(seq_lengths)} versus {len(indices)}') seq_intervals = [] idx_tracker = 0 for seq_idx, (num_instances, seq_length) in enumerate(zip(num_instances_per_seqs, seq_lengths)): @@ -146,6 +143,7 @@ def __init__(self, sample_interval: int = 1, window_size: int = 1, subseq_length def __call__(self, data: np.ndarray) -> np.ndarray: sample_indices = np.arange(self.first_indices, 0, step=self.sample_interval) + if sample_indices[0] < -1 * len(data): # we need to pad with 0 valid_indices = sample_indices[np.where(sample_indices >= -len(data))[0]] @@ -171,6 +169,8 @@ class TimeSeriesForecastingDataLoader(FeatureDataLoader): def __init__(self, batch_size: int = 64, + backcast: bool = False, + backcast_period: int = 2, window_size: int = 1, num_batches_per_epoch: Optional[int] = 50, n_prediction_steps: int = 1, @@ -186,7 +186,12 @@ def __init__(self, n_prediction_steps: how many steps to predict in advance """ super().__init__(batch_size=batch_size, random_state=random_state) - self.window_size: int = window_size + self.backcast = backcast + self.backcast_period = backcast_period + if not backcast: + self.window_size: int = window_size + else: + self.window_size: int = backcast_period * n_prediction_steps self.n_prediction_steps = n_prediction_steps self.sample_interval = 1 # length of the tail, for instance if a sequence_length = 2, sample_interval =2, n_prediction = 2, @@ -435,12 +440,53 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, num_batch_per_epoch: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_batches_per_epoch", value_range=(30, 200), - default_value=100) + default_value=100), + backcast: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='backcast', + value_range=(True, False), + default_value=False), + backcast_period: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='backcast_period', + value_range=(2, 7), + default_value=2) ) -> ConfigurationSpace: + """ + hyperparameter search space for forecasting dataloader. 
Forecasting dataloader construct the window size in two + ways: either window_size is directly assigned or it is computed by backcast_period * n_prediction_steps + (introduced by nbeats: + Oreshkin et al., N-BEATS: Neural basis expansion analysis for interpretable time series forecasting, ICLR 2020 + https://arxiv.org/abs/1905.10437) + Currently back_cast_period is only activate when back_cast is activate + TODO ablation study on whether this technique can be applied to other models + Args: + dataset_properties (Optional[Dict]): dataset properties + batch_size (int): batch size + window_size (int): window size, (if activate) this value directly determines the window_size of the + data loader + num_batch_per_epoch (int): how many batches are trained at each iteration + backcast (bool): if back_cast module is activate (in which case window size is a + multiple of n_prediction_steps) + backcast_period (int): activate if backcast is activate, the window size is then computed with + backcast_period * n_prediction_steps + + Returns: + cs: Configuration Space + + """ cs = ConfigurationSpace() add_hyperparameter(cs, batch_size, UniformIntegerHyperparameter) - add_hyperparameter(cs, window_size, UniformIntegerHyperparameter) add_hyperparameter(cs, num_batch_per_epoch, UniformIntegerHyperparameter) + + window_size = get_hyperparameter(window_size, UniformIntegerHyperparameter) + backcast = get_hyperparameter(backcast, CategoricalHyperparameter) + backcast_period = get_hyperparameter(backcast_period, UniformIntegerHyperparameter) + + cs.add_hyperparameters([window_size, backcast, backcast_period]) + + window_size_cond = EqualsCondition(window_size, backcast, False) + backcast_period_cond = EqualsCondition(backcast_period, backcast, True) + cs.add_conditions([window_size_cond, backcast_period_cond]) + return cs def __str__(self) -> str: From 9df8921f6d3d50f739c675085a9157765ddbff42 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 20 Dec 2021 23:42:59 +0100 Subject: [PATCH 097/347] NBEATS --- .../RegressionLoss.py | 4 +- .../setup/network/forecasting_network.py | 31 +- .../forecasting_decoder/MLPDecoder.py | 7 - .../forecasting_decoder/NBEATSDecoder.py | 319 ++++++++++++++++++ .../forecasting_decoder/RNNDecoder.py | 8 +- .../base_forecasting_decoder.py | 18 +- .../InceptionTimeEncoder.py | 6 - .../forecasting_encoder/MLPEncoder.py | 30 +- .../forecasting_encoder/NBEATSEncoder.py | 74 ++++ .../forecasting_encoder/RNNEncoder.py | 8 +- .../forecasting_encoder/TCNEncoder.py | 9 - .../forecasting_encoder/__init__.py | 2 +- .../base_forecasting_encoder.py | 8 +- .../forecasting_network_head/NBEATS_head.py | 122 +++++++ .../forecasting_network_head/__init__.py | 8 - .../forecasting_head.py | 40 ++- .../time_series_forecasting_data_loader.py | 3 +- .../trainer/forecasting_trainer/__init__.py | 1 + .../forecasting_base_trainer.py | 43 ++- .../pipeline/time_series_forecasting.py | 31 ++ 20 files changed, 673 insertions(+), 99 deletions(-) create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/NBEATSDecoder.py create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/NBEATSEncoder.py create mode 100644 autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py index 4043612e9..c9fba041e 100644 --- 
a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py @@ -14,14 +14,14 @@ from autoPyTorch.pipeline.components.training.losses import L1Loss, MSELoss -class RegressionLosses(ForecastingLossComponents): +class RegressionLoss(ForecastingLossComponents): required_net_out_put_type = 'regression' def __init__(self, loss_name: str, random_state: Optional[np.random.RandomState] = None, ): - super(RegressionLosses, self).__init__() + super(RegressionLoss, self).__init__() if loss_name == "l1": self.loss = L1Loss elif loss_name == 'mse': diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index ceb089d2e..44e54be1f 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -28,7 +28,7 @@ def __init__(self, network_embedding: nn.Module, # TODO consider embedding for past, future and static features network_encoder: EncoderNetwork, network_decoder: nn.Module, - network_head: nn.Module, + network_head: Optional[nn.Module], n_prediction_steps: int, encoder_properties: Dict, decoder_properties: Dict, @@ -308,6 +308,31 @@ def pred_from_net_output(self, net_output: torch.Tensor): raise ValueError(f'Unknown aggregation: {self.aggregation}') +class NBEATSNet(ForecastingNet): + future_target_required = False + def forward(self, + targets_past: torch.Tensor, + targets_future: Optional[torch.Tensor] = None, + features_past: Optional[torch.Tensor] = None, + features_future: Optional[torch.Tensor] = None, + features_static: Optional[torch.Tensor] = None, + hidden_states: Optional[Tuple[torch.Tensor]] = None): + forecast = torch.zeros_like(targets_future).view(targets_future.shape[0], -1) + backcast = self.encoder(targets_past) + for block in self.decoder: + backcast_block, forecast_block = block(backcast) + + backcast = backcast - backcast_block + forecast = forecast + forecast_block + if self.training: + return backcast, forecast + else: + return forecast + + def pred_from_net_output(self, net_output: torch.Tensor): + return net_output + + class ForecastingNetworkComponent(NetworkComponent): def __init__( self, @@ -331,7 +356,7 @@ def _required_fit_requirements(self): FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_encoder", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_decoder", (torch.nn.Module,), user_defined=False, dataset_property=False), - FitRequirement("network_head", (torch.nn.Module,), user_defined=False, dataset_property=False), + FitRequirement("network_head", (Optional[torch.nn.Module],), user_defined=False, dataset_property=False), FitRequirement("required_net_out_put_type", (str,), user_defined=False, dataset_property=False), FitRequirement("encoder_properties", (Dict,), user_defined=False, dataset_property=False), FitRequirement("decoder_properties", (Dict,), user_defined=False, dataset_property=False), @@ -362,6 +387,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: if X['decoder_properties']['has_hidden_states']: # decoder is RNN self.network = ForecastingSeq2SeqNet(**network_init_kwargs) + elif X['decoder_properties']['multi_blocks']: + self.network = NBEATSNet(**network_init_kwargs) elif X['auto_regressive']: # decoder is 
MLP and auto_regressive, we have deep AR model self.network = ForecastingDeepARNet(**network_init_kwargs) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py index d4b0a2c05..e20764437 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py @@ -16,13 +16,6 @@ class ForecastingMLPHeader(BaseForecastingDecoder): - @property - def decoder_properties(self): - decoder_properties = {'has_hidden_states': False, - 'recurrent': False, - } - return decoder_properties - def _build_decoder(self, input_shape: Tuple[int, ...], n_prediction_heads: int, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/NBEATSDecoder.py new file mode 100644 index 000000000..8a5445673 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/NBEATSDecoder.py @@ -0,0 +1,319 @@ +from typing import Any, Dict, List, Optional, Union, Tuple + +from ConfigSpace import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter, \ + UniformFloatHyperparameter +from ConfigSpace.conditions import GreaterThanCondition, InCondition, EqualsCondition, AndConjunction +from ConfigSpace.forbidden import ForbiddenEqualsClause, ForbiddenAndConjunction + +from typing import Dict, Optional, Tuple, Union, Any + +from torch import nn + + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.network_head.utils import _activations +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_decoder.base_forecasting_decoder import \ + BaseForecastingDecoder + +# TODO we need to rewrite NBEATS part to make it neater!!! 
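# Illustrative sketch of how the blocks built below are later consumed by NBEATSNet
# (forecasting_network.py), i.e. the usual doubly residual N-BEATS recursion
# (variable names here are for illustration only):
#
#     backcast = encoder(targets_past)                  # flattened lookback window
#     forecast = 0
#     for block in blocks:
#         backcast_block, forecast_block = block(backcast)
#         backcast = backcast - backcast_block          # subtract what this block already explained
#         forecast = forecast + forecast_block          # accumulate the partial forecasts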
+ + +class NBEATSBLock(nn.Module): + def __init__(self, + n_in_features: int, + stack_idx: int, + config: Dict, + ): + super().__init__() + self.n_in_features = n_in_features + self.stack_idx = stack_idx + + self.weight_sharing = config['weight_sharing_%d' % self.stack_idx] + self.num_blocks = config['num_blocks_%d' % self.stack_idx] + self.stack_type = config['stack_type_%d' % self.stack_idx] + if self.stack_type == 'generic': + self.expansion_coefficient_length = config['expansion_coefficient_length_generic_%d' % self.stack_idx] + else: + self.expansion_coefficient_length = config['expansion_coefficient_length_interpretable_%d' % self.stack_idx] + + self.num_layers = config['num_layers_%d' % self.stack_idx] + self.width = config['width_%d' % self.stack_idx] + self.normalization = config['normalization'] + self.activation = config['activation'] + self.use_dropout = config['use_dropout'] + self.dropout_rate = config.get('dropout_%d' % self.stack_idx, None) + + self.backbone = nn.Sequential(*self.build_backbone()) + + self.backcast_head = None + self.forecast_head = None + + def build_backbone(self): + layers: List[nn.Module] = list() + for _ in range(self.num_layers): + self._add_layer(layers, self.n_in_features) + return layers + + def _add_layer(self, layers: List[nn.Module], in_features: int) -> None: + layers.append(nn.Linear(in_features, self.width)) + if self.normalization == 'BN': + layers.append(nn.BatchNorm1d(self.width)) + elif self.normalization == 'LN': + layers.append(nn.LayerNorm(self.width)) + layers.append(_activations[self.activation]()) + if self.use_dropout: + layers.append(nn.Dropout(self.dropout_rate)) + + def forward(self, x): + if self.backcast_head is None and self.forecast_head is None: + # used to compute head dimensions + return self.backbone(x) + else: + x = self.backbone(x) + forecast = self.forecast_head(x) + backcast = self.backcast_head(x) + return backcast, forecast + + +class NBEATSDecoder(BaseForecastingDecoder): + _fixed_seq_length = True + window_size = 1 + fill_lower_resolution_seq = False + fill_kwargs = {} + + def decoder_properties(self): + decoder_properties = super().decoder_properties() + decoder_properties.update({ + 'multi_blocks': True + }) + return decoder_properties + + def _build_decoder(self, input_shape: Tuple[int, ...], n_prediction_heads: int, + dataset_properties:Dict) -> Tuple[nn.Module, int]: + in_features = input_shape[-1] + stacks = [[] for _ in range(self.config['num_stacks'])] + for stack_idx in range(1, self.config['num_stacks'] + 1): + for block_idx in range(self.config['num_blocks_%d' % stack_idx]): + if self.config['weight_sharing_%d' % stack_idx] and block_idx > 0: + # for weight sharing, we only create one instance + break + stacks[stack_idx - 1].append(NBEATSBLock(in_features, stack_idx=stack_idx, config=self.config)) + return stacks, stacks[-1][-1].width + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'NBEATSDecoder', + 'name': 'NBEATSDecoder', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({'backcast_loss_ratio': self.config['backcast_loss_ratio']}) + return super().transform(X) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + num_stacks: HyperparameterSearchSpace = HyperparameterSearchSpace( + 
hyperparameter="num_stacks", + value_range=(1, 4), + default_value=2 + ), + num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'num_blocks', + value_range=(1, 5), + default_value=3 + ), + num_layers: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'num_layers', + value_range=(1, 5), + default_value=3 + ), + width: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'width', + value_range=(256, 2048), + default_value=512, + log=True + ), + weight_sharing: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'weight_sharing', + value_range=(True, False), + default_value=False, + ), + stack_type: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'stack_type', + value_range=('generic', 'seasonality', 'trend'), + default_value='generic'), + expansion_coefficient_length_generic: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'expansion_coefficient_length_generic', + value_range=(1, 4), + default_value=3, + ), + expansion_coefficient_length_interpretable: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'expansion_coefficient_length_interpretable', + value_range=(16, 64), + default_value=32, + log=True + ), + activation: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="activation", + value_range=tuple(_activations.keys()), + default_value=list(_activations.keys())[0], + ), + use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="use_dropout", + value_range=(True, False), + default_value=False, + ), + normalization: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="normalization", + value_range=('BN', 'LN', 'NoNorm'), + default_value='BN' + ), + dropout: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="dropout", + value_range=(0, 0.8), + default_value=0.5, + ), + backcast_loss_ratio: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="backcast_loss_ratio", + value_range=(0., 1.), + default_value=1., + ) + ) -> ConfigurationSpace: + """ + Configuration for N-BEATS. 
The network is composed of several stacks, each stack is composed of several block, + we follow the implementation from N-BEATS: blocks are only composed of fully-connected layers with the same + width + The design of the configuration space follows pytorch-forecasting: + https://github.com/jdb78/pytorch-forecasting/tree/master/pytorch_forecasting/models/nbeats + Args: + dataset_properties: + num_stacks: number of stacks + num_blocks: number of blocks per stack + num_layers: number of fc layers per block, this value is the same across all the blocks within one stack + width: fc layer width, this value is the same across all the blocks within one stack + weight_sharing: if weights are shared inside one block + stack_type: stack type, used to define the final output + expansion_coefficient_length_generic: expansion_coefficient_length, activate if stack_type is 'generic' + expansion_coefficient_length_interpretable: expansion_coefficient_length, activate if stack_type is 'trend' + or 'seasonality' (in this case n_dim is expansion_coefficient_length_interpretable * n_prediciton_steps) + the expansion coefficient) or trend (in this case, it corresponds to the degree of the polynomial) + activation: activation function across fc layers + use_dropout: if dropout is applied + normalization: if normalization is applied + dropout: dropout value, if use_dropout is set as True + backcast_loss_ratio: weight of backcast in comparison to forecast when calculating the loss. + A weight of 1.0 means that forecast and backcast loss is weighted the same (regardless of backcast and + forecast lengths). Defaults to 0.0, i.e. no weight. + Returns: + Configuration Space + """ + + cs = ConfigurationSpace() + min_num_stacks, max_num_stacks = num_stacks.value_range + + num_stacks = get_hyperparameter(num_stacks, UniformIntegerHyperparameter) + + add_hyperparameter(cs, activation, CategoricalHyperparameter) + add_hyperparameter(cs, normalization, CategoricalHyperparameter) + add_hyperparameter(cs, backcast_loss_ratio, UniformFloatHyperparameter) + + # We can have dropout in the network for + # better generalization + use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) + cs.add_hyperparameters([num_stacks, use_dropout]) + + for stack_idx in range(1, int(max_num_stacks) + 1): + num_blocks_search_space = HyperparameterSearchSpace(hyperparameter='num_blocks_%d' % stack_idx, + value_range=num_blocks.value_range, + default_value=num_blocks.default_value, + log=num_blocks.log) + num_layers_search_space = HyperparameterSearchSpace(hyperparameter='num_layers_%d' % stack_idx, + value_range=num_layers.value_range, + default_value=num_layers.default_value, + log=num_layers.log) + width_search_space = HyperparameterSearchSpace(hyperparameter='width_%d' % stack_idx, + value_range=width.value_range, + default_value=width.default_value, + log=width.log) + weight_sharing_search_sapce = HyperparameterSearchSpace(hyperparameter='weight_sharing_%d' % stack_idx, + value_range=weight_sharing.value_range, + default_value=weight_sharing.default_value, + log=weight_sharing.log) + stack_type_search_space = HyperparameterSearchSpace(hyperparameter='stack_type_%d' % stack_idx, + value_range=stack_type.value_range, + default_value=stack_type.default_value, + log=stack_type.log) + expansion_coefficient_length_generic_search_space = HyperparameterSearchSpace( + hyperparameter='expansion_coefficient_length_generic_%d' % stack_idx, + value_range=expansion_coefficient_length_generic.value_range, + 
default_value=expansion_coefficient_length_generic.default_value, + log=expansion_coefficient_length_generic.log + ) + expansion_coefficient_length_interpretable_search_space = HyperparameterSearchSpace( + hyperparameter='expansion_coefficient_length_interpretable_%d' % stack_idx, + value_range=expansion_coefficient_length_interpretable.value_range, + default_value=expansion_coefficient_length_interpretable.default_value, + log=expansion_coefficient_length_interpretable.log + ) + + num_blocks_hp = get_hyperparameter(num_blocks_search_space, UniformIntegerHyperparameter) + num_layers_hp = get_hyperparameter(num_layers_search_space, UniformIntegerHyperparameter) + width_hp = get_hyperparameter(width_search_space, UniformIntegerHyperparameter) + weight_sharing_hp = get_hyperparameter(weight_sharing_search_sapce, CategoricalHyperparameter) + stack_type_hp = get_hyperparameter(stack_type_search_space, CategoricalHyperparameter) + + expansion_coefficient_length_generic_hp = get_hyperparameter( + expansion_coefficient_length_generic_search_space, + UniformIntegerHyperparameter + ) + expansion_coefficient_length_interpretable_hp = get_hyperparameter( + expansion_coefficient_length_interpretable_search_space, + UniformIntegerHyperparameter + ) + + hps = [num_blocks_hp, num_layers_hp, width_hp, stack_type_hp, weight_sharing_hp] + cs.add_hyperparameters([*hps, expansion_coefficient_length_generic_hp, + expansion_coefficient_length_interpretable_hp]) + + if stack_idx > int(min_num_stacks): + # The units of layer i should only exist + # if there are at least i layers + for hp in hps: + cs.add_condition(GreaterThanCondition(hp, num_stacks, stack_idx - 1)) + cond_ecl_generic = AndConjunction( + GreaterThanCondition(expansion_coefficient_length_generic_hp, num_stacks, stack_idx -1), + EqualsCondition(expansion_coefficient_length_generic_hp, stack_type_hp, 'generic') + ) + cond_ecl_interpretable = AndConjunction( + GreaterThanCondition(expansion_coefficient_length_interpretable_hp, num_stacks, stack_idx - 1), + InCondition(expansion_coefficient_length_interpretable_hp, stack_type_hp, ('seasonality', 'trend')) + ) + cs.add_conditions([cond_ecl_generic, cond_ecl_interpretable]) + + dropout_search_space = HyperparameterSearchSpace(hyperparameter='dropout_%d' % stack_idx, + value_range=dropout.value_range, + default_value=dropout.default_value, + log=dropout.log) + + dropout_hp = get_hyperparameter(dropout_search_space, UniformFloatHyperparameter) + cs.add_hyperparameter(dropout_hp) + + dropout_condition_1 = EqualsCondition(dropout_hp, use_dropout, True) + + if stack_idx > int(min_num_stacks): + dropout_condition_2 = GreaterThanCondition(dropout_hp, num_stacks, stack_idx - 1) + cs.add_condition(AndConjunction(dropout_condition_1, dropout_condition_2)) + else: + cs.add_condition(dropout_condition_1) + + + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py index 6344ac63e..8dc3fb39c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py @@ -83,11 +83,11 @@ def _build_decoder(self, ) return decoder, hidden_size - @property def decoder_properties(self): - decoder_properties = {'has_hidden_states': True, - 'recurrent': True, - } + decoder_properties = super().decoder_properties() + 
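# e.g. for this RNN decoder the merged properties become (sketch, derived from the base defaults):
# {'has_hidden_states': True, 'recurrent': True, 'multi_blocks': False}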
decoder_properties.update({'has_hidden_states': True, + 'recurrent': True, + }) return decoder_properties def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py index 7535dc7d3..79b1a6ce6 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -1,14 +1,10 @@ from abc import abstractmethod, ABC from typing import Any, Dict, Iterable, Tuple, List, Optional -import numpy as np -import torch from torch import nn from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import \ - ALL_DISTRIBUTIONS, ProjectionLayer from autoPyTorch.pipeline.components.base_component import BaseEstimator, autoPyTorchComponent @@ -40,14 +36,12 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), ] - @property def decoder_properties(self): - decoder_property = {'additional_output': False, - 'additional_input': False, - 'fixed_input_seq_length': False, - 'recurrent': False, - } - return decoder_property + decoder_properties = {'has_hidden_states': False, + 'recurrent': False, + 'multi_blocks': False, + } + return decoder_properties def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: """ @@ -99,7 +93,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: (Dict[str, Any]): the updated 'X' dictionary """ - X.update({'decoder_properties': self.decoder_properties, + X.update({'decoder_properties': self.decoder_properties(), 'network_decoder': self.decoder, 'n_prediction_heads': self.n_prediction_heads, 'n_decoder_output_features': self.n_decoder_output_features}) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py index c6324200a..6cf1a0310 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py @@ -130,12 +130,6 @@ class InceptionTimeEncoder(BaseForecastingEncoder): """ InceptionTime backbone for time series data (see https://arxiv.org/pdf/1909.04939.pdf). 
""" - @property - def encoder_properties(self): - encoder_properties = {'has_hidden_states': False, - 'bijective_seq_output': True, - 'fixed_input_seq_length': False} - return encoder_properties def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: encoder = _InceptionTime(in_features=input_shape[-1], diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py index f55f3e780..847251238 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py @@ -16,16 +16,22 @@ from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter -class _TimeSeriesMLP(EncoderNetwork): +class TimeSeriesMLPrecpocessor(EncoderNetwork): def __init__(self, window_size: int, - mlp_layers: nn.Module, fill_lower_resolution_seq: bool = False, fill_kwargs: Dict = {}, ): + """ + Transform the input features (B, T, N) to fit the requirement of MLP + Args: + window_size (int): T + fill_lower_resolution_seq: if sequence with lower resolution needs to be filled with 0 + (for multi-fidelity problems with resolution as fidelity) + fill_kwargs: filling information + """ super().__init__() self.window_size = window_size - self.mlp_layers = mlp_layers self.fill_lower_resolution_seq = fill_lower_resolution_seq self.fill_interval = fill_kwargs.get('loader_sample_interval', 1) @@ -52,7 +58,6 @@ def forward(self, x: torch.Tensor, output_seq: bool = False): # we need to ensure that the input size fits the network shape x = x[:, -self.window_size:] # x.shape = (B, self.window, N) if self.fill_lower_resolution_seq and x.shape[1] < self.window_size: - x = F.conv_transpose1d(x.transpose(1, 2), F.pad(torch.ones((1, 1, 1)), (1, 1)), stride=self.fill_interval, @@ -60,7 +65,7 @@ def forward(self, x: torch.Tensor, output_seq: bool = False): if x.shape[1] < self.window_size: x = torch.cat([torch.zeros(x.shape[0], self.window_size - x.shape[1], x.shape[2]), x], dim=1) x = x.flatten(-2) - return self.mlp_layers(x) + return x class MLPEncoder(BaseForecastingEncoder, MLPBackbone): @@ -69,13 +74,12 @@ class MLPEncoder(BaseForecastingEncoder, MLPBackbone): fill_lower_resolution_seq = False fill_kwargs = {} - @property def encoder_properties(self): - encoder_properties = { - 'has_hidden_states': False, + encoder_properties = super().encoder_properties() + encoder_properties.update({ 'bijective_seq_output': False, 'fixed_input_seq_length': True, - } + }) return encoder_properties @property @@ -95,10 +99,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: in_features = input_shape[-1] * self.window_size - return _TimeSeriesMLP(window_size=self.window_size, - mlp_layers=self._build_backbone(in_features), - fill_lower_resolution_seq=self.fill_lower_resolution_seq, - fill_kwargs=self.fill_kwargs) + feature_preprocessor = TimeSeriesMLPrecpocessor(window_size=self.window_size, + fill_lower_resolution_seq=self.fill_lower_resolution_seq, + fill_kwargs=self.fill_kwargs) + return nn.Sequential(feature_preprocessor, *self._build_backbone(in_features)) def _add_layer(self, layers: List[nn.Module], in_features: int, out_features: int, layer_id: int) -> None: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/NBEATSEncoder.py 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/NBEATSEncoder.py new file mode 100644 index 000000000..3927deaa5 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/NBEATSEncoder.py @@ -0,0 +1,74 @@ +from typing import Any, Dict, List, Optional, Union, Tuple + +from torch import nn + +from ConfigSpace import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter, \ + UniformFloatHyperparameter +from ConfigSpace.conditions import GreaterThanCondition, InCondition, EqualsCondition, AndConjunction + +from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_encoder \ + import BaseForecastingEncoder, EncoderNetwork +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.network_backbone.utils import _activations +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.MLPEncoder import \ + TimeSeriesMLPrecpocessor + + +class NBEATSEncoder(BaseForecastingEncoder): + """ + Encoder for NBEATS-like network. It flatten the input sequence to fit the requirement of MLP, the main part is + implemented under decoder + """ + _fixed_seq_length = True + window_size = 1 + fill_lower_resolution_seq = False + fill_kwargs = {} + + def encoder_properties(self): + encoder_properties = super().encoder_properties() + encoder_properties.update({ + 'fixed_input_seq_length': True, + }) + return encoder_properties + + @property + def _required_fit_arguments(self) -> List[FitRequirement]: + requirements_list = super()._required_fit_arguments + requirements_list.append(FitRequirement('window_size', (int,), user_defined=False, dataset_property=False)) + return requirements_list + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + self.window_size = X["window_size"] + # when resolution is smaller + if 'sample_interval' in X and X['sample_interval'] > 1.: + self.fill_lower_resolution_seq = True + self.fill_kwargs = {'loader_sample_interval': X['sample_interval']} + return super().fit(X, y) + + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: + preprocessor = TimeSeriesMLPrecpocessor(window_size=self.window_size, + fill_lower_resolution_seq=self.fill_lower_resolution_seq, + fill_kwargs=self.fill_kwargs + ) + return preprocessor + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'NBEATSEncoder', + 'name': 'NBEATSEncoder', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> ConfigurationSpace: + cs = ConfigurationSpace() + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py index adb366b6c..d71662415 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py +++ 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py @@ -76,12 +76,10 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: config=self.config) return encoder - @property def encoder_properties(self): - encoder_properties = {'has_hidden_states': True, - 'bijective_seq_output': True, - 'fixed_input_seq_length': False - } + encoder_properties = super().encoder_properties() + encoder_properties.update({'has_hidden_states': True, + }) return encoder_properties def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py index 3259e97a8..50f28b111 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py @@ -104,15 +104,6 @@ class TCNEncoder(BaseForecastingEncoder): """ Temporal Convolutional Network backbone for time series data (see https://arxiv.org/pdf/1803.01271.pdf). """ - - @property - def encoder_properties(self): - encoder_properties = {'has_hidden_states': False, - 'bijective_seq_output': True, - 'fixed_input_seq_length': False - } - return encoder_properties - def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: num_channels = [self.config["num_filters_0"]] for i in range(1, self.config["num_blocks"]): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/__init__.py index ed2d0e8d0..c6d10b546 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/__init__.py @@ -48,4 +48,4 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: @property def _defaults_network(self): - return ['RNNBackbone', 'TSMLPBackbone'] \ No newline at end of file + return ['RNNBackbone', 'RNNPBackbone'] \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py index 714bc7e77..128cef415 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -91,7 +91,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X['dataset_properties'].update({'input_shape': self.input_shape}) X.update({'network_encoder': self.encoder}) - X.update({'encoder_properties': self.encoder_properties}) + X.update({'encoder_properties': self.encoder_properties()}) return X @abstractmethod @@ -107,7 +107,6 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: """ raise NotImplementedError() - @property def encoder_properties(self): """ Encoder properties, this determines how the data flows over the forecasting networks @@ -118,10 +117,9 @@ def encoder_properties(self): sequence when output_seq is set True fix_input_shape if the input shape is fixed, this is useful for building network head """ - # TODO make use of bijective_seq_output in trainer!!! 
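# Sketch of how these properties travel through the fit dictionary (names as used in this pipeline):
#     X['encoder_properties'] = encoder.encoder_properties()   # set in transform() above
#     X['decoder_properties'] = decoder.decoder_properties()
# forecasting_network.py then dispatches on them, e.g. 'has_hidden_states' selects a seq2seq-style
# network and 'multi_blocks' an N-BEATS-style network; otherwise a plain or DeepAR-style network is built.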
encoder_properties = {'has_hidden_states': False, - 'bijective_seq_output': False, - 'fixed_input_seq_length': False + 'bijective_seq_output': True, + 'fixed_input_seq_length': False, } return encoder_properties diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py new file mode 100644 index 000000000..a41a542ac --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py @@ -0,0 +1,122 @@ +# This part of implementation follows pytorch-forecasting: +# https://github.com/jdb78/pytorch-forecasting/blob/master/pytorch_forecasting/models/nbeats/sub_modules.py + +import torch +from typing import Tuple, List +import numpy as np +from torch import nn + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_decoder.NBEATSDecoder import NBEATSBLock + + +class TransposeLinear(nn.Module): + def __init__(self, weights: torch.Tensor): + self.register_buffer('weights', weights) + + def forward(self, x: torch.Tensor): + return x.mm(self.weights) + + +def linspace(backcast_length: int, forecast_length: int, centered: bool = False) -> Tuple[np.ndarray, np.ndarray]: + if centered: + norm = max(backcast_length, forecast_length) + start = -backcast_length + stop = forecast_length - 1 + else: + norm = backcast_length + forecast_length + start = 0 + stop = backcast_length + forecast_length - 1 + lin_space = np.linspace(start / norm, stop / norm, backcast_length + forecast_length, dtype=np.float32) + b_ls = lin_space[:backcast_length] + f_ls = lin_space[backcast_length:] + return b_ls, f_ls + + +def get_generic_heads(block_width: int, thetas_dim: int, forecast_length: int, backcast_length: int): + backcast_head = nn.Sequential(nn.Linear(block_width, thetas_dim, bias=False), + nn.Linear(thetas_dim, backcast_length)) + forecast_head = nn.Sequential(nn.Linear(block_width, thetas_dim, bias=False), + nn.Linear(thetas_dim, forecast_length)) + return backcast_head, forecast_head + + +def get_trend_heads(block_width: int, thetas_dim: int, forecast_length: int, backcast_length: int): + base_layer = nn.Linear(block_width, thetas_dim, bias=False) + + backcast_linspace, forecast_linspace = linspace(backcast_length, forecast_length, centered=True) + norm = np.sqrt(forecast_length / thetas_dim) # ensure range of predictions is comparable to input + + coefficients_backcast = torch.tensor([backcast_linspace ** i for i in range(thetas_dim)], dtype=torch.float32) + + coefficients_forecast = torch.tensor([forecast_linspace ** i for i in range(thetas_dim)], dtype=torch.float32) + + backcast_head = nn.Sequential(base_layer, + TransposeLinear(coefficients_backcast * norm)) + forecast_head = nn.Sequential(base_layer, + TransposeLinear(coefficients_forecast * norm)) + return backcast_head, forecast_head + + +def get_seasonality_heads(block_width: int, thetas_dim: int, forecast_length: int, backcast_length: int): + base_layer = nn.Linear(block_width, thetas_dim, bias=False) + + backcast_linspace, forecast_linspace = linspace(backcast_length, forecast_length, centered=False) + + def get_frequencies(n): + return np.linspace(0, (backcast_length + forecast_length) / thetas_dim, n) + + p1, p2 = (forecast_length // 2, forecast_length // 2) if forecast_length % 2 == 0 else \ + (forecast_length // 2, forecast_length // 2 + 1) + + s1_b = torch.tensor( + [np.cos(2 * np.pi * i * backcast_linspace) for i in get_frequencies(p1)], 
dtype=torch.float32) # H/2-1 + s2_b = torch.tensor( + [np.sin(2 * np.pi * i * backcast_linspace) for i in get_frequencies(p2)], dtype=torch.float32) + + s1_f = torch.tensor( + [np.cos(2 * np.pi * i * forecast_linspace) for i in get_frequencies(p1)], dtype=torch.float32 + ) # H/2-1 + s2_f = torch.tensor( + [np.sin(2 * np.pi * i * forecast_linspace) for i in get_frequencies(p2)], dtype=torch.float32 + ) + + backcast_head = nn.Sequential(base_layer, + TransposeLinear(torch.cat([s1_b, s2_b]))) + forecast_head = nn.Sequential(base_layer, + TransposeLinear(torch.cat([s1_f, s2_f]))) + + return backcast_head, forecast_head + + +def build_NBEATS_network(nbeats_decoder: List[List[NBEATSBLock]], output_shape: Tuple[int]) -> List[NBEATSBLock]: + nbeats_blocks = [] + for stack_idx, stack in enumerate(nbeats_decoder): + for block_idx, block in enumerate(nbeats_decoder[stack_idx]): + stack_type = block.stack_type + if stack_type == 'generic': + backcast_head, forecast_head = get_generic_heads(block_width=block.width, + thetas_dim=block.expansion_coefficient_length, + forecast_length=np.product(output_shape[1:]).item(), + backcast_length=block.n_in_features) + elif stack_type == 'trend': + backcast_head, forecast_head = get_trend_heads(block_width=block.width, + thetas_dim=block.expansion_coefficient_length, + forecast_length=np.product(output_shape[1:]).item(), + backcast_length=block.n_in_features) + elif stack_type == 'seasonality': + backcast_head, forecast_head = get_seasonality_heads(block_width=block.width, + thetas_dim=block.expansion_coefficient_length, + forecast_length=np.product( + output_shape[1:]).item(), + backcast_length=block.n_in_features) + else: + raise ValueError(f"Unsupported stack_type {stack_type}") + block.backcast_head = backcast_head + block.forecast_head = forecast_head + + nbeats_blocks.append(block) + if nbeats_blocks[-1].weight_sharing: + block = nbeats_blocks[-1] + for _ in range(block.num_blocks - 1): + nbeats_blocks.append(nbeats_blocks[-1]) + return nbeats_blocks diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py index c05c78cd3..39ef0c558 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py @@ -5,17 +5,9 @@ import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace -import numpy as np - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.setup.network_head.base_network_head import ( - NetworkHeadComponent, ) from autoPyTorch.pipeline.components.setup.network_head import NetworkHeadChoice diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 301bb9c7c..b1fbb39e8 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -11,6 +11,7 @@ from autoPyTorch.utils.common import 
FitRequirement from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import \ ALL_DISTRIBUTIONS +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.NBEATS_head import build_NBEATS_network class ForecastingHead(NetworkHeadComponent): @@ -27,7 +28,7 @@ def __init__(self, self.add_fit_requirements(self._required_fit_requirements) self.head: Optional[nn.Module] = None self.required_net_out_put_type: Optional[str] = None - + self.output_shape = None @property def _required_fit_requirements(self) -> List[FitRequirement]: @@ -35,20 +36,12 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), FitRequirement('auto_regressive', (bool,), user_defined=False, dataset_property=False), + FitRequirement('decoder_properties', (Dict,), user_defined=False, dataset_property=False), FitRequirement('n_decoder_output_features', (int, ), user_defined=False, dataset_property=False), FitRequirement('n_prediction_heads', (int,), user_defined=False, dataset_property=False), FitRequirement('output_shape', (Iterable, int), user_defined=True, dataset_property=True), ] - @property - def decoder_properties(self): - decoder_property = {'additional_output': False, - 'additional_input': False, - 'fixed_input_seq_length': False, - 'recurrent': False, - } - return decoder_property - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: """ Builds the head component and assigns it to self.head @@ -64,6 +57,15 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.required_net_out_put_type = X['required_net_out_put_type'] + if X['decoder_properties']['multi_blocks']: + # if the decoder is a stacked block, we directly build head inside the decoder + if X.get('network_decoder', None) is None: + raise ValueError("when decoder has multi_blocks, it must be specified!") + if self.required_net_out_put_type != 'regression': + raise ValueError("decoder with multi block structure only allow regression loss!") + self.output_shape = output_shape + return self + if self.required_net_out_put_type == 'distribution': if 'dist_cls' not in X: raise ValueError('Distribution output type must contain dist_cls!!') @@ -84,6 +86,24 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: ) return self + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the network head into the fit dictionary 'X' and returns it. 
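For orientation, the backcast and forecast heads that build_NBEATS_network attaches to each block are consumed by a doubly residual loop: every block's backcast is subtracted from the input handed to the next block, while its partial forecast is accumulated into the final prediction. A minimal, self-contained sketch of that loop under toy sizes follows; ToyNBEATSBlock and the concrete dimensions are illustrative stand-ins, not the classes defined in this patch.

import torch
from torch import nn


class ToyNBEATSBlock(nn.Module):
    # illustrative block: a small MLP body with separate backcast and forecast heads
    def __init__(self, backcast_length: int, forecast_length: int, width: int = 32):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(backcast_length, width), nn.ReLU())
        self.backcast_head = nn.Linear(width, backcast_length)
        self.forecast_head = nn.Linear(width, forecast_length)

    def forward(self, x: torch.Tensor):
        h = self.body(x)
        return self.backcast_head(h), self.forecast_head(h)


blocks = nn.ModuleList(ToyNBEATSBlock(backcast_length=12, forecast_length=3) for _ in range(4))
backcast = torch.randn(8, 12)   # flattened window of past targets, shape (batch, backcast_length)
forecast = torch.zeros(8, 3)    # accumulated prediction, shape (batch, forecast_length)
for block in blocks:
    backcast_block, forecast_block = block(backcast)
    backcast = backcast - backcast_block   # remove what this block already explained
    forecast = forecast + forecast_block   # sum the partial forecasts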
+ + Args: + X (Dict[str, Any]): 'X' dictionary + Returns: + (Dict[str, Any]): the updated 'X' dictionary + """ + if self.head is not None: + X.update({'network_head': self.head}) + else: + decoder = X['network_decoder'] + decoder = build_NBEATS_network(decoder, self.output_shape[1:]) + + return X + + def build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 72d5cef68..4c72c1af8 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -232,8 +232,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: # Incorporate the transform to the dataset datamanager = X['backend'].load_datamanager() # type: TimeSeriesForcecastingDataset - assert self.subseq_length < datamanager.seq_length_min, "dataloader's window size must be smaller than the" \ - "minimal sequence length of the dataset!!" + # TODO, consider bucket setting self.train_transform = self.build_transform(X, mode='train') self.val_transform = self.build_transform(X, mode='val') diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index ebf2bd18d..6fb7da5be 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -84,6 +84,7 @@ def prepare_trainer(self, X): step_interval=X['step_interval'], dataset_properties=X['dataset_properties'], target_scaler=X['target_scaler'], + backcast_loss_ratio=X.get('backcast_loss_ratio', 0.0) ) def get_components(self) -> Dict[str, autoPyTorchComponent]: diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index eb7fd752f..6b9787215 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -22,7 +22,8 @@ from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. 
\ TargetNoScaler import TargetNoScaler from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit -from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNet, ForecastingDeepARNet +from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNet, ForecastingDeepARNet, \ + NBEATSNet from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score @@ -45,6 +46,7 @@ def prepare( step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch, dataset_properties: Optional[Dict] = None, target_scaler: BaseTargetScaler = TargetNoScaler(), + backcast_loss_ratio: Optional[float] = None, ) -> None: # metrics_during_training is not appliable when computing scaled values metrics_during_training = False @@ -64,6 +66,7 @@ def prepare( "n_prediction_steps": dataset_properties.get("n_prediction_steps", 1)} self.metrics_kwargs = metric_kwargs self.target_scaler = target_scaler # typing: BaseTargetScaler + self.backcast_loss_ratio = backcast_loss_ratio def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, writer: Optional[SummaryWriter], @@ -152,7 +155,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor # prepare past_target = past_target.float() - if self.model.future_target_required: + if self.model.future_target_required or isinstance(self.model, NBEATSNet): past_target, scaled_future_targets, loc, scale = self.target_scaler(past_target, future_targets) else: past_target, _, loc, scale = self.target_scaler(past_target) @@ -160,23 +163,37 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor past_target = past_target.to(self.device) future_targets = self.cast_targets(future_targets) - if isinstance(self.model, ForecastingDeepARNet) and self.model.encoder_bijective_seq_output: - future_targets = torch.cat([past_target[:, 1:, ], future_targets], dim=1) - past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) - else: - past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) # training self.optimizer.zero_grad() - if self.model.future_target_required: - outputs = self.model(past_target, scaled_future_targets.float().to(self.device)) + + if isinstance(self.model, NBEATSNet): + past_target, criterion_kwargs_past = self.data_preparation(past_target, + scaled_future_targets.to(self.device)) + past_target, criterion_kwargs_future = self.data_preparation(past_target, past_target) + backcast, forecast = self.model(past_target) + + loss_func_backcast = self.criterion_preparation(**criterion_kwargs_past) + loss_backcast = loss_func_backcast(self.criterion, backcast) + loss_forecast = loss_func_backcast(self.criterion, forecast) + loss = loss_forecast + loss_backcast * self.backcast_loss_ratio else: - outputs = self.model(past_target) - outputs = self.rescale_output(outputs, loc=loc, scale=scale, device=self.device) + if isinstance(self.model, ForecastingDeepARNet) and self.model.encoder_bijective_seq_output: + future_targets = torch.cat([past_target[:, 1:, ], future_targets], dim=1) + past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) + else: + past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) + + if self.model.future_target_required: + outputs = self.model(past_target, scaled_future_targets.float().to(self.device)) + else: + outputs = self.model(past_target) + + outputs = 
self.rescale_output(outputs, loc=loc, scale=scale, device=self.device) - loss_func = self.criterion_preparation(**criterion_kwargs) - loss = loss_func(self.criterion, outputs) + loss_func = self.criterion_preparation(**criterion_kwargs) + loss = loss_func(self.criterion, outputs) loss.backward() self.optimizer.step() diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index cc82c5649..9a12e8263 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -260,6 +260,37 @@ def _get_hyperparameter_search_space(self, cs.add_forbidden_clauses(forbidden_losses_all) + # NBEATS + forbidden_NBEATS = [] + network_decoder_hp = cs.get_hyperparameter('network_decoder:__choice__') + encoder_non_BEATS = [choice for choice in network_encoder_hp.choices if choice != 'NBEATSEncoder'] + decoders_non_NBEATS = [choice for choice in network_decoder_hp.choices if choice != 'NBEATSDecoder'] + loss_non_regression = [choice for choice in hp_loss.choices if choice != 'RegressionLoss'] + + forbidden_encoder_NBEATS = ForbiddenInClause(network_encoder_hp, encoder_non_BEATS) + forbidden_decoder_NBEATS = ForbiddenInClause(network_decoder_hp, decoders_non_NBEATS) + forbidden_loss_non_regression = ForbiddenInClause(hp_loss, loss_non_regression) + + # Ensure that NBEATS encoder only works with NBEATS decoder + if 'NBEATSEncoder' in network_encoder_hp.choices: + forbidden_NBEATS.append(ForbiddenAndConjunction( + ForbiddenEqualsClause(network_encoder_hp, 'NBEATSEncoder'), + forbidden_encoder_NBEATS) + ) + forbidden_NBEATS.append(ForbiddenAndConjunction( + ForbiddenEqualsClause(network_encoder_hp, 'NBEATSEncoder'), + forbidden_loss_non_regression) + ) + if 'NBEASTDecoder' in network_decoder_hp.choices: + forbidden_NBEATS.append(ForbiddenAndConjunction( + ForbiddenEqualsClause(network_decoder_hp, 'NBEATSDecoder'), + forbidden_decoder_NBEATS) + ) + forbidden_NBEATS.append(ForbiddenAndConjunction( + ForbiddenEqualsClause(network_decoder_hp, 'NBEATSDecoder'), + forbidden_loss_non_regression) + ) + cs.add_forbidden_clauses(forbidden_NBEATS) # rnn head only allow rnn backbone if 'network_encoder' in self.named_steps.keys() and 'network_decoder' in self.named_steps.keys(): From b362a753bb25e1a5574e79925510ba320ec90f1a Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 21 Dec 2021 10:57:38 +0100 Subject: [PATCH 098/347] maint --- .../setup/network/forecasting_network.py | 14 +++++++++----- .../forecasting_encoder/RNNEncoder.py | 2 +- .../forecasting_network_head/NBEATS_head.py | 12 +++++++----- .../forecasting_network_head/forecasting_head.py | 5 +++-- 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 44e54be1f..6d6a9f9f3 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -78,6 +78,7 @@ def __init__(self, self.encoder_has_hidden_states = encoder_properties['has_hidden_states'] self.decoder_has_hidden_states = decoder_properties['has_hidden_states'] + def forward(self, targets_past: torch.Tensor, targets_future: Optional[torch.Tensor] = None, @@ -246,10 +247,13 @@ def forward(self, if self.encoder_has_hidden_states: # For RNN, we only feed the hidden state and generated future input to the netwrok encoder_output, hidden_states = self.encoder(x_past) - 
repeated_state = [ - s.repeat_interleave(repeats=self.num_samples, dim=1) - for s in hidden_states - ] + if isinstance(hidden_states, tuple): + repeated_state = [ + s.repeat_interleave(repeats=self.num_samples, dim=1) + for s in hidden_states + ] + else: + repeated_state = hidden_states.repeat_interleave(repeats=self.num_samples, dim=1) else: # For other models, the full past targets are passed to the network. @@ -317,7 +321,7 @@ def forward(self, features_future: Optional[torch.Tensor] = None, features_static: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): - forecast = torch.zeros_like(targets_future).view(targets_future.shape[0], -1) + forecast = torch.zeros([self.n_prediction_steps, *targets_past[1:]]) backcast = self.encoder(targets_past) for block in self.decoder: backcast_block, forecast_block = block(backcast) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py index d71662415..427f5a345 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py @@ -42,7 +42,7 @@ def forward(self, hx: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> Tuple[torch.Tensor, ...]: B, T, _ = x.shape - outputs, hidden_state, = self.lstm(x, hx) + outputs, hidden_state = self.lstm(x, hx) if output_seq: return outputs, hidden_state diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py index a41a542ac..4825d12b9 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py @@ -11,6 +11,7 @@ class TransposeLinear(nn.Module): def __init__(self, weights: torch.Tensor): + super().__init__() self.register_buffer('weights', weights) def forward(self, x: torch.Tensor): @@ -88,7 +89,8 @@ def get_frequencies(n): return backcast_head, forecast_head -def build_NBEATS_network(nbeats_decoder: List[List[NBEATSBLock]], output_shape: Tuple[int]) -> List[NBEATSBLock]: +def build_NBEATS_network(nbeats_decoder: List[List[NBEATSBLock]], + output_shape: Tuple[int]) -> nn.ModuleList: nbeats_blocks = [] for stack_idx, stack in enumerate(nbeats_decoder): for block_idx, block in enumerate(nbeats_decoder[stack_idx]): @@ -96,18 +98,18 @@ def build_NBEATS_network(nbeats_decoder: List[List[NBEATSBLock]], output_shape: if stack_type == 'generic': backcast_head, forecast_head = get_generic_heads(block_width=block.width, thetas_dim=block.expansion_coefficient_length, - forecast_length=np.product(output_shape[1:]).item(), + forecast_length=np.product(output_shape).item(), backcast_length=block.n_in_features) elif stack_type == 'trend': backcast_head, forecast_head = get_trend_heads(block_width=block.width, thetas_dim=block.expansion_coefficient_length, - forecast_length=np.product(output_shape[1:]).item(), + forecast_length=np.product(output_shape).item(), backcast_length=block.n_in_features) elif stack_type == 'seasonality': backcast_head, forecast_head = get_seasonality_heads(block_width=block.width, thetas_dim=block.expansion_coefficient_length, forecast_length=np.product( - output_shape[1:]).item(), + output_shape).item(), 
backcast_length=block.n_in_features) else: raise ValueError(f"Unsupported stack_type {stack_type}") @@ -119,4 +121,4 @@ def build_NBEATS_network(nbeats_decoder: List[List[NBEATSBLock]], output_shape: block = nbeats_blocks[-1] for _ in range(block.num_blocks - 1): nbeats_blocks.append(nbeats_blocks[-1]) - return nbeats_blocks + return nn.ModuleList(nbeats_blocks) diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index b1fbb39e8..fc20c30c6 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -99,8 +99,9 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'network_head': self.head}) else: decoder = X['network_decoder'] - decoder = build_NBEATS_network(decoder, self.output_shape[1:]) - + decoder = build_NBEATS_network(decoder, self.output_shape) + X.update({'network_head': self.head, + 'network_decoder': decoder}) return X From 40116dc28507c11eef4a03a388befa7b96454bb3 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 21 Dec 2021 17:05:07 +0100 Subject: [PATCH 099/347] maint --- .../setup/network/forecasting_network.py | 1 - .../time_series_forecasting_data_loader.py | 4 ++-- .../pipeline/time_series_forecasting.py | 24 +++++++++++++++++-- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 6d6a9f9f3..f784acd0f 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -78,7 +78,6 @@ def __init__(self, self.encoder_has_hidden_states = encoder_properties['has_hidden_states'] self.decoder_has_hidden_states = decoder_properties['has_hidden_states'] - def forward(self, targets_past: torch.Tensor, targets_future: Optional[torch.Tensor] = None, diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 4c72c1af8..614338417 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -438,8 +438,8 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, default_value=30), num_batch_per_epoch: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_batches_per_epoch", - value_range=(30, 200), - default_value=100), + value_range=(30, 100), + default_value=50), backcast: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='backcast', value_range=(True, False), diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 9a12e8263..2165bd25c 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -266,30 +266,50 @@ def _get_hyperparameter_search_space(self, encoder_non_BEATS = [choice for choice in network_encoder_hp.choices if choice != 'NBEATSEncoder'] decoders_non_NBEATS = [choice for choice in network_decoder_hp.choices if choice != 'NBEATSDecoder'] 
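The clauses assembled in this hunk all follow the same ConfigSpace pattern: a ForbiddenAndConjunction of an equality clause on one hyperparameter with an in-clause listing its disallowed partners. A standalone sketch of that mechanism on a toy space (the hyperparameter names and choice lists below are made up for illustration and are not the ones registered by the pipeline):

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.forbidden import (
    ForbiddenAndConjunction,
    ForbiddenEqualsClause,
    ForbiddenInClause,
)
from ConfigSpace.hyperparameters import CategoricalHyperparameter

cs = ConfigurationSpace()
encoder = CategoricalHyperparameter('encoder', ['NBEATSEncoder', 'RNNEncoder', 'MLPEncoder'])
decoder = CategoricalHyperparameter('decoder', ['NBEATSDecoder', 'RNNDecoder', 'MLPDecoder'])
cs.add_hyperparameters([encoder, decoder])

# forbid every configuration that pairs the NBEATS encoder with a non-NBEATS decoder
cs.add_forbidden_clause(
    ForbiddenAndConjunction(
        ForbiddenEqualsClause(encoder, 'NBEATSEncoder'),
        ForbiddenInClause(decoder, ['RNNDecoder', 'MLPDecoder']),
    )
)
# sampled configurations now never combine NBEATSEncoder with RNNDecoder or MLPDecoder
print(cs.sample_configuration(5))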
loss_non_regression = [choice for choice in hp_loss.choices if choice != 'RegressionLoss'] + data_loader_backcast = cs.get_hyperparameter('data_loader:backcast') forbidden_encoder_NBEATS = ForbiddenInClause(network_encoder_hp, encoder_non_BEATS) forbidden_decoder_NBEATS = ForbiddenInClause(network_decoder_hp, decoders_non_NBEATS) forbidden_loss_non_regression = ForbiddenInClause(hp_loss, loss_non_regression) + forbidden_backcast = ForbiddenEqualsClause(data_loader_backcast, True) + forbidden_backcast_false = ForbiddenEqualsClause(data_loader_backcast, False) # Ensure that NBEATS encoder only works with NBEATS decoder if 'NBEATSEncoder' in network_encoder_hp.choices: forbidden_NBEATS.append(ForbiddenAndConjunction( ForbiddenEqualsClause(network_encoder_hp, 'NBEATSEncoder'), - forbidden_encoder_NBEATS) + forbidden_decoder_NBEATS) ) forbidden_NBEATS.append(ForbiddenAndConjunction( ForbiddenEqualsClause(network_encoder_hp, 'NBEATSEncoder'), forbidden_loss_non_regression) ) + forbidden_NBEATS.append(ForbiddenAndConjunction( + ForbiddenEqualsClause(network_encoder_hp, 'NBEATSEncoder'), + forbidden_backcast_false) + ) if 'NBEASTDecoder' in network_decoder_hp.choices: forbidden_NBEATS.append(ForbiddenAndConjunction( ForbiddenEqualsClause(network_decoder_hp, 'NBEATSDecoder'), - forbidden_decoder_NBEATS) + forbidden_encoder_NBEATS) ) forbidden_NBEATS.append(ForbiddenAndConjunction( ForbiddenEqualsClause(network_decoder_hp, 'NBEATSDecoder'), forbidden_loss_non_regression) ) + forbidden_NBEATS.append(ForbiddenAndConjunction( + ForbiddenEqualsClause(network_decoder_hp, 'NBEATSDecoder'), + forbidden_backcast_false) + ) + forbidden_NBEATS.append(ForbiddenAndConjunction( + forbidden_backcast, + forbidden_encoder_NBEATS + )) + forbidden_NBEATS.append(ForbiddenAndConjunction( + forbidden_backcast, + forbidden_decoder_NBEATS + )) + cs.add_forbidden_clauses(forbidden_NBEATS) # rnn head only allow rnn backbone From 2b2af672872926f04e0408176c9da0fd32607e96 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 22 Dec 2021 00:13:28 +0100 Subject: [PATCH 100/347] padding loader --- .../training/data_loader/base_data_loader.py | 2 +- .../time_series_forecasting_data_loader.py | 86 ++++++++++++------- 2 files changed, 58 insertions(+), 30 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py index f39194477..8ac2af391 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py @@ -59,7 +59,7 @@ def __init__(self, batch_size: int = 64, FitRequirement("Backend", (Backend,), user_defined=True, dataset_property=False), FitRequirement("is_small_preprocess", (bool,), user_defined=True, dataset_property=True)]) - def transform(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: Dict) -> Dict: """The transform function calls the transform function of the underlying model and returns the transformed array. 
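The data-loader changes that follow swap torch's default collate for a padding collator, so that variable-length past-target sequences can be batched with the most recent observations aligned at the end of the window. A minimal, self-contained sketch of the idea (ToySequenceDataset and pad_from_start are hypothetical helpers, not the classes added in this patch):

import torch
from torch.utils.data import DataLoader, Dataset


class ToySequenceDataset(Dataset):
    # stand-in for the forecasting dataset: items are 1-d past-target sequences of varying length
    def __init__(self, lengths):
        self.data = [torch.arange(float(n)) for n in lengths]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]


def pad_from_start(batch, padding_value=0.0):
    # left-pad every sequence so the last time step of each series stays aligned
    max_len = max(seq.size(0) for seq in batch)
    out = batch[0].new_full((len(batch), max_len), padding_value)
    for i, seq in enumerate(batch):
        out[i, -seq.size(0):] = seq
    return out


loader = DataLoader(ToySequenceDataset([3, 5, 2]), batch_size=3, collate_fn=pad_from_start)
print(next(iter(loader)))
# tensor([[0., 0., 0., 1., 2.],
#         [0., 1., 2., 3., 4.],
#         [0., 0., 0., 0., 1.]])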
diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 614338417..c7251d53b 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -9,6 +9,7 @@ import numpy as np import torch +from torch._six import container_abcs, string_classes, int_classes import torchvision @@ -26,11 +27,48 @@ from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader +class PaddingCollecter: + """ + A collector that transform the sequences from dataset. Since the sequences might contain different + length, they need to be padded with constant value. Given that target value might require special value to + fit the requirement of distribution, past_target will be padded with special values + #TODO implement padding collect!! + + """ + + def __init__(self, target_padding_value: float = 0.0): + self.target_padding_value = target_padding_value + + def __call__(self, batch, padding_value=0.0): + + elem = batch[0] + elem_type = type(elem) + if isinstance(elem, torch.Tensor): + out = None + if torch.utils.data.get_worker_info() is not None: + # If we're in a background process, concatenate directly into a + # shared memory tensor to avoid an extra copy + numel = sum([x.numel() for x in batch]) + storage = elem.storage()._new_shared(numel) + out = elem.new(storage) + return torch.stack(batch, 0, out=out) + elif isinstance(elem, float): + return torch.tensor(batch, dtype=torch.float64) + elif isinstance(elem, int_classes): + return torch.tensor(batch) + elif isinstance(elem, string_classes): + return batch + elif isinstance(elem, container_abcs.Mapping): + return {key: self([d[key] for d in batch]) if key != "past_target" + else self([d[key] for d in batch], self.target_padding_value) for key in elem} + raise TypeError(f"Unsupported data type {elem_type}") + + class TimeSeriesSampler(SubsetRandomSampler): def __init__(self, indices: Sequence[int], seq_lengths: Sequence[int], - num_instances_per_seqs: Optional[List[int]]=None, + num_instances_per_seqs: Optional[List[int]] = None, min_start: int = 0, generator: Optional[torch.Generator] = None) -> None: """ @@ -127,7 +165,7 @@ class SequenceBuilder(object): sliding window size """ - def __init__(self, sample_interval: int = 1, window_size: int = 1, subseq_length=1, padding_value=0.): + def __init__(self, sample_interval: int = 1, ): """ initialization Args: @@ -135,27 +173,18 @@ def __init__(self, sample_interval: int = 1, window_size: int = 1, subseq_length window_size: int: the size of the sliding window """ self.sample_interval = sample_interval - self.window_size = window_size # assuming that subseq_length is 10, e.g., we can only start from -10. 
sample_interval = -4 # we will sample the following indices: [-9,-5,-1] - self.first_indices = -(self.sample_interval * ((subseq_length - 1) // self.sample_interval) + 1) - self.padding_value = padding_value + # self.first_indices = -(self.sample_interval * ((subseq_length - 1) // self.sample_interval) + 1) def __call__(self, data: np.ndarray) -> np.ndarray: - sample_indices = np.arange(self.first_indices, 0, step=self.sample_interval) - - if sample_indices[0] < -1 * len(data): - # we need to pad with 0 - valid_indices = sample_indices[np.where(sample_indices >= -len(data))[0]] - - data_values = data[valid_indices] - if data.ndim == 1: - padding_vector = np.full([len(sample_indices) - len(valid_indices)], self.padding_value) - return np.hstack([padding_vector, data_values]) - else: - padding_vector = np.full([len(sample_indices) - len(valid_indices), data.shape[-1]], self.padding_value) - return np.vstack([padding_vector, data_values]) + if self.sample_interval == 1: + return data else: + subseq_length = len(data) + first_indices = -(self.sample_interval * ((subseq_length - 1) // self.sample_interval) + 1) + sample_indices = np.arange(first_indices, 0, step=self.sample_interval) + return data[sample_indices] @@ -212,9 +241,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: Returns: A instance of self """ - X["window_size"] = self.window_size # this value corresponds to budget type resolution sample_interval = X.get('sample_interval', 1) + if sample_interval > 1: + # for lower resolution, window_size should be smaller + self.window_size = self.sample_interval * ((self.window_size - 1) // self.sample_interval) + 1 # this value corresponds to budget type num_sequence fraction_seq = X.get('fraction_seq', 1.0) # this value corresponds to budget type num_sample_per_seq @@ -223,10 +254,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.padding_value = X.get('required_padding_value', 0.0) - # self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 - # we want models with different sample_interval to have similar length scale - self.subseq_length = self.window_size - # Make sure there is an optimizer self.check_requirements(X, y) @@ -327,6 +354,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: ) return self + def transform(self, X: Dict) -> Dict: + X.update({"window_size": self.window_size}) + return super().transform(X) + def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transforms.Compose: """ Method to build a transformation that can pre-process input data @@ -346,12 +377,9 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform # if 'test' in mode or not X['dataset_properties']['is_small_preprocess']: # candidate_transformations.extend(X['preprocess_transforms']) - - candidate_transformations.append((SequenceBuilder(sample_interval=self.sample_interval, - window_size=self.window_size, - subseq_length=self.subseq_length, - padding_value=self.padding_value))) - candidate_transformations.append((ExpandTransformTimeSeries())) + if self.sample_interval > 1: + candidate_transformations.append(SequenceBuilder(sample_interval=self.sample_interval)) + candidate_transformations.append(ExpandTransformTimeSeries()) if "test" in mode or not X['dataset_properties']['is_small_preprocess']: candidate_transformations.extend(X['preprocess_transforms']) From 2e30b711acd57b1def8f014a3792ebd7e434bb4b Mon Sep 17 00:00:00 2001 From: dengdifan 
Date: Wed, 22 Dec 2021 10:41:29 +0100 Subject: [PATCH 101/347] new argument to custom_collect_fn --- autoPyTorch/utils/common.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py index 7be8a233c..c7ecd1e78 100644 --- a/autoPyTorch/utils/common.py +++ b/autoPyTorch/utils/common.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Iterable, List, NamedTuple, Optional, Sequence, Type, Union +from typing import Any, Dict, Iterable, List, NamedTuple, Optional, Sequence, Type, Union, Callable from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -75,7 +75,7 @@ def __str__(self) -> str: self.hyperparameter, self.value_range, self.default_value, self.log) -def custom_collate_fn(batch: List) -> List[Optional[torch.Tensor]]: +def custom_collate_fn(batch: List, collate: Callable = default_collate) -> List[Optional[torch.Tensor]]: """ In the case of not providing a y tensor, in a dataset of form {X, y}, y would be None. @@ -86,6 +86,8 @@ def custom_collate_fn(batch: List) -> List[Optional[torch.Tensor]]: Args: batch (List): a batch from a dataset + collate (callable): how the data is collected, e.g., when one want to pad sequences with different lengths. + collate is only applied to X, for y, the normal default_collate is applied. Returns: List[Optional[torch.Tensor]] @@ -94,7 +96,7 @@ def custom_collate_fn(batch: List) -> List[Optional[torch.Tensor]]: items = list(zip(*batch)) # The feature will always be available - items[0] = default_collate(items[0]) + items[0] = collate(items[0]) if None in items[1]: items[1] = list(items[1]) else: From f193423efecbd85b0a5f1ae48fe399a8f2927932 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 22 Dec 2021 12:10:50 +0100 Subject: [PATCH 102/347] new target scaler, allow NoNorm for MLP Encpder --- .../TargetMeanAbsScaler.py | 17 +++++++++++++++++ .../forecasting_target_scaling/utils.py | 9 +++++++++ .../forecasting_encoder/MLPEncoder.py | 2 +- 3 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMeanAbsScaler.py diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMeanAbsScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMeanAbsScaler.py new file mode 100644 index 000000000..51de0c9a7 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMeanAbsScaler.py @@ -0,0 +1,17 @@ +from typing import Any, Dict, Optional, Union + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.\ + forecasting_target_scaling.base_target_scaler import BaseTargetScaler + + +class TargetMeanAbsScaler(BaseTargetScaler): + @property + def scaler_mode(self): + return 'mean_abs' + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'TargetMeanAbsScaler', + 'name': 'TargetMeanAbsScaler' + } diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py index 8230e5f0b..f56920a63 100644 --- 
a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py @@ -47,6 +47,15 @@ def transform(self, past_targets: torch.Tensor, future_targets: Optional[torch.T future_targets = future_targets / scale return past_targets / scale, future_targets, None, scale + elif self.mode == 'mean_abs': + mean_abs = torch.mean(torch.abs(past_targets), dim=1, keepdim=True) + mean_abs[mean_abs == 0.0] = 1.0 + scale = mean_abs + if future_targets is not None: + future_targets = future_targets / scale + return past_targets / scale, future_targets, None, scale + + elif self.mode == "none": return past_targets, future_targets, None, None diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py index 847251238..1b555c119 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py @@ -157,7 +157,7 @@ def get_hyperparameter_search_space( log=True ), normalization: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='normalization', - value_range=('BN', 'LN'), + value_range=('BN', 'LN', 'NoNorm'), default_value='BN'), dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dropout", value_range=(0, 0.8), From 752a58fae8538f197a3a84c2793b1269c6f691b0 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 22 Dec 2021 12:11:52 +0100 Subject: [PATCH 103/347] allow sampling full sequences --- .../setup/network/forecasting_network.py | 6 +- .../time_series_forecasting_data_loader.py | 87 +++++++++++++------ .../trainer/forecasting_trainer/__init__.py | 7 +- .../forecasting_base_trainer.py | 6 +- autoPyTorch/utils/common.py | 6 +- 5 files changed, 76 insertions(+), 36 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index f784acd0f..deae048b1 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -352,10 +352,12 @@ def __init__( self.forecast_strategy = forecast_strategy self.num_samples = num_samples self.aggregation = aggregation + self.window_size = None @property def _required_fit_requirements(self): return [ + FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_encoder", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_decoder", (torch.nn.Module,), user_defined=False, dataset_property=False), @@ -387,6 +389,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: num_samples=self.num_samples, aggregation=self.aggregation, ) + self.window_size = X['window_size'] + if X['decoder_properties']['has_hidden_states']: # decoder is RNN self.network = ForecastingSeq2SeqNet(**network_init_kwargs) @@ -424,7 +428,7 @@ def predict(self, loader: torch.utils.data.DataLoader, for i, (X_batch, Y_batch) in enumerate(loader): # Predict on batch - X = X_batch['past_target'] + X = X_batch['past_target'][:, -self.window_size:] X = 
X.float() diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index c7251d53b..e76a736e0 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -1,6 +1,5 @@ from typing import Any, Dict, Optional, Tuple, Union, Sequence, List - -from torch.utils.data.sampler import SubsetRandomSampler +from functools import partial from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, CategoricalHyperparameter @@ -9,12 +8,12 @@ import numpy as np import torch +from torch.utils.data.sampler import SubsetRandomSampler from torch._six import container_abcs, string_classes, int_classes +from torch.utils.data._utils.collate import np_str_obj_array_pattern, default_collate_err_msg_format, default_collate import torchvision -import warnings - from autoPyTorch.datasets.base_dataset import TransformSubset from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence from autoPyTorch.utils.common import ( @@ -27,31 +26,68 @@ from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader -class PaddingCollecter: +def pad_sequence_from_start(sequences: List[torch.Tensor], + seq_minimal_length: int, + batch_first=False, + padding_value=0.0) -> torch.Tensor: + r""" + This function is quite similar to torch.nn.utils.rnn.pad_sequence except that we pad new values from the start of + the sequence. i.e., instead of extending [1,2,3] to [1,2,3,0,0], we extend it as [0,0,1,2,3]. Additionally, the + generated sequnece needs to have a length of at least seq_minimal_length + """ + + # assuming trailing dimensions and type of all the Tensors + # in sequences are same and fetching those from sequences[0] + max_size = sequences[0].size() + trailing_dims = max_size[1:] + max_len = max(max([s.size(0) for s in sequences]), seq_minimal_length) + if batch_first: + out_dims = (len(sequences), max_len) + trailing_dims + else: + out_dims = (max_len, len(sequences)) + trailing_dims + + out_tensor = sequences[0].new_full(out_dims, padding_value) + for i, tensor in enumerate(sequences): + length = tensor.size(0) + # use index notation to prevent duplicate references to the tensor + if batch_first: + out_tensor[i, -length:, ...] = tensor + else: + out_tensor[-length:, i, ...] = tensor + + return out_tensor + + +class PadSequenceCollector: """ A collector that transform the sequences from dataset. Since the sequences might contain different length, they need to be padded with constant value. Given that target value might require special value to fit the requirement of distribution, past_target will be padded with special values - #TODO implement padding collect!! 
""" - def __init__(self, target_padding_value: float = 0.0): + def __init__(self, window_size: int, target_padding_value: float = 0.0): + self.window_size = window_size self.target_padding_value = target_padding_value def __call__(self, batch, padding_value=0.0): - elem = batch[0] elem_type = type(elem) if isinstance(elem, torch.Tensor): - out = None - if torch.utils.data.get_worker_info() is not None: - # If we're in a background process, concatenate directly into a - # shared memory tensor to avoid an extra copy - numel = sum([x.numel() for x in batch]) - storage = elem.storage()._new_shared(numel) - out = elem.new(storage) - return torch.stack(batch, 0, out=out) + seq = pad_sequence_from_start(batch, + seq_minimal_length=self.window_size, + batch_first=True, padding_value=padding_value) # type: torch.Tensor + return seq + elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \ + and elem_type.__name__ != 'string_': + if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap': + # array of string classes and object + if np_str_obj_array_pattern.search(elem.dtype.str) is not None: + raise TypeError(default_collate_err_msg_format.format(elem.dtype)) + + return default_collate([torch.as_tensor(b) for b in batch]) + elif elem.shape == (): # scalars + return torch.as_tensor(batch) elif isinstance(elem, float): return torch.tensor(batch, dtype=torch.float64) elif isinstance(elem, int_classes): @@ -228,7 +264,7 @@ def __init__(self, # self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 self.subseq_length = self.window_size self.num_batches_per_epoch = num_batches_per_epoch if num_batches_per_epoch is not None else np.inf - self.padding_value = 0.0 + self.padding_collector = PadSequenceCollector(self.window_size, 0.0) def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: """ @@ -245,15 +281,16 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: sample_interval = X.get('sample_interval', 1) if sample_interval > 1: # for lower resolution, window_size should be smaller - self.window_size = self.sample_interval * ((self.window_size - 1) // self.sample_interval) + 1 + self.window_size = (self.window_size - 1) // self.sample_interval + 1 # this value corresponds to budget type num_sequence fraction_seq = X.get('fraction_seq', 1.0) # this value corresponds to budget type num_sample_per_seq fraction_samples_per_seq = X.get('fraction_samples_per_seq', 1.0) self.sample_interval = sample_interval - self.padding_value = X.get('required_padding_value', 0.0) + padding_value = X.get('required_padding_value', 0.0) + self.padding_collector.target_padding_value = padding_value # Make sure there is an optimizer self.check_requirements(X, y) @@ -339,7 +376,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: num_workers=X.get('num_workers', 0), pin_memory=X.get('pin_memory', True), drop_last=X.get('drop_last', True), - collate_fn=custom_collate_fn, + collate_fn=partial(custom_collate_fn, x_collector=self.padding_collector), sampler=self.sampler_train, ) @@ -350,7 +387,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: num_workers=X.get('num_workers', 0), pin_memory=X.get('pin_memory', True), drop_last=X.get('drop_last', False), - collate_fn=custom_collate_fn, + collate_fn=partial(custom_collate_fn, x_collector=self.padding_collector), ) return self @@ -396,12 +433,6 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd 
""" # TODO more supported inputs if isinstance(X, (np.ndarray, torch.Tensor)): - X = X[-self.subseq_length - self.n_prediction_steps + 1:] - - if y is not None: - # we want to make sure that X, and y can be mapped one to one (as sampling y requires a shifted value) - y = y[-self.subseq_length - self.n_prediction_steps + 1:] - dataset = TimeSeriesSequence( X=X, Y=y, # This dataset is used for loading test data in a batched format @@ -425,7 +456,7 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd dataset_test, batch_size=min(batch_size, len(dataset)), shuffle=False, - collate_fn=custom_collate_fn, + collate_fn=partial(custom_collate_fn, x_collector=self.padding_collector), ) def get_train_data_loader(self) -> torch.utils.data.DataLoader: diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index 6fb7da5be..574c0d3b9 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -43,8 +43,10 @@ class ForecastingTrainerChoice(TrainerChoice): @property def _fit_requirements(self) -> Optional[List[FitRequirement]]: fit_requirements = super()._fit_requirements - fit_requirements.append(FitRequirement("target_scaler", (BaseTargetScaler,), - user_defined=False, dataset_property=False)) + fit_requirements.extend([FitRequirement("target_scaler", (BaseTargetScaler,), + user_defined=False, dataset_property=False), + FitRequirement("window_size", (int,), user_defined=False, dataset_property=False)] + ) return fit_requirements def get_budget_tracker(self, X): @@ -82,6 +84,7 @@ def prepare_trainer(self, X): task_type=STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']], labels=X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]], step_interval=X['step_interval'], + window_size=X['window_size'], dataset_properties=X['dataset_properties'], target_scaler=X['target_scaler'], backcast_loss_ratio=X.get('backcast_loss_ratio', 0.0) diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 6b9787215..d3d3dbeee 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -44,6 +44,7 @@ def prepare( task_type: int, labels: Union[np.ndarray, torch.Tensor, pd.DataFrame], step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch, + window_size: int = 20, dataset_properties: Optional[Dict] = None, target_scaler: BaseTargetScaler = TargetNoScaler(), backcast_loss_ratio: Optional[float] = None, @@ -67,6 +68,7 @@ def prepare( self.metrics_kwargs = metric_kwargs self.target_scaler = target_scaler # typing: BaseTargetScaler self.backcast_loss_ratio = backcast_loss_ratio + self.window_size = window_size def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, writer: Optional[SummaryWriter], @@ -151,7 +153,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor torch.Tensor: The predictions of the network float: the loss incurred in the prediction """ - past_target = data['past_target'] + past_target = data['past_target'][:, -self.window_size:] # prepare 
past_target = past_target.float() @@ -226,7 +228,7 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, with torch.no_grad(): for step, (data, future_targets) in enumerate(test_loader): - past_target = data['past_target'] + past_target = data['past_target'][:, -self.window_size:] mase_coefficients.append(data['mase_coefficient']) diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py index c7ecd1e78..bbbf50756 100644 --- a/autoPyTorch/utils/common.py +++ b/autoPyTorch/utils/common.py @@ -75,7 +75,7 @@ def __str__(self) -> str: self.hyperparameter, self.value_range, self.default_value, self.log) -def custom_collate_fn(batch: List, collate: Callable = default_collate) -> List[Optional[torch.Tensor]]: +def custom_collate_fn(batch: List, x_collector: Callable = default_collate) -> List[Optional[torch.Tensor]]: """ In the case of not providing a y tensor, in a dataset of form {X, y}, y would be None. @@ -86,7 +86,7 @@ def custom_collate_fn(batch: List, collate: Callable = default_collate) -> List[ Args: batch (List): a batch from a dataset - collate (callable): how the data is collected, e.g., when one want to pad sequences with different lengths. + x_collector (callable): how the data is collected, e.g., when one want to pad sequences with different lengths. collate is only applied to X, for y, the normal default_collate is applied. Returns: @@ -96,7 +96,7 @@ def custom_collate_fn(batch: List, collate: Callable = default_collate) -> List[ items = list(zip(*batch)) # The feature will always be available - items[0] = collate(items[0]) + items[0] = x_collector(items[0]) if None in items[1]: items[1] = list(items[1]) else: From 2ab286bb909f7b3e855a2e81919867e7f8279f52 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 22 Dec 2021 12:58:36 +0100 Subject: [PATCH 104/347] integrate SeqBuilder to SequenceCollector --- .../forecasting_encoder/MLPEncoder.py | 21 +---- .../forecasting_encoder/NBEATSEncoder.py | 11 +-- .../base_forecasting_encoder.py | 4 + .../time_series_forecasting_data_loader.py | 89 ++++--------------- 4 files changed, 24 insertions(+), 101 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py index 1b555c119..5957fe585 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py @@ -19,8 +19,6 @@ class TimeSeriesMLPrecpocessor(EncoderNetwork): def __init__(self, window_size: int, - fill_lower_resolution_seq: bool = False, - fill_kwargs: Dict = {}, ): """ Transform the input features (B, T, N) to fit the requirement of MLP @@ -32,8 +30,6 @@ def __init__(self, """ super().__init__() self.window_size = window_size - self.fill_lower_resolution_seq = fill_lower_resolution_seq - self.fill_interval = fill_kwargs.get('loader_sample_interval', 1) def forward(self, x: torch.Tensor, output_seq: bool = False): """ @@ -57,13 +53,6 @@ def forward(self, x: torch.Tensor, output_seq: bool = False): if x.shape[1] > self.window_size: # we need to ensure that the input size fits the network shape x = x[:, -self.window_size:] # x.shape = (B, self.window, N) - if self.fill_lower_resolution_seq and x.shape[1] < self.window_size: - x = F.conv_transpose1d(x.transpose(1, 2), - F.pad(torch.ones((1, 1, 1)), (1, 1)), - stride=self.fill_interval, - padding=1).transpose(1, 2) - if x.shape[1] 
< self.window_size: - x = torch.cat([torch.zeros(x.shape[0], self.window_size - x.shape[1], x.shape[2]), x], dim=1) x = x.flatten(-2) return x @@ -71,8 +60,6 @@ def forward(self, x: torch.Tensor, output_seq: bool = False): class MLPEncoder(BaseForecastingEncoder, MLPBackbone): _fixed_seq_length = True window_size = 1 - fill_lower_resolution_seq = False - fill_kwargs = {} def encoder_properties(self): encoder_properties = super().encoder_properties() @@ -91,17 +78,11 @@ def _required_fit_arguments(self) -> List[FitRequirement]: def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.window_size = X["window_size"] # when resolution is smaller - if 'sample_interval' in X and X['sample_interval'] > 1.: - self.fill_lower_resolution_seq = True - self.fill_kwargs = {'loader_sample_interval': X['sample_interval']} - return super().fit(X, y) def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: in_features = input_shape[-1] * self.window_size - feature_preprocessor = TimeSeriesMLPrecpocessor(window_size=self.window_size, - fill_lower_resolution_seq=self.fill_lower_resolution_seq, - fill_kwargs=self.fill_kwargs) + feature_preprocessor = TimeSeriesMLPrecpocessor(window_size=self.window_size) return nn.Sequential(feature_preprocessor, *self._build_backbone(in_features)) def _add_layer(self, layers: List[nn.Module], in_features: int, out_features: int, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/NBEATSEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/NBEATSEncoder.py index 3927deaa5..61ea1654e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/NBEATSEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/NBEATSEncoder.py @@ -25,8 +25,6 @@ class NBEATSEncoder(BaseForecastingEncoder): """ _fixed_seq_length = True window_size = 1 - fill_lower_resolution_seq = False - fill_kwargs = {} def encoder_properties(self): encoder_properties = super().encoder_properties() @@ -43,17 +41,10 @@ def _required_fit_arguments(self) -> List[FitRequirement]: def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.window_size = X["window_size"] - # when resolution is smaller - if 'sample_interval' in X and X['sample_interval'] > 1.: - self.fill_lower_resolution_seq = True - self.fill_kwargs = {'loader_sample_interval': X['sample_interval']} return super().fit(X, y) def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: - preprocessor = TimeSeriesMLPrecpocessor(window_size=self.window_size, - fill_lower_resolution_seq=self.fill_lower_resolution_seq, - fill_kwargs=self.fill_kwargs - ) + preprocessor = TimeSeriesMLPrecpocessor(window_size=self.window_size) return preprocessor @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py index 128cef415..a04f08a0a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -116,10 +116,14 @@ def encoder_properties(self): bijective_seq_output, determines if the network returns a sequence with the same sequence length as the input sequence when output_seq is set True fix_input_shape if the input shape is fixed, this is useful for building 
network head + lagged_input, if lagged input values are applied, this technique is implemented in DeepAR and Transformer + implemented in gluonTS: + https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/torch/model/deepar/module.py """ encoder_properties = {'has_hidden_states': False, 'bijective_seq_output': True, 'fixed_input_seq_length': False, + 'lagged_input': False, } return encoder_properties diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index e76a736e0..f541cf18f 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -66,8 +66,9 @@ class PadSequenceCollector: """ - def __init__(self, window_size: int, target_padding_value: float = 0.0): + def __init__(self, window_size: int, sample_interval:int= 1, target_padding_value: float = 0.0): self.window_size = window_size + self.sample_interval = sample_interval self.target_padding_value = target_padding_value def __call__(self, batch, padding_value=0.0): @@ -77,7 +78,14 @@ def __call__(self, batch, padding_value=0.0): seq = pad_sequence_from_start(batch, seq_minimal_length=self.window_size, batch_first=True, padding_value=padding_value) # type: torch.Tensor - return seq + if self.sample_interval > 1: + subseq_length = seq.shape[1] + first_indices = -(self.sample_interval * ((subseq_length - 1) // self.sample_interval) + 1) + sample_indices = torch.arange(first_indices, 0, step=self.sample_interval) + return seq[:, sample_indices] + else: + return seq + elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \ and elem_type.__name__ != 'string_': if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap': @@ -183,47 +191,6 @@ def __call__(self, data: np.ndarray) -> np.ndarray: return data -class SequenceBuilder(object): - """build a time sequence token from the given time sequence - it requires two hyperparameters: sample_interval and window size - let's assume we have a time sequence - x = [0 1 2 3 4 5 6 7 8 9 10].with window_size=3 and sample resolution=2 - then the extracted time series is [6, 8, 10] (or x[-5,-3,-1]) - if window_size=3 and sample_resolution=3 - then the extracted token is [4, 7, 10] (or x[-7,-4,-1]) - - Parameters - ---------- - sample_interval : int, default=1 - sample resolution - - window_size : int, default=1 - sliding window size - """ - - def __init__(self, sample_interval: int = 1, ): - """ - initialization - Args: - sample_interval: int: sample resolution - window_size: int: the size of the sliding window - """ - self.sample_interval = sample_interval - # assuming that subseq_length is 10, e.g., we can only start from -10. 
sample_interval = -4 - # we will sample the following indices: [-9,-5,-1] - # self.first_indices = -(self.sample_interval * ((subseq_length - 1) // self.sample_interval) + 1) - - def __call__(self, data: np.ndarray) -> np.ndarray: - if self.sample_interval == 1: - return data - else: - subseq_length = len(data) - first_indices = -(self.sample_interval * ((subseq_length - 1) // self.sample_interval) + 1) - sample_indices = np.arange(first_indices, 0, step=self.sample_interval) - - return data[sample_indices] - - class TimeSeriesForecastingDataLoader(FeatureDataLoader): """This class is an interface to read time sequence data @@ -264,7 +231,7 @@ def __init__(self, # self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 self.subseq_length = self.window_size self.num_batches_per_epoch = num_batches_per_epoch if num_batches_per_epoch is not None else np.inf - self.padding_collector = PadSequenceCollector(self.window_size, 0.0) + self.padding_collector = None def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: """ @@ -277,23 +244,22 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: Returns: A instance of self """ + self.check_requirements(X, y) + # this value corresponds to budget type resolution sample_interval = X.get('sample_interval', 1) + padding_value = X.get('required_padding_value', 0.0) + self.padding_collector = PadSequenceCollector(self.window_size, sample_interval, padding_value) + if sample_interval > 1: # for lower resolution, window_size should be smaller - self.window_size = (self.window_size - 1) // self.sample_interval + 1 + self.window_size = (self.window_size - 1) // sample_interval + 1 # this value corresponds to budget type num_sequence fraction_seq = X.get('fraction_seq', 1.0) # this value corresponds to budget type num_sample_per_seq fraction_samples_per_seq = X.get('fraction_samples_per_seq', 1.0) self.sample_interval = sample_interval - padding_value = X.get('required_padding_value', 0.0) - - self.padding_collector.target_padding_value = padding_value - # Make sure there is an optimizer - self.check_requirements(X, y) - # Incorporate the transform to the dataset datamanager = X['backend'].load_datamanager() # type: TimeSeriesForcecastingDataset @@ -346,24 +312,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: # TODO consider the case where num_instances_train is greater than num_instances_dataset, # In which case we simply iterate through all the datasets - """ - # to allow a time sequence data with resolution self.sample_interval and windows size with self.window_size - # we need to drop the first part of each sequence - for seq_idx, seq_length in enumerate(datamanager.sequence_lengths_train): - idx_end = idx_start + seq_length - #full_sequence = np.random.choice(np.arange(idx_start, idx_end)[self.subseq_length:], 5) - #full_sequence = np.arange(idx_start, idx_end)[self.subseq_length:] - #full_sequence = np.random.choice(np.arange(idx_start, idx_end)[self.subseq_length:], 5) - full_sequence = np.arange(idx_start, idx_end) - valid_indices.append(full_sequence) - idx_start = idx_end - - valid_indices = np.hstack([valid_idx for valid_idx in valid_indices]) - _, sampler_indices_train, _ = np.intersect1d(train_split, valid_indices, return_indices=True) - """ - # test_indices not required as testsets usually lies on the trail of hte sequence - # _, sampler_indices_test, _ = np.intersect1d(test_split, valid_indices) - sampler_indices_train = 
np.arange(num_instances_dataset) self.sampler_train = TimeSeriesSampler(indices=sampler_indices_train, seq_lengths=seq_train_length, @@ -414,8 +362,7 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform # if 'test' in mode or not X['dataset_properties']['is_small_preprocess']: # candidate_transformations.extend(X['preprocess_transforms']) - if self.sample_interval > 1: - candidate_transformations.append(SequenceBuilder(sample_interval=self.sample_interval)) + candidate_transformations.append(ExpandTransformTimeSeries()) if "test" in mode or not X['dataset_properties']['is_small_preprocess']: candidate_transformations.extend(X['preprocess_transforms']) From d90d6300f785e3dee82856f40c83da693aaffb3c Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 22 Dec 2021 13:34:21 +0100 Subject: [PATCH 105/347] restore SequenceBuilder to reduce memory usage --- .../time_series_forecasting_data_loader.py | 54 +++++++++++++++---- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index f541cf18f..601675308 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -66,9 +66,8 @@ class PadSequenceCollector: """ - def __init__(self, window_size: int, sample_interval:int= 1, target_padding_value: float = 0.0): + def __init__(self, window_size: int, target_padding_value: float = 0.0): self.window_size = window_size - self.sample_interval = sample_interval self.target_padding_value = target_padding_value def __call__(self, batch, padding_value=0.0): @@ -78,13 +77,7 @@ def __call__(self, batch, padding_value=0.0): seq = pad_sequence_from_start(batch, seq_minimal_length=self.window_size, batch_first=True, padding_value=padding_value) # type: torch.Tensor - if self.sample_interval > 1: - subseq_length = seq.shape[1] - first_indices = -(self.sample_interval * ((subseq_length - 1) // self.sample_interval) + 1) - sample_indices = torch.arange(first_indices, 0, step=self.sample_interval) - return seq[:, sample_indices] - else: - return seq + return seq elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \ and elem_type.__name__ != 'string_': @@ -190,6 +183,43 @@ def __call__(self, data: np.ndarray) -> np.ndarray: data = np.expand_dims(data, axis=-1) return data +class SequenceBuilder(object): + """build a time sequence token from the given time sequence + it requires two hyperparameters: sample_interval and window size + let's assume we have a time sequence + x = [0 1 2 3 4 5 6 7 8 9 10].with window_size=3 and sample resolution=2 + then the extracted time series is [6, 8, 10] (or x[-5,-3,-1]) + if window_size=3 and sample_resolution=3 + then the extracted token is [4, 7, 10] (or x[-7,-4,-1]) + Parameters + ---------- + sample_interval : int, default=1 + sample resolution + window_size : int, default=1 + sliding window size + """ + + def __init__(self, sample_interval: int = 1, ): + """ + initialization + Args: + sample_interval: int: sample resolution + window_size: int: the size of the sliding window + """ + self.sample_interval = sample_interval + # assuming that subseq_length is 10, e.g., we can only start from -10. 
sample_interval = -4 + # we will sample the following indices: [-9,-5,-1] + # self.first_indices = -(self.sample_interval * ((subseq_length - 1) // self.sample_interval) + 1) + + def __call__(self, data: np.ndarray) -> np.ndarray: + if self.sample_interval == 1: + return data + else: + subseq_length = len(data) + first_indices = -(self.sample_interval * ((subseq_length - 1) // self.sample_interval) + 1) + sample_indices = np.arange(first_indices, 0, step=self.sample_interval) + + return data[sample_indices] class TimeSeriesForecastingDataLoader(FeatureDataLoader): """This class is an interface to read time sequence data @@ -249,11 +279,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: # this value corresponds to budget type resolution sample_interval = X.get('sample_interval', 1) padding_value = X.get('required_padding_value', 0.0) - self.padding_collector = PadSequenceCollector(self.window_size, sample_interval, padding_value) if sample_interval > 1: # for lower resolution, window_size should be smaller self.window_size = (self.window_size - 1) // sample_interval + 1 + + self.padding_collector = PadSequenceCollector(self.window_size, padding_value) + # this value corresponds to budget type num_sequence fraction_seq = X.get('fraction_seq', 1.0) # this value corresponds to budget type num_sample_per_seq @@ -362,6 +394,8 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform # if 'test' in mode or not X['dataset_properties']['is_small_preprocess']: # candidate_transformations.extend(X['preprocess_transforms']) + if self.sample_interval > 1: + candidate_transformations.append(SequenceBuilder(sample_interval=self.sample_interval)) candidate_transformations.append(ExpandTransformTimeSeries()) if "test" in mode or not X['dataset_properties']['is_small_preprocess']: From adcf8a06c71d21a10b37451804ff1d3135c881d2 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 22 Dec 2021 17:40:39 +0100 Subject: [PATCH 106/347] move scaler to network --- .../setup/network/forecasting_network.py | 134 +++++++++++++----- .../forecasting_decoder/NBEATSDecoder.py | 4 +- .../InceptionTimeEncoder.py | 10 ++ .../forecasting_encoder/TCNEncoder.py | 21 ++- .../forecasting_network_head/NBEATS_head.py | 4 +- .../forecasting_base_trainer.py | 95 ++++--------- .../pipeline/time_series_forecasting.py | 6 +- 7 files changed, 169 insertions(+), 105 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index deae048b1..4a1af8b47 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -9,6 +9,12 @@ import torch from torch import nn +from torch.distributions import ( + AffineTransform, + TransformedDistribution, +) + + from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. 
\ @@ -21,6 +27,14 @@ from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent +class TransformedDistribution_(TransformedDistribution): + def mean(self): + mean = self.base_dist.mean + for transform in self.transforms: + mean = transform(mean) + return mean + + class ForecastingNet(nn.Module): future_target_required = False @@ -29,7 +43,9 @@ def __init__(self, network_encoder: EncoderNetwork, network_decoder: nn.Module, network_head: Optional[nn.Module], + window_size: int, n_prediction_steps: int, + target_scaler: BaseTargetScaler, encoder_properties: Dict, decoder_properties: Dict, output_type: str = 'regression', @@ -65,7 +81,11 @@ def __init__(self, self.decoder = network_decoder self.head = network_head + self.target_scaler = target_scaler + self.n_prediction_steps = n_prediction_steps + self.window_size = window_size + self.output_type = output_type self.forecast_strategy = forecast_strategy self.num_samples = num_samples @@ -77,6 +97,36 @@ def __init__(self, 'for decoder!') self.encoder_has_hidden_states = encoder_properties['has_hidden_states'] self.decoder_has_hidden_states = decoder_properties['has_hidden_states'] + self._device = torch.device('cpu') + + @property + def device(self): + return self._device + + @device.setter + def device(self, device: torch.device): + self.to(device) + self._device = device + + def rescale_output(self, + outputs: Union[torch.distributions.Distribution, torch.Tensor], + loc: Optional[torch.Tensor], + scale: Optional[torch.Tensor], + device: torch.device = torch.device('cpu')): + if loc is not None or scale is not None: + if isinstance(outputs, torch.distributions.Distribution): + transform = AffineTransform(loc=0.0 if loc is None else loc.to(device), + scale=1.0 if scale is None else scale.to(device), + ) + outputs = TransformedDistribution_(outputs, [transform]) + else: + if loc is None: + outputs = outputs * scale.to(device) + elif scale is None: + outputs = outputs + loc.to(device) + else: + outputs = outputs * scale.to(device) + loc.to(device) + return outputs def forward(self, targets_past: torch.Tensor, @@ -85,10 +135,12 @@ def forward(self, features_future: Optional[torch.Tensor] = None, features_static: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): + targets_past, _, loc, scale = self.target_scaler(targets_past) if features_past is not None: x_past = torch.cat([targets_past, features_past], dim=1) else: x_past = targets_past + x_past = x_past.to(device=self.device) x_past = self.embedding(x_past) if self.encoder_has_hidden_states: x_past, _ = self.encoder(x_past) @@ -96,7 +148,7 @@ def forward(self, x_past = self.encoder(x_past) x_past = self.decoder(x_past) output = self.head(x_past) - return output + return self.rescale_output(output, loc, scale, self.device) def pred_from_net_output(self, net_output): if self.output_type == 'regression': @@ -150,8 +202,10 @@ def forward(self, features_future: Optional[torch.Tensor] = None, features_static: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): + past_target, targets_future, loc, scale = self.target_scaler(targets_past, targets_future) x_past = targets_past if features_past is None else torch.cat([targets_past, features_past], dim=-1) + x_past = x_past.to(self.device) x_past = self.embedding(x_past) if self.training: @@ -160,32 +214,37 @@ def forward(self, x_future = targets_future if features_future is None else torch.cat([targets_future, features_future], dim=-1) + x_future = 
x_future.to(self.device) _, hidden_states = self.encoder(x_past) x_future, _ = self.decoder(x_future, hidden_states) net_output = self.head(x_future) - return net_output + return self.rescale_output(net_output, loc, scale, self.device) else: all_predictions = [] - predicted_target = targets_past[:, [-1]] + predicted_target = targets_past[:, [-1]].to(self.device) _, hidden_states = self.encoder(x_past) + if features_future is not None: + features_future = features_future.to(self.devicee) + for idx_pred in range(self.n_prediction_steps): x_future = predicted_target if features_future is None else torch.cat( [predicted_target, features_future[:, [idx_pred], :]], dim=-1) + x_future = x_future.to(self.device) x_future, hidden_states = self.decoder(x_future, hx=hidden_states) net_output = self.head(x_future[:, -1:, ]) - predicted_target = self.pred_from_net_output(net_output).to(targets_past.device) + predicted_target = self.pred_from_net_output(net_output) all_predictions.append(net_output) if self.output_type != 'distribution': all_predictions = torch.cat(all_predictions, dim=1) - return all_predictions + return self.rescale_output(all_predictions, loc, scale, self.device) def predict(self, targets_past: torch.Tensor, @@ -220,14 +279,17 @@ def forward(self, features_future: Optional[torch.Tensor] = None, features_static: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): + past_target, targets_future, loc, scale = self.target_scaler(targets_past, targets_future) x_past = targets_past if features_past is None else torch.cat([targets_past, features_past], dim=-1) + x_past = x_past.to(self.device) # TODO consider static features x_past = self.embedding(x_past) if self.training: x_future = targets_future if features_future is None else torch.cat([targets_future, features_future], dim=-1) + x_future = x_future.to(self.device) x_future = self.embedding(x_future) x_input = torch.cat([x_past, x_future[:, :-1]], dim=1) @@ -238,7 +300,7 @@ def forward(self, x_input = self.encoder(x_input, output_seq=True) net_output = self.head(self.decoder(x_input)) - return net_output + return self.rescale_output(net_output, loc, scale, self.device) else: all_predictions = [] batch_size = targets_past.shape[0] @@ -257,7 +319,8 @@ def forward(self, else: # For other models, the full past targets are passed to the network. 
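# Illustrative aside (a sketch, not part of the original patch): the sampling branch
# below draws `num_samples` Monte-Carlo trajectories per sequence in one batched pass
# by repeating every sequence along the batch dimension and regrouping the samples at
# the end. A minimal, self-contained illustration of that bookkeeping, assuming
# batch_size=2 and num_samples=3:
#
#     import torch
#     x = torch.tensor([[1.], [2.]])                  # (batch, features)
#     rep = x.repeat_interleave(repeats=3, dim=0)     # -> [[1.], [1.], [1.], [2.], [2.], [2.]]
#     samples = rep + torch.randn_like(rep)           # stand-in for sampling from the head's distribution
#     per_seq = samples.unflatten(0, (2, 3))          # (batch, num_samples, features)
#     forecast = per_seq.mean(dim=1)                  # 'mean' aggregation, as used by this class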
encoder_output = self.encoder(x_past) - repeated_past_target = targets_past.repeat_interleave(repeats=self.num_samples, dim=0).squeeze(1) + repeated_past_target = targets_past.repeat_interleave(repeats=self.num_samples, + dim=0).squeeze(1).to(self.device) repeated_static_feat = features_static.repeat_interleave( repeats=self.num_samples, dim=0 @@ -282,6 +345,7 @@ def forward(self, repeated_time_feat[:, k:k + 1]], dim=-1) if self.encoder_has_hidden_states: + x_next = x_next.to(self.device) encoder_output, repeated_state = self.encoder(x_next, hx=repeated_state) else: x_next = torch.cat([repeated_past_target, x_next], dim=1) @@ -297,22 +361,24 @@ def forward(self, all_predictions = torch.cat(all_predictions, dim=1).unflatten(0, (batch_size, self.num_samples)) - return all_predictions + if not self.output_type == 'distribution' and self.forecast_strategy == 'sample': + raise ValueError( + f"A DeepAR network must have output type as Distribution and forecast_strategy as sample," + f"but this network has {self.output_type} and {self.forecast_strategy}") + if self.aggregation == 'mean': + return self.rescale_output(torch.mean(all_predictions, dim=1), loc, scale, self.device) + elif self.aggregation == 'median': + return self.rescale_output(torch.median(all_predictions, dim=1)[0], loc, scale, self.device) + else: + raise ValueError(f'Unknown aggregation: {self.aggregation}') def pred_from_net_output(self, net_output: torch.Tensor): - if not self.output_type == 'distribution' and self.forecast_strategy == 'sample': - raise ValueError(f"A DeepAR network must have output type as Distribution and forecast_strategy as sample," - f"but this network has {self.output_type} and {self.forecast_strategy}") - if self.aggregation == 'mean': - return torch.mean(net_output, dim=1) - elif self.aggregation == 'median': - return torch.median(net_output, dim=1)[0] - else: - raise ValueError(f'Unknown aggregation: {self.aggregation}') + return net_output class NBEATSNet(ForecastingNet): future_target_required = False + def forward(self, targets_past: torch.Tensor, targets_future: Optional[torch.Tensor] = None, @@ -320,13 +386,25 @@ def forward(self, features_future: Optional[torch.Tensor] = None, features_static: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): - forecast = torch.zeros([self.n_prediction_steps, *targets_past[1:]]) + targets_past, _, loc, scale = self.target_scaler(targets_past) + targets_past = targets_past.to(self.device) + + batch_size = targets_past.shape[0] + output_shape = targets_past.shape[2:] + forcast_shape = [batch_size, self.n_prediction_steps, *output_shape] + + forecast = torch.zeros(forcast_shape).to(self.device).flatten(1) backcast = self.encoder(targets_past) for block in self.decoder: backcast_block, forecast_block = block(backcast) backcast = backcast - backcast_block forecast = forecast + forecast_block + backcast = backcast.reshape(targets_past.shape) + forecast = forecast.reshape(forcast_shape) + + backcast = self.rescale_output(backcast, loc, scale, self.device) + forecast = self.rescale_output(forecast, loc, scale, self.device) if self.training: return backcast, forecast else: @@ -352,7 +430,6 @@ def __init__( self.forecast_strategy = forecast_strategy self.num_samples = num_samples self.aggregation = aggregation - self.window_size = None @property def _required_fit_requirements(self): @@ -362,6 +439,7 @@ def _required_fit_requirements(self): FitRequirement("network_encoder", (torch.nn.Module,), user_defined=False, dataset_property=False), 
FitRequirement("network_decoder", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_head", (Optional[torch.nn.Module],), user_defined=False, dataset_property=False), + FitRequirement("target_scaler", (BaseTargetScaler, ), user_defined=False, dataset_property=False), FitRequirement("required_net_out_put_type", (str,), user_defined=False, dataset_property=False), FitRequirement("encoder_properties", (Dict,), user_defined=False, dataset_property=False), FitRequirement("decoder_properties", (Dict,), user_defined=False, dataset_property=False), @@ -381,15 +459,16 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: network_encoder=X['network_encoder'], network_decoder=X['network_decoder'], network_head=X['network_head'], + window_size=X['window:size'], n_prediction_steps=X['dataset_properties']['n_prediction_steps'], encoder_properties=X['encoder_properties'], decoder_properties=X['decoder_properties'], + target_scaler=X['target_scaler'], output_type=self.net_out_type, forecast_strategy=self.forecast_strategy, num_samples=self.num_samples, aggregation=self.aggregation, ) - self.window_size = X['window_size'] if X['decoder_properties']['has_hidden_states']: # decoder is RNN @@ -428,25 +507,12 @@ def predict(self, loader: torch.utils.data.DataLoader, for i, (X_batch, Y_batch) in enumerate(loader): # Predict on batch - X = X_batch['past_target'][:, -self.window_size:] + X = X_batch['past_target'] X = X.float() - if target_scaler is None: - loc = 0. - scale = 1. - else: - X, _, loc, scale = target_scaler(X) - - X = X.to(self.device) - with torch.no_grad(): Y_batch_pred = self.network.predict(X) - if loc is None: - loc = 0. - if scale is None: - scale = 1. - Y_batch_pred = Y_batch_pred.cpu() * scale + loc Y_batch_preds.append(Y_batch_pred.cpu()) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/NBEATSDecoder.py index 8a5445673..4cf6dd528 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/NBEATSDecoder.py @@ -53,8 +53,10 @@ def __init__(self, def build_backbone(self): layers: List[nn.Module] = list() + n_in_features = self.n_in_features for _ in range(self.num_layers): - self._add_layer(layers, self.n_in_features) + self._add_layer(layers, n_in_features) + n_in_features = self.width return layers def _add_layer(self, layers: List[nn.Module], in_features: int) -> None: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py index 6cf1a0310..3cd31d76d 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py @@ -95,11 +95,14 @@ def __init__(self, bottleneck_size = self.config["bottleneck_size"] kernel_size = self.config["kernel_size"] n_res_inputs = in_features + + receptive_field = 1 for i in range(self.config["num_blocks"]): block = _InceptionBlock(n_inputs=n_inputs, n_filters=n_filters, bottleneck=bottleneck_size, kernel_size=kernel_size) + receptive_field += max(kernel_size, 3) - 1 self.__setattr__(f"inception_block_{i}", block) # add a residual block after 
every 3 inception blocks @@ -109,6 +112,7 @@ def __init__(self, n_outputs=n_res_outputs)) n_res_inputs = n_res_outputs n_inputs = block.get_n_outputs() + self.receptive_field = receptive_field def forward(self, x: torch.Tensor, output_seq=False) -> torch.Tensor: # swap sequence and feature dimensions for use with convolutional nets @@ -127,6 +131,7 @@ def forward(self, x: torch.Tensor, output_seq=False) -> torch.Tensor: class InceptionTimeEncoder(BaseForecastingEncoder): + _receptive_field = 1 """ InceptionTime backbone for time series data (see https://arxiv.org/pdf/1909.04939.pdf). """ @@ -134,6 +139,7 @@ class InceptionTimeEncoder(BaseForecastingEncoder): def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: encoder = _InceptionTime(in_features=input_shape[-1], config=self.config) + self._receptive_field = encoder.receptive_field return encoder @staticmethod @@ -146,6 +152,10 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT 'handles_time_series': True, } + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({'window_size': self._receptive_field}) + return super().transform(X) + @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py index 50f28b111..ee56bb5df 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py @@ -8,6 +8,8 @@ UniformIntegerHyperparameter ) +import numpy as np + import torch from torch import nn from torch.nn.utils import weight_norm @@ -76,17 +78,28 @@ def __init__(self, num_inputs: int, num_channels: List[int], kernel_size: int = super(_TemporalConvNet, self).__init__() layers: List[Any] = [] num_levels = len(num_channels) + receptive_field = 1 + + # stride_values = [] + for i in range(num_levels): dilation_size = 2 ** i in_channels = num_inputs if i == 0 else num_channels[i - 1] out_channels = num_channels[i] + stride = 1 + # stride_values.extend([stride, stride]) layers += [_TemporalBlock(in_channels, out_channels, kernel_size, - stride=1, + stride=stride, dilation=dilation_size, padding=(kernel_size - 1) * dilation_size, dropout=dropout)] + # receptive_field_block = 1 + (kernel_size - 1) * dilation_size * \ + # (int(np.prod(stride_values[:-2])) * (1 + stride_values[-2])) + receptive_field_block = 1 + 2 * (kernel_size - 1) * dilation_size # stride = 1, we ignore stide computation + receptive_field += receptive_field_block + self.receptive_field = receptive_field self.network = nn.Sequential(*layers) def forward(self, x: torch.Tensor, output_seq=False) -> torch.Tensor: @@ -101,6 +114,7 @@ def forward(self, x: torch.Tensor, output_seq=False) -> torch.Tensor: class TCNEncoder(BaseForecastingEncoder): + _receptive_field = 1 """ Temporal Convolutional Network backbone for time series data (see https://arxiv.org/pdf/1803.01271.pdf). 
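    Note (illustrative, not part of the original docstring): with stride 1 and
    dilations d_i = 2**i, each temporal block of two causal convolutions with
    kernel size k adds roughly 2 * (k - 1) * d_i steps of context, so num_levels = L
    blocks cover on the order of 1 + 2 * (k - 1) * (2**L - 1) past steps. The built
    network exposes this as `receptive_field`, which this component copies to
    `_receptive_field` and publishes as `window_size` via `transform` for downstream
    components such as the decoder and the forecasting network.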
""" @@ -113,6 +127,7 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: kernel_size=self.config["kernel_size"], dropout=self.config["dropout"] if self.config["use_dropout"] else 0.0 ) + self._receptive_field = encoder.receptive_field return encoder @staticmethod @@ -126,6 +141,10 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT 'handles_time_series': True, } + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({'window_size': self._receptive_field}) + return super().transform(X) + @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py index 4825d12b9..7182e2c7f 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py @@ -55,11 +55,12 @@ def get_trend_heads(block_width: int, thetas_dim: int, forecast_length: int, bac TransposeLinear(coefficients_backcast * norm)) forecast_head = nn.Sequential(base_layer, TransposeLinear(coefficients_forecast * norm)) + return backcast_head, forecast_head def get_seasonality_heads(block_width: int, thetas_dim: int, forecast_length: int, backcast_length: int): - base_layer = nn.Linear(block_width, thetas_dim, bias=False) + base_layer = nn.Linear(block_width, forecast_length, bias=False) backcast_linspace, forecast_linspace = linspace(backcast_length, forecast_length, centered=False) @@ -85,7 +86,6 @@ def get_frequencies(n): TransposeLinear(torch.cat([s1_b, s2_b]))) forecast_head = nn.Sequential(base_layer, TransposeLinear(torch.cat([s1_f, s2_f]))) - return backcast_head, forecast_head diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index d3d3dbeee..0acd5d701 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -12,11 +12,7 @@ from torch.optim.lr_scheduler import _LRScheduler from torch.utils.tensorboard.writer import SummaryWriter -from torch.distributions import ( - AffineTransform, - TransformedDistribution, -) - +from autoPyTorch.constants import REGRESSION_TASKS, FORECASTING_TASKS from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. \ base_target_scaler import BaseTargetScaler from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. 
\ @@ -69,6 +65,7 @@ def prepare( self.target_scaler = target_scaler # typing: BaseTargetScaler self.backcast_loss_ratio = backcast_loss_ratio self.window_size = window_size + self.model.device = self.device def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, writer: Optional[SummaryWriter], @@ -119,26 +116,15 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, else: return loss_sum / N, {} - def rescale_output(self, - outputs: Union[torch.distributions.Distribution, torch.Tensor], - loc: Optional[torch.Tensor], - scale: Optional[torch.Tensor], - device: torch.device = torch.device('cpu')): - # https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/torch/modules/distribution_output.py - if loc is not None or scale is not None: - if isinstance(outputs, torch.distributions.Distribution): - transform = AffineTransform(loc=0.0 if loc is None else loc.to(device), - scale=1.0 if scale is None else scale.to(device), - ) - outputs = TransformedDistribution(outputs, [transform]) - else: - if loc is None: - outputs = outputs * scale.to(device) - elif scale is None: - outputs = outputs + loc.to(device) - else: - outputs = outputs * scale.to(device) + loc.to(device) - return outputs + def cast_targets(self, targets: torch.Tensor) -> torch.Tensor: + if self.task_type in REGRESSION_TASKS or FORECASTING_TASKS: + targets = targets.float() + # make sure that targets will have same shape as outputs (really important for mse loss for example) + if targets.ndim == 1: + targets = targets.unsqueeze(1) + else: + targets = targets.long() + return targets def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor) \ -> Tuple[float, torch.Tensor]: @@ -153,16 +139,17 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor torch.Tensor: The predictions of the network float: the loss incurred in the prediction """ - past_target = data['past_target'][:, -self.window_size:] + past_target = data['past_target'][:, -self.window_size:].float() + + """ # prepare past_target = past_target.float() if self.model.future_target_required or isinstance(self.model, NBEATSNet): past_target, scaled_future_targets, loc, scale = self.target_scaler(past_target, future_targets) else: past_target, _, loc, scale = self.target_scaler(past_target) - - past_target = past_target.to(self.device) + """ future_targets = self.cast_targets(future_targets) @@ -171,28 +158,26 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor if isinstance(self.model, NBEATSNet): past_target, criterion_kwargs_past = self.data_preparation(past_target, - scaled_future_targets.to(self.device)) - past_target, criterion_kwargs_future = self.data_preparation(past_target, past_target) + past_target.to(self.device)) + past_target, criterion_kwargs_future = self.data_preparation(past_target, future_targets.to(self.device)) backcast, forecast = self.model(past_target) loss_func_backcast = self.criterion_preparation(**criterion_kwargs_past) + loss_func_forecast = self.criterion_preparation(**criterion_kwargs_future) + loss_backcast = loss_func_backcast(self.criterion, backcast) - loss_forecast = loss_func_backcast(self.criterion, forecast) + loss_forecast = loss_func_forecast(self.criterion, forecast) loss = loss_forecast + loss_backcast * self.backcast_loss_ratio - else: + outputs = forecast + else: if isinstance(self.model, ForecastingDeepARNet) and self.model.encoder_bijective_seq_output: - future_targets = torch.cat([past_target[:, 
1:, ], future_targets], dim=1) - past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) - else: - past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) - - if self.model.future_target_required: - outputs = self.model(past_target, scaled_future_targets.float().to(self.device)) + all_targets = torch.cat([past_target[:, 1:, ], future_targets], dim=1) + past_target, criterion_kwargs = self.data_preparation(past_target, all_targets.to(self.device)) else: - outputs = self.model(past_target) + past_target, criterion_kwargs = self.data_preparation(past_target, future_targets.to(self.device)) - outputs = self.rescale_output(outputs, loc=loc, scale=scale, device=self.device) + outputs = self.model(past_target, future_targets) loss_func = self.criterion_preparation(**criterion_kwargs) loss = loss_func(self.criterion, outputs) @@ -237,11 +222,7 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, # prepare past_target = past_target.float() - past_target, _, loc, scale = self.target_scaler(past_target) - - past_target = past_target.to(self.device) - - future_targets = self.cast_targets(future_targets) + future_targets = self.cast_targets(future_targets).to(self.device) past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) @@ -250,34 +231,20 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, if isinstance(self.model, ForecastingDeepARNet): # DeepAR only generate sampled points, we replace log_prob loss with MSELoss outputs = self.model.pred_from_net_output(outputs) - outputs_rescaled = self.rescale_output(outputs, loc=loc, scale=scale, device=self.device) - loss = F.mse_loss(outputs_rescaled, future_targets) + loss = F.mse_loss(outputs, future_targets) outputs = outputs.detach().cpu() else: if isinstance(outputs, list): - outputs_rescaled = [self.rescale_output(output, - loc=loc, - scale=scale, - device=self.device) for output in outputs] - - loss = [self.criterion(output_rescaled, future_targets) for output_rescaled in outputs_rescaled] + loss = [self.criterion(output, future_targets) for output in outputs] loss = torch.mean(torch.Tensor(loss)) else: - outputs_rescaled = self.rescale_output(outputs, loc=loc, scale=scale, device=self.device) - loss = self.criterion(outputs_rescaled, future_targets) + loss = self.criterion(outputs, future_targets) outputs = self.model.pred_from_net_output(outputs).detach().cpu() loss_sum += loss.item() * batch_size N += batch_size - if loc is None and scale is None: - outputs_data.append(outputs) - else: - if loc is None: - loc = 0. - if scale is None: - scale = 1. 
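# Illustrative aside (a sketch, not part of the original patch): the manual de-scaling
# removed here now lives inside the network (ForecastingNet.rescale_output), which wraps
# distribution outputs in an AffineTransform instead of rescaling samples by hand.
# A minimal, self-contained illustration of that idea:
#
#     import torch
#     from torch.distributions import Normal, AffineTransform, TransformedDistribution
#     base = Normal(torch.zeros(3), torch.ones(3))        # head output in the scaled target space
#     loc, scale = torch.tensor(5.0), torch.tensor(2.0)   # statistics produced by the target scaler
#     rescaled = TransformedDistribution(base, [AffineTransform(loc=loc, scale=scale)])
#     sample = rescaled.sample()                          # samples are already on the original scale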
- outputs_data.append(outputs * scale + loc) + outputs_data.append(outputs) targets_data.append(future_targets.detach().cpu()) if writer: diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 2165bd25c..889fc7c81 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -288,7 +288,7 @@ def _get_hyperparameter_search_space(self, ForbiddenEqualsClause(network_encoder_hp, 'NBEATSEncoder'), forbidden_backcast_false) ) - if 'NBEASTDecoder' in network_decoder_hp.choices: + if 'NBEATSDecoder' in network_decoder_hp.choices: forbidden_NBEATS.append(ForbiddenAndConjunction( ForbiddenEqualsClause(network_decoder_hp, 'NBEATSDecoder'), forbidden_encoder_NBEATS) @@ -355,6 +355,8 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L # ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), ("time_series_transformer", TimeSeriesTransformer(random_state=self.random_state)), ("preprocessing", EarlyPreprocessing(random_state=self.random_state)), + ("target_scaler", TargetScalerChoice(default_dataset_properties, + random_state=self.random_state)), ("data_loader", TimeSeriesForecastingDataLoader(random_state=self.random_state)), ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties, random_state=self.random_state)), @@ -370,8 +372,6 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L random_state=self.random_state)), ("lr_scheduler", SchedulerChoice(default_dataset_properties, random_state=self.random_state)), - ("target_scaler", TargetScalerChoice(default_dataset_properties, - random_state=self.random_state)), ("trainer", ForecastingTrainerChoice(default_dataset_properties, random_state=self.random_state)), ]) return steps From c3da078f8a13f99d83f0d8d8d0ec0de1b1b56057 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 22 Dec 2021 21:23:50 +0100 Subject: [PATCH 107/347] lag sequence --- autoPyTorch/datasets/time_series_dataset.py | 3 +- .../setup/network/forecasting_network.py | 191 ++++++++++++++---- .../forecasting_decoder/RNNDecoder.py | 29 ++- .../base_forecasting_decoder.py | 5 + .../forecasting_encoder/RNNEncoder.py | 33 ++- .../time_series_forecasting_data_loader.py | 2 +- .../forecasting_base_trainer.py | 38 ++-- .../pipeline/time_series_forecasting.py | 6 +- .../utils/forecasting_time_features.py | 33 +++ 9 files changed, 272 insertions(+), 68 deletions(-) create mode 100644 autoPyTorch/utils/forecasting_time_features.py diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 7751b479f..76ae91981 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -564,7 +564,8 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> dataset_properties = super().get_dataset_properties(dataset_requirements=dataset_requirements) dataset_properties.update({'n_prediction_steps': self.n_prediction_steps, 'upper_window_size': self.upper_window_size, - 'sp': self.seasonality, # For metric computation + 'sp': self.seasonality, # For metric computation, + 'freq': self.freq, 'sequence_lengths_train': self.sequence_lengths_train}) return dataset_properties diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 4a1af8b47..f1fa667bc 100644 --- 
a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -14,7 +14,7 @@ TransformedDistribution, ) - +from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. \ @@ -25,6 +25,8 @@ from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent +from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import \ + pad_sequence_from_start class TransformedDistribution_(TransformedDistribution): @@ -35,6 +37,55 @@ def mean(self): return mean +def get_lagged_subsequences( + sequence: torch.Tensor, + subsequences_length: int, + lags_seq: List[int] +) -> torch.Tensor: + """ + Returns lagged subsequences of a given sequence. This is similar to gluonTS's implementation the only difference + is that we pad the sequence that is not long enough + + Parameters + ---------- + sequence : Tensor + the sequence from which lagged subsequences should be extracted. + Shape: (N, T, C). + subsequences_length : int + length of the subsequences to be extracted. + + Returns + -------- + lagged : Tensor + a tensor of shape (N, S, C, I), where S = subsequences_length and + I = len(indices), containing lagged subsequences. Specifically, + lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :]. 
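    Example
    --------
    (illustrative sketch, not part of the original patch)
    For a single sequence [[1], [2], [3], [4], [5]] (shape (1, 5, 1)),
    subsequences_length=2 and lags_seq=[0, 2], the lag-0 slice is [[4], [5]]
    and the lag-2 slice is [[2], [3]], so the returned tensor has shape
    (1, 2, 2) and equals [[[4, 2], [5, 3]]].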
+ """ + batch_size = sequence.shape[0] + sequence_length = sequence.shape[1] + + lagged_values = [] + for lag_index in lags_seq: + begin_index = -lag_index - subsequences_length + end_index = -lag_index if lag_index > 0 else None + + if end_index is not None and end_index < -sequence_length: + lagged_values.append(torch.zeros([batch_size, subsequences_length, *sequence.shape[2:]])) + continue + if begin_index < -sequence_length: + if end_index is not None: + pad_shape = [batch_size, subsequences_length - sequence_length - end_index, *sequence.shape[2:]] + lagged_values.append(torch.cat([torch.zeros(pad_shape), sequence[:, :end_index, ...]], dim=1)) + else: + pad_shape = [batch_size, subsequences_length - sequence_length, *sequence.shape[2:]] + lagged_values.append(torch.cat([torch.zeros(pad_shape), sequence], dim=1)) + continue + else: + lagged_values.append(sequence[:, begin_index:end_index, ...]) + lagged_seq = torch.stack(lagged_values, -1).reshape(batch_size, subsequences_length, -1) + return lagged_seq + + class ForecastingNet(nn.Module): future_target_required = False @@ -44,8 +95,8 @@ def __init__(self, network_decoder: nn.Module, network_head: Optional[nn.Module], window_size: int, - n_prediction_steps: int, target_scaler: BaseTargetScaler, + dataset_properties: Dict, encoder_properties: Dict, decoder_properties: Dict, output_type: str = 'regression', @@ -64,7 +115,7 @@ def __init__(self, network_encoder (EncoderNetwork): Encoder network, could be selected to return a sequence or a network_decoder (nn.Module): network decoder network_head (nn.Module): network head, maps the output of decoder to the final output - n_prediction_steps (int): how many steps the network want to predict + dataset_properties (Dict): dataset properties encoder_properties (Dict): encoder properties decoder_properties: (Dict): decoder properties output_type (str): the form that the network outputs. 
It could be regression, distribution and @@ -83,7 +134,7 @@ def __init__(self, self.target_scaler = target_scaler - self.n_prediction_steps = n_prediction_steps + self.n_prediction_steps = dataset_properties['n_prediction_steps'] self.window_size = window_size self.output_type = output_type @@ -99,6 +150,9 @@ def __init__(self, self.decoder_has_hidden_states = decoder_properties['has_hidden_states'] self._device = torch.device('cpu') + self.encoder_lagged_input = encoder_properties['lagged_input'] + self.decoder_lagged_input = decoder_properties['lagged_input'] + @property def device(self): return self._device @@ -128,6 +182,20 @@ def rescale_output(self, outputs = outputs * scale.to(device) + loc.to(device) return outputs + def scale_value(self, + outputs: Union[torch.distributions.Distribution, torch.Tensor], + loc: Optional[torch.Tensor], + scale: Optional[torch.Tensor], + device: torch.device = torch.device('cpu')): + if loc is not None or scale is not None: + if loc is None: + outputs = outputs / scale.to(device) + elif scale is None: + outputs = outputs - loc.to(device) + else: + outputs = (outputs - loc.to(device)) / scale.to(device) + return outputs + def forward(self, targets_past: torch.Tensor, targets_future: Optional[torch.Tensor] = None, @@ -135,11 +203,20 @@ def forward(self, features_future: Optional[torch.Tensor] = None, features_static: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): - targets_past, _, loc, scale = self.target_scaler(targets_past) - if features_past is not None: - x_past = torch.cat([targets_past, features_past], dim=1) + if self.encoder_lagged_input: + targets_past[:, -self.window_size:], _, loc, scale = self.target_scaler(targets_past[:, -self.window_size:]) + targets_past[:, :-self.window_size] = self.scale_value(targets_past[:, :-self.window_size], loc, scale) + x_past = get_lagged_subsequences(targets_past, self.window_size, self.encoder.lagged_value) else: + if self.window_size < targets_past.shape[1]: + targets_past = targets_past[:, -self.window_size] + targets_past = targets_past[:, -self.window_size:] + targets_past, _, loc, scale = self.target_scaler(targets_past) x_past = targets_past + + if features_past is not None: + x_past = torch.cat([x_past, features_past], dim=1) + x_past = x_past.to(device=self.device) x_past = self.embedding(x_past) if self.encoder_has_hidden_states: @@ -202,15 +279,30 @@ def forward(self, features_future: Optional[torch.Tensor] = None, features_static: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): - past_target, targets_future, loc, scale = self.target_scaler(targets_past, targets_future) - x_past = targets_past if features_past is None else torch.cat([targets_past, features_past], dim=-1) + if self.encoder_lagged_input: + targets_past[:, -self.window_size:], _, loc, scale = self.target_scaler(targets_past[:, -self.window_size:]) + targets_past[:, :-self.window_size] = self.scale_value(targets_past[:, :-self.window_size], loc, scale) + x_past = get_lagged_subsequences(targets_past, self.window_size, self.encoder.lagged_value) + else: + if self.window_size < targets_past.shape[1]: + targets_past = targets_past[:, -self.window_size] + targets_past = targets_past[:, -self.window_size] + targets_past, _, loc, scale = self.target_scaler(targets_past) + x_past = targets_past + + x_past = x_past if features_past is None else torch.cat([x_past, features_past], dim=-1) x_past = x_past.to(self.device) x_past = self.embedding(x_past) if self.training: # 
we do one step ahead forecasting - targets_future = torch.cat([targets_past[:, [-1], :], targets_future[:, :-1, :]], dim=1) + if self.decoder_lagged_input: + targets_future = torch.cat([targets_past, targets_future[:, :-1, :]], dim=1) + targets_future = get_lagged_subsequences(targets_future, self.n_prediction_steps, + self.decoder.lagged_value) + else: + targets_future = torch.cat([targets_past[:, [-1], :], targets_future[:, :-1, :]], dim=1) x_future = targets_future if features_future is None else torch.cat([targets_future, features_future], dim=-1) @@ -223,21 +315,26 @@ def forward(self, return self.rescale_output(net_output, loc, scale, self.device) else: all_predictions = [] - predicted_target = targets_past[:, [-1]].to(self.device) + predicted_target = targets_past[:, [-1]] _, hidden_states = self.encoder(x_past) if features_future is not None: - features_future = features_future.to(self.devicee) + features_future = features_future for idx_pred in range(self.n_prediction_steps): - x_future = predicted_target if features_future is None else torch.cat( - [predicted_target, features_future[:, [idx_pred], :]], + if self.decoder_lagged_input: + x_future = torch.cat([targets_past, predicted_target.cpu()], dim=1) + x_future = get_lagged_subsequences(x_future, 1, self.decoder.lagged_value) + else: + x_future = predicted_target[:, [-1]] + x_future = x_future if features_future is None else torch.cat( + [x_future, features_future[:, [idx_pred], :]], dim=-1) x_future = x_future.to(self.device) x_future, hidden_states = self.decoder(x_future, hx=hidden_states) net_output = self.head(x_future[:, -1:, ]) - predicted_target = self.pred_from_net_output(net_output) + predicted_target = torch.cat([predicted_target, self.pred_from_net_output(net_output).cpu()], dim=1) all_predictions.append(net_output) @@ -279,14 +376,29 @@ def forward(self, features_future: Optional[torch.Tensor] = None, features_static: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): - past_target, targets_future, loc, scale = self.target_scaler(targets_past, targets_future) - x_past = targets_past if features_past is None else torch.cat([targets_past, features_past], dim=-1) + if self.encoder_lagged_input: + targets_past[:, -self.window_size:], _, loc, scale = self.target_scaler(targets_past[:, -self.window_size:]) + targets_past[:, :-self.window_size] = self.scale_value(targets_past[:, :-self.window_size], loc, scale) + x_past = get_lagged_subsequences(targets_past, self.window_size, self.encoder.lagged_value) + else: + if self.window_size < targets_past.shape[1]: + targets_past = targets_past[:, -self.window_size] + + targets_past, _, loc, scale = self.target_scaler(targets_past) + x_past = targets_past + + x_past = x_past if features_past is None else torch.cat([x_past, features_past], dim=-1) x_past = x_past.to(self.device) # TODO consider static features x_past = self.embedding(x_past) if self.training: + if self.encoder_lagged_input: + targets_future = torch.cat([targets_past, targets_future], dim=1) + targets_future = get_lagged_subsequences(targets_future, self.n_prediction_steps, + self.encoder.lagged_value) + x_future = targets_future if features_future is None else torch.cat([targets_future, features_future], dim=-1) x_future = x_future.to(self.device) @@ -302,7 +414,7 @@ def forward(self, net_output = self.head(self.decoder(x_input)) return self.rescale_output(net_output, loc, scale, self.device) else: - all_predictions = [] + all_samples = [] batch_size = targets_past.shape[0] if 
self.encoder_has_hidden_states: @@ -319,8 +431,8 @@ def forward(self, else: # For other models, the full past targets are passed to the network. encoder_output = self.encoder(x_past) - repeated_past_target = targets_past.repeat_interleave(repeats=self.num_samples, - dim=0).squeeze(1).to(self.device) + repeated_past_target = targets_past.repeat_interleave(repeats=self.num_samples, + dim=0).squeeze(1) repeated_static_feat = features_static.repeat_interleave( repeats=self.num_samples, dim=0 @@ -336,19 +448,25 @@ def forward(self, next_sample = next_sample.transpose(0, 1).reshape( (next_sample.shape[0] * next_sample.shape[1], 1, -1) - ) + ).cpu() - all_predictions.append(next_sample) + all_samples.append(next_sample) for k in range(1, self.n_prediction_steps): - x_next = next_sample if repeated_time_feat is None else torch.cat([next_sample, - repeated_time_feat[:, k:k + 1]], - dim=-1) + if self.encoder_lagged_input: + x_next = torch.cat([repeated_past_target, *all_samples], dim=1) + x_next = get_lagged_subsequences(x_next, 1, self.encoder.lagged_value) + else: + x_next = next_sample + + x_next = x_next if repeated_time_feat is None else torch.cat([x_next, + repeated_time_feat[:, k:k + 1]], + dim=-1) if self.encoder_has_hidden_states: x_next = x_next.to(self.device) encoder_output, repeated_state = self.encoder(x_next, hx=repeated_state) else: - x_next = torch.cat([repeated_past_target, x_next], dim=1) + x_next = torch.cat([repeated_past_target, *all_samples], dim=1).to(self.device) encoder_output = self.encoder(x_next) # for training, the encoder output a sequence. Thus for prediction, the network should have the same # format output @@ -356,19 +474,19 @@ def forward(self, net_output = self.head(self.decoder(encoder_output)) - next_sample = net_output.sample() - all_predictions.append(next_sample) + next_sample = net_output.sample().cpu() + all_samples.append(next_sample) - all_predictions = torch.cat(all_predictions, dim=1).unflatten(0, (batch_size, self.num_samples)) + all_predictions = torch.cat(all_samples, dim=1).unflatten(0, (batch_size, self.num_samples)) if not self.output_type == 'distribution' and self.forecast_strategy == 'sample': raise ValueError( f"A DeepAR network must have output type as Distribution and forecast_strategy as sample," f"but this network has {self.output_type} and {self.forecast_strategy}") if self.aggregation == 'mean': - return self.rescale_output(torch.mean(all_predictions, dim=1), loc, scale, self.device) + return self.rescale_output(torch.mean(all_predictions, dim=1), loc, scale) elif self.aggregation == 'median': - return self.rescale_output(torch.median(all_predictions, dim=1)[0], loc, scale, self.device) + return self.rescale_output(torch.median(all_predictions, dim=1)[0], loc, scale) else: raise ValueError(f'Unknown aggregation: {self.aggregation}') @@ -386,6 +504,9 @@ def forward(self, features_future: Optional[torch.Tensor] = None, features_static: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): + if self.window_size < targets_past.shape[1]: + targets_past = targets_past[:, -self.window_size] + targets_past = targets_past[:, -self.window_size:] targets_past, _, loc, scale = self.target_scaler(targets_past) targets_past = targets_past.to(self.device) @@ -434,12 +555,13 @@ def __init__( @property def _required_fit_requirements(self): return [ + FitRequirement('dataset_properties', (Dict,), user_defined=False, dataset_property=True), FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), 
FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_encoder", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_decoder", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_head", (Optional[torch.nn.Module],), user_defined=False, dataset_property=False), - FitRequirement("target_scaler", (BaseTargetScaler, ), user_defined=False, dataset_property=False), + FitRequirement("target_scaler", (BaseTargetScaler,), user_defined=False, dataset_property=False), FitRequirement("required_net_out_put_type", (str,), user_defined=False, dataset_property=False), FitRequirement("encoder_properties", (Dict,), user_defined=False, dataset_property=False), FitRequirement("decoder_properties", (Dict,), user_defined=False, dataset_property=False), @@ -459,8 +581,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: network_encoder=X['network_encoder'], network_decoder=X['network_decoder'], network_head=X['network_head'], - window_size=X['window:size'], - n_prediction_steps=X['dataset_properties']['n_prediction_steps'], + window_size=X['window_size'], + dataset_properties=X['dataset_properties'], encoder_properties=X['encoder_properties'], decoder_properties=X['decoder_properties'], target_scaler=X['target_scaler'], @@ -469,7 +591,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: num_samples=self.num_samples, aggregation=self.aggregation, ) - if X['decoder_properties']['has_hidden_states']: # decoder is RNN self.network = ForecastingSeq2SeqNet(**network_init_kwargs) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py index 8dc3fb39c..8961cbb7e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py @@ -1,5 +1,6 @@ from abc import ABC -from typing import Any, Dict, Optional, Tuple, List +from typing import Any, Dict, Optional, Tuple, List, Union +import warnings import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace @@ -12,6 +13,9 @@ import torch from torch import nn +import numpy as np +from gluonts.time_feature.lag import get_lags_for_frequency + from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_decoder.base_forecasting_decoder import \ @@ -19,6 +23,7 @@ from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter, FitRequirement +from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP class RNN_Module(nn.Module): @@ -27,13 +32,16 @@ def __init__(self, hidden_size: int, num_layers: int, cell_type: str, - config: Dict[str, Any]): + config: Dict[str, Any], + lagged_value: Optional[Union[List, np.ndarray]]=None): super().__init__() self.config = config if cell_type == 'lstm': cell = nn.LSTM else: cell = nn.GRU + self.lagged_value = lagged_value + in_features = in_features if self.lagged_value is None else len(self.lagged_value) * in_features self.lstm = 
cell(input_size=in_features, hidden_size=hidden_size, num_layers=num_layers, @@ -59,6 +67,7 @@ def __init__(self, **kwargs: Dict): # RNN is naturally auto-regressive. However, we will not consider it as a decoder for deep AR model self.auto_regressive = True self.rnn_kwargs = None + self.lagged_value = [0, 1, 2, 3, 4, 5, 6, 7] @property def _required_fit_requirements(self) -> List[FitRequirement]: @@ -69,7 +78,7 @@ def _required_fit_requirements(self) -> List[FitRequirement]: def _build_decoder(self, input_shape: Tuple[int, ...], n_prediction_heads: int, - dataset_properties: Dict) -> Tuple[List[nn.Module], int]: + dataset_properties: Dict) -> Tuple[nn.Module, int]: # RNN decoder only allows RNN encoder, these parameters need to exists. hidden_size = self.rnn_kwargs['hidden_size'] num_layers = 2 * self.rnn_kwargs['num_layers'] if self.rnn_kwargs['bidirectional'] else self.rnn_kwargs[ @@ -80,6 +89,7 @@ def _build_decoder(self, num_layers=num_layers, cell_type=cell_type, config=self.config, + lagged_value=self.lagged_value ) return decoder, hidden_size @@ -87,11 +97,24 @@ def decoder_properties(self): decoder_properties = super().decoder_properties() decoder_properties.update({'has_hidden_states': True, 'recurrent': True, + 'lagged_input': True, }) return decoder_properties def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.rnn_kwargs = X['rnn_kwargs'] + + freq = X['dataset_properties'].get('freq', None) + if 'lagged_value' in X['dataset_properties']: + self.lagged_value = X['dataset_properties']['lagged_value'] + if freq is not None: + try: + freq = FREQUENCY_MAP[freq] + self.lagged_value = [0] + get_lags_for_frequency(freq) + except Exception: + warnings.warn(f'cannot find the proper lagged value for {freq}, we use the default lagged value') + # If + pass return super().fit(X, y) @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py index 79b1a6ce6..d84208544 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -39,6 +39,7 @@ def _required_fit_requirements(self) -> List[FitRequirement]: def decoder_properties(self): decoder_properties = {'has_hidden_states': False, 'recurrent': False, + 'lagged_input': False, 'multi_blocks': False, } return decoder_properties @@ -74,6 +75,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: if fixed_input_seq_length: input_shape = (X["window_size"], input_shape[-1]) + if encoder_properties.get('lagged_input', False): + lagged_value = X['network_encoder'].lagged_value + input_shape = (X["window_size"], input_shape[-1] * len(lagged_value)) + self.decoder, self.n_decoder_output_features = self.build_decoder( input_shape=get_output_shape(X['network_encoder'], input_shape=input_shape, has_hidden_states=has_hidden_states), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py index 427f5a345..c20a62372 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py @@ -1,4 +1,6 @@ -from typing import Any, 
Dict, Optional, Tuple +from typing import Any, Dict, Optional, Tuple, List, Union +import warnings +import numpy as np import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace @@ -10,25 +12,29 @@ import torch from torch import nn +from gluonts.time_feature.lag import get_lags_for_frequency + from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_encoder \ import BaseForecastingEncoder, EncoderNetwork from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter - +from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP class _RNN(EncoderNetwork): # we only consder GRU and LSTM here def __init__(self, in_features: int, - config: Dict[str, Any]): + config: Dict[str, Any], + lagged_value: Optional[Union[List, np.ndarray]]=None): super().__init__() self.config = config if config['cell_type'] == 'lstm': cell_type = nn.LSTM else: cell_type = nn.GRU - + self.lagged_value = lagged_value + in_features = in_features if self.lagged_value is None else len(self.lagged_value) * in_features self.lstm = cell_type(input_size=in_features, hidden_size=config["hidden_size"], num_layers=config["num_layers"], @@ -70,18 +76,35 @@ class RNNEncoder(BaseForecastingEncoder): def __init__(self, **kwargs: Dict): super().__init__(**kwargs) + self.lagged_value = [0, 1, 2, 3, 4, 5, 6, 7] def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: encoder = _RNN(in_features=input_shape[-1], - config=self.config) + config=self.config, + lagged_value=self.lagged_value) return encoder def encoder_properties(self): encoder_properties = super().encoder_properties() encoder_properties.update({'has_hidden_states': True, + 'lagged_input': True, }) return encoder_properties + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + freq = X['dataset_properties'].get('freq', None) + if 'lagged_value' in X['dataset_properties']: + self.lagged_value = X['dataset_properties']['lagged_value'] + if freq is not None: + try: + freq = FREQUENCY_MAP[freq] + self.lagged_value = [0] + get_lags_for_frequency(freq) + except Exception: + warnings.warn(f'cannot find the proper lagged value for {freq}, we use the default lagged value') + # If + pass + return super().fit(X, y) + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: rnn_kwargs = {'hidden_size': self.config['hidden_size'], 'num_layers': self.config['num_layers'], diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 601675308..4e04436a1 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -28,7 +28,7 @@ def pad_sequence_from_start(sequences: List[torch.Tensor], seq_minimal_length: int, - batch_first=False, + batch_first=True, padding_value=0.0) -> torch.Tensor: r""" This function is quite similar to torch.nn.utils.rnn.pad_sequence except that we pad new values from the start of diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 0acd5d701..98a74c5fd 100644 --- 
a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -139,17 +139,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor torch.Tensor: The predictions of the network float: the loss incurred in the prediction """ - past_target = data['past_target'][:, -self.window_size:].float() - - - """ - # prepare - past_target = past_target.float() - if self.model.future_target_required or isinstance(self.model, NBEATSNet): - past_target, scaled_future_targets, loc, scale = self.target_scaler(past_target, future_targets) - else: - past_target, _, loc, scale = self.target_scaler(past_target) - """ + past_target = data['past_target'].float() future_targets = self.cast_targets(future_targets) @@ -157,6 +147,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor self.optimizer.zero_grad() if isinstance(self.model, NBEATSNet): + past_target = past_target[:, -self.window_size:] past_target, criterion_kwargs_past = self.data_preparation(past_target, past_target.to(self.device)) past_target, criterion_kwargs_future = self.data_preparation(past_target, future_targets.to(self.device)) @@ -172,7 +163,10 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor outputs = forecast else: if isinstance(self.model, ForecastingDeepARNet) and self.model.encoder_bijective_seq_output: - all_targets = torch.cat([past_target[:, 1:, ], future_targets], dim=1) + if self.window_size> past_target.shape[1]: + all_targets = torch.cat([past_target[:, 1:, ], future_targets], dim=1) + else: + all_targets = torch.cat([past_target[:, 1 - self.window_size:, ], future_targets], dim=1) past_target, criterion_kwargs = self.data_preparation(past_target, all_targets.to(self.device)) else: past_target, criterion_kwargs = self.data_preparation(past_target, future_targets.to(self.device)) @@ -213,27 +207,29 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, with torch.no_grad(): for step, (data, future_targets) in enumerate(test_loader): - past_target = data['past_target'][:, -self.window_size:] + past_target = data['past_target'].float() mase_coefficients.append(data['mase_coefficient']) batch_size = past_target.shape[0] - # prepare - past_target = past_target.float() - - future_targets = self.cast_targets(future_targets).to(self.device) - - past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) + if isinstance(self.model, ForecastingDeepARNet): + future_targets = self.cast_targets(future_targets) - outputs = self.model(past_target) + past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) - if isinstance(self.model, ForecastingDeepARNet): + outputs = self.model(past_target) # DeepAR only generate sampled points, we replace log_prob loss with MSELoss outputs = self.model.pred_from_net_output(outputs) loss = F.mse_loss(outputs, future_targets) outputs = outputs.detach().cpu() else: + # prepare + future_targets = self.cast_targets(future_targets).to(self.device) + + past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) + + outputs = self.model(past_target) if isinstance(outputs, list): loss = [self.criterion(output, future_targets) for output in outputs] loss = torch.mean(torch.Tensor(loss)) diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py 
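The DeepAR training branch in the trainer hunk above concatenates the tail of the past targets with the future targets so that the sequence model receives one training target per decoding step, guarding against histories that are shorter than the window. A minimal standalone sketch of that windowing rule, with hypothetical tensor shapes and not taken verbatim from the patch:

import torch


def build_deepar_targets(past_target: torch.Tensor,
                         future_targets: torch.Tensor,
                         window_size: int) -> torch.Tensor:
    # past_target: [batch, T_past, n_targets], future_targets: [batch, T_future, n_targets]
    if window_size > past_target.shape[1]:
        # history shorter than the window: drop only the very first step
        return torch.cat([past_target[:, 1:], future_targets], dim=1)
    # otherwise keep the last (window_size - 1) past steps
    return torch.cat([past_target[:, 1 - window_size:], future_targets], dim=1)


# 4 series, 20 past steps, 5 prediction steps, univariate targets -> torch.Size([4, 14, 1])
print(build_deepar_targets(torch.rand(4, 20, 1), torch.rand(4, 5, 1), window_size=10).shape)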
index 889fc7c81..b21f07a6c 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -242,10 +242,12 @@ def _get_hyperparameter_search_space(self, forbidden_losses_all.append(forbidden_hp_dist) network_encoder_hp = cs.get_hyperparameter('network_encoder:__choice__') - if 'MLPEncoder' in network_encoder_hp.choices: + if 'MLPEncoder' or 'TCNEncoder' or 'InceptionTimeEncoder' in network_encoder_hp.choices: + forbidden = ['MLPEncoder', 'TCNEncoder', 'InceptionTimeEncoder'] + forbidden_deepAREncoder = [forbid for forbid in forbidden if forbid in network_encoder_hp.choices] for hp_ar in hp_auto_regressive: forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, True) - forbidden_hp_mlpencoder = ForbiddenEqualsClause(network_encoder_hp, 'MLPEncoder') + forbidden_hp_mlpencoder = ForbiddenInClause(network_encoder_hp, forbidden_deepAREncoder) forbidden_hp_ar_mlp = ForbiddenAndConjunction(forbidden_hp_ar, forbidden_hp_mlpencoder) forbidden_losses_all.append(forbidden_hp_ar_mlp) diff --git a/autoPyTorch/utils/forecasting_time_features.py b/autoPyTorch/utils/forecasting_time_features.py new file mode 100644 index 000000000..f576f9221 --- /dev/null +++ b/autoPyTorch/utils/forecasting_time_features.py @@ -0,0 +1,33 @@ +# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# or in the "license" file accompanying this file. This file is distributed +# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. 
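The time_series_forecasting.py hunk above ties the auto-regressive flag to the encoder choice through ConfigSpace forbidden clauses, so that DeepAR-style decoding is only reachable with recurrent encoders. A small self-contained sketch of that pattern (the hyperparameter names here are illustrative, not the pipeline's real ones); note that the membership test is evaluated per encoder name before the clause is built:

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause, ForbiddenInClause
from ConfigSpace.hyperparameters import CategoricalHyperparameter

cs = ConfigurationSpace()
encoder = CategoricalHyperparameter('encoder', ['MLPEncoder', 'TCNEncoder', 'RNNEncoder'])
auto_regressive = CategoricalHyperparameter('auto_regressive', [True, False])
cs.add_hyperparameters([encoder, auto_regressive])

# encoders that must not be combined with auto-regressive (DeepAR-style) decoding
candidates = ['MLPEncoder', 'TCNEncoder', 'InceptionTimeEncoder']
non_recurrent = [name for name in candidates if name in encoder.choices]
if non_recurrent:
    cs.add_forbidden_clause(
        ForbiddenAndConjunction(
            ForbiddenEqualsClause(auto_regressive, True),
            ForbiddenInClause(encoder, non_recurrent),
        )
    )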
+ + + +from typing import List, Optional + +import numpy as np +from pandas.tseries.frequencies import to_offset + +# Frequencies used by GluonTS framework +FREQUENCY_MAP = { + "minutely": "1min", + "10_minutes": "10min", + "half_hourly": "30min", + "hourly": "1H", + "daily": "1D", + "weekly": "1W", + "monthly": "1M", + "quarterly": "1Q", + "yearly": "1Y" +} + From 5d679735d1d7d83b74d5732d634b4b96aa95f29b Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 30 Dec 2021 19:53:13 +0100 Subject: [PATCH 108/347] merge encoder and decoder as a single pipeline --- .../setup/network/forecasting_network.py | 31 +- .../InceptionTimeEncoder.py | 18 +- .../MLPEncoder.py | 13 +- .../NBEATSEncoder.py | 21 +- .../RNNEncoder.py | 27 +- .../TCNEncoder.py | 18 +- .../forecasting_backbone/__init__.py | 377 ++++++++++++++++++ .../base_forecasting_encoder.py | 4 + .../forecasting_decoder/MLPDecoder.py | 11 +- .../forecasting_decoder/NBEATSDecoder.py | 10 +- .../forecasting_decoder/RNNDecoder.py | 14 +- .../forecasting_decoder/__init__.py | 34 +- .../base_forecasting_decoder.py | 10 +- .../forecasting_encoder/__init__.py | 51 --- .../forecasting_network_head/NBEATS_head.py | 2 +- .../forecasting_head.py | 2 + .../forecasting_base_trainer.py | 11 +- .../pipeline/time_series_forecasting.py | 47 ++- 18 files changed, 545 insertions(+), 156 deletions(-) rename autoPyTorch/pipeline/components/setup/network_backbone/{forecasting_encoder => forecasting_backbone}/InceptionTimeEncoder.py (94%) rename autoPyTorch/pipeline/components/setup/network_backbone/{forecasting_encoder => forecasting_backbone}/MLPEncoder.py (96%) rename autoPyTorch/pipeline/components/setup/network_backbone/{forecasting_encoder => forecasting_backbone}/NBEATSEncoder.py (76%) rename autoPyTorch/pipeline/components/setup/network_backbone/{forecasting_encoder => forecasting_backbone}/RNNEncoder.py (91%) rename autoPyTorch/pipeline/components/setup/network_backbone/{forecasting_encoder => forecasting_backbone}/TCNEncoder.py (96%) create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py rename autoPyTorch/pipeline/components/setup/network_backbone/{forecasting_encoder => forecasting_backbone}/base_forecasting_encoder.py (98%) rename autoPyTorch/pipeline/components/setup/network_backbone/{ => forecasting_backbone}/forecasting_decoder/MLPDecoder.py (96%) rename autoPyTorch/pipeline/components/setup/network_backbone/{ => forecasting_backbone}/forecasting_decoder/NBEATSDecoder.py (98%) rename autoPyTorch/pipeline/components/setup/network_backbone/{ => forecasting_backbone}/forecasting_decoder/RNNDecoder.py (95%) rename autoPyTorch/pipeline/components/setup/network_backbone/{ => forecasting_backbone}/forecasting_decoder/__init__.py (90%) rename autoPyTorch/pipeline/components/setup/network_backbone/{ => forecasting_backbone}/forecasting_decoder/base_forecasting_decoder.py (97%) delete mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/__init__.py diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index f1fa667bc..9668ad08e 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -14,22 +14,23 @@ TransformedDistribution, ) -from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES from 
autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. \ base_target_scaler import BaseTargetScaler -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_encoder \ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder \ import EncoderNetwork from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter +from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent -from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import \ - pad_sequence_from_start class TransformedDistribution_(TransformedDistribution): + """ + We implement the mean function such that we do not need to enquire base mean every time + """ + @property def mean(self): mean = self.base_dist.mean for transform in self.transforms: @@ -203,14 +204,14 @@ def forward(self, features_future: Optional[torch.Tensor] = None, features_static: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): + if self.encoder_lagged_input: targets_past[:, -self.window_size:], _, loc, scale = self.target_scaler(targets_past[:, -self.window_size:]) targets_past[:, :-self.window_size] = self.scale_value(targets_past[:, :-self.window_size], loc, scale) x_past = get_lagged_subsequences(targets_past, self.window_size, self.encoder.lagged_value) else: if self.window_size < targets_past.shape[1]: - targets_past = targets_past[:, -self.window_size] - targets_past = targets_past[:, -self.window_size:] + targets_past = targets_past[:, -self.window_size:] targets_past, _, loc, scale = self.target_scaler(targets_past) x_past = targets_past @@ -219,6 +220,7 @@ def forward(self, x_past = x_past.to(device=self.device) x_past = self.embedding(x_past) + if self.encoder_has_hidden_states: x_past, _ = self.encoder(x_past) else: @@ -285,8 +287,7 @@ def forward(self, x_past = get_lagged_subsequences(targets_past, self.window_size, self.encoder.lagged_value) else: if self.window_size < targets_past.shape[1]: - targets_past = targets_past[:, -self.window_size] - targets_past = targets_past[:, -self.window_size] + targets_past = targets_past[:, -self.window_size:] targets_past, _, loc, scale = self.target_scaler(targets_past) x_past = targets_past @@ -340,6 +341,8 @@ def forward(self, if self.output_type != 'distribution': all_predictions = torch.cat(all_predictions, dim=1) + else: + all_predictions = self.pred_from_net_output(all_predictions) return self.rescale_output(all_predictions, loc, scale, self.device) @@ -350,7 +353,10 @@ def predict(self, features_static: Optional[torch.Tensor] = None ): net_output = self(targets_past, features_past, features_future) - return self.pred_from_net_output(net_output) + if self.output_type != 'distribution': + return self.pred_from_net_output(net_output) + else: + return net_output class ForecastingDeepARNet(ForecastingNet): @@ -382,7 +388,7 @@ def forward(self, x_past = get_lagged_subsequences(targets_past, self.window_size, self.encoder.lagged_value) else: if self.window_size < targets_past.shape[1]: - targets_past = 
targets_past[:, -self.window_size] + targets_past = targets_past[:, -self.window_size:] targets_past, _, loc, scale = self.target_scaler(targets_past) x_past = targets_past @@ -505,8 +511,7 @@ def forward(self, features_static: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): if self.window_size < targets_past.shape[1]: - targets_past = targets_past[:, -self.window_size] - targets_past = targets_past[:, -self.window_size:] + targets_past = targets_past[:, -self.window_size:] targets_past, _, loc, scale = self.target_scaler(targets_past) targets_past = targets_past.to(self.device) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/InceptionTimeEncoder.py similarity index 94% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/InceptionTimeEncoder.py index 3cd31d76d..ae48e1a9e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/InceptionTimeEncoder.py @@ -9,8 +9,9 @@ from torch import nn from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_encoder\ - import BaseForecastingEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder import ( + BaseForecastingEncoder, EncoderNetwork +) from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter @@ -142,6 +143,13 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: self._receptive_field = encoder.receptive_field return encoder + @staticmethod + def allowed_decoders(): + """ + decoder that is compatible with the encoder + """ + return ['MLPDecoder'] + @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: return { @@ -149,7 +157,8 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT 'name': 'InceptionTimeBackbone', 'handles_tabular': False, 'handles_image': False, - 'handles_time_series': True, + # TODO consider InceptionTime for forecasting + 'handles_time_series': False, } def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @@ -166,11 +175,12 @@ def get_hyperparameter_search_space( num_filters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_filters", value_range=(4, 64), default_value=32, - log=True + log=True, ), kernel_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="kernel_size", value_range=(4, 64), default_value=32, + log=True, ), bottleneck_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="bottleneck_size", value_range=(16, 64), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py similarity index 96% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py index 5957fe585..69f909c38 100644 --- 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py @@ -2,14 +2,14 @@ import torch from torch import nn -import torch.nn.functional as F from ConfigSpace import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_encoder \ - import BaseForecastingEncoder, EncoderNetwork +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder import ( + BaseForecastingEncoder, EncoderNetwork +) from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_backbone.utils import _activations @@ -69,6 +69,13 @@ def encoder_properties(self): }) return encoder_properties + @staticmethod + def allowed_decoders(): + """ + decoder that is compatible with the encoder + """ + return ['MLPDecoder'] + @property def _required_fit_arguments(self) -> List[FitRequirement]: requirements_list = super()._required_fit_arguments diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/NBEATSEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/NBEATSEncoder.py similarity index 76% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/NBEATSEncoder.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/NBEATSEncoder.py index 61ea1654e..976296271 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/NBEATSEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/NBEATSEncoder.py @@ -3,18 +3,14 @@ from torch import nn from ConfigSpace import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter, \ - UniformFloatHyperparameter -from ConfigSpace.conditions import GreaterThanCondition, InCondition, EqualsCondition, AndConjunction -from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_encoder \ - import BaseForecastingEncoder, EncoderNetwork +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder import ( + BaseForecastingEncoder, EncoderNetwork +) from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_backbone.utils import _activations -from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.MLPEncoder import \ +from autoPyTorch.utils.common import FitRequirement +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.MLPEncoder import \ TimeSeriesMLPrecpocessor @@ -33,6 +29,13 @@ def encoder_properties(self): }) return encoder_properties + @staticmethod + def allowed_decoders(): + """ + decoder that is compatible with the encoder + """ + return 
['NBEATSDecoder'] + @property def _required_fit_arguments(self) -> List[FitRequirement]: requirements_list = super()._required_fit_arguments diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py similarity index 91% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py index c20a62372..d1ea423e7 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py @@ -14,19 +14,20 @@ from torch import nn from gluonts.time_feature.lag import get_lags_for_frequency - from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_encoder \ - import BaseForecastingEncoder, EncoderNetwork +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder import ( + BaseForecastingEncoder, EncoderNetwork +) from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP + class _RNN(EncoderNetwork): # we only consder GRU and LSTM here def __init__(self, in_features: int, config: Dict[str, Any], - lagged_value: Optional[Union[List, np.ndarray]]=None): + lagged_value: Optional[Union[List, np.ndarray]] = None): super().__init__() self.config = config if config['cell_type'] == 'lstm': @@ -84,6 +85,13 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: lagged_value=self.lagged_value) return encoder + @staticmethod + def allowed_decoders(): + """ + decoder that is compatible with the encoder + """ + return ['MLPDecoder', 'RNNDecoder'] + def encoder_properties(self): encoder_properties = super().encoder_properties() encoder_properties.update({'has_hidden_states': True, @@ -144,8 +152,16 @@ def get_hyperparameter_search_space( default_value=0.2), bidirectional: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bidirectional', value_range=(True, False), - default_value=True) + default_value=True), + decoder_type: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='decoder_type', + value_range=('MLPDecoder', 'RNNDecoder'), + default_value='MLPDecoder') ) -> ConfigurationSpace: + """ + get hyperparameter search space + + """ cs = CS.ConfigurationSpace() # TODO consider lstm layers with different hidden size @@ -159,6 +175,7 @@ def get_hyperparameter_search_space( add_hyperparameter(cs, cell_type, CategoricalHyperparameter) add_hyperparameter(cs, hidden_size, UniformIntegerHyperparameter) add_hyperparameter(cs, bidirectional, CategoricalHyperparameter) + add_hyperparameter(cs, decoder_type, CategoricalHyperparameter) cs.add_condition(CS.AndConjunction(CS.EqualsCondition(dropout, use_dropout, True), CS.GreaterThanCondition(dropout, num_layers, 1))) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py similarity index 96% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py rename to 
autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py index ee56bb5df..7bae11c18 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py @@ -8,15 +8,13 @@ UniformIntegerHyperparameter ) -import numpy as np - import torch from torch import nn from torch.nn.utils import weight_norm - +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder import ( + BaseForecastingEncoder, EncoderNetwork +) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_encoder \ - import BaseForecastingEncoder, EncoderNetwork from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter @@ -130,6 +128,13 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: self._receptive_field = encoder.receptive_field return encoder + @staticmethod + def allowed_decoders(): + """ + decoder that is compatible with the encoder + """ + return ['MLPDecoder'] + @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None ) -> Dict[str, Any]: @@ -157,7 +162,8 @@ def get_hyperparameter_search_space( log=True), kernel_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="kernel_size", value_range=(4, 64), - default_value=32), + default_value=32, + log=True), use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="use_dropout", value_range=(True, False), default_value=False), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py new file mode 100644 index 000000000..7936d281d --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -0,0 +1,377 @@ +import os +from collections import OrderedDict +from typing import Dict, Optional, List, Any + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace, Configuration +from ConfigSpace.conditions import EqualsCondition, OrConjunction + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType + +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder import ( + BaseForecastingEncoder, +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import \ + decoders, decoder_addons, add_decoder + +directory = os.path.split(__file__)[0] +_encoders = find_components(__package__, + directory, + BaseForecastingEncoder) +_addons = ThirdPartyComponents(BaseForecastingEncoder) + + +def add_encoder(encoder: BaseForecastingEncoder) -> None: + _addons.add_component(encoder) + + +class ForecastingBackboneChoice(autoPyTorchChoice): + """ + A network is composed of an encoder and decoder. In most of the case, the choice of decoder is heavily dependent on + the choice of encoder. 
Thus here "choice" indicates the choice of encoder, then decoder will be determined by + the encoder. + """ + def __init__(self, + **kwargs, + ): + super().__init__(**kwargs) + self.decoder_choice = None + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available backbone components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all basebackbone components available + as choices for learning rate scheduling + """ + components = OrderedDict() + components.update(_encoders) + components.update(_addons.components) + return components + + def get_decoder_components(self) -> Dict[str, autoPyTorchComponent]: + components = OrderedDict() + components.update(decoders) + components.update(decoder_addons.components) + return components + + def get_available_components( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + include: List[str] = None, + exclude: List[str] = None, + components: Optional[Dict[str, autoPyTorchComponent]] = None + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics + of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of learning + rate backbones + + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + if components is None: + available_comp = self.get_components() + else: + available_comp = components + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == NetworkBackboneChoice or hasattr(entry, 'get_components'): + continue + + task_type = str(dataset_properties['task_type']) + properties = entry.get_properties() + if 'tabular' in task_type and not bool(properties['handles_tabular']): + continue + elif 'image' in task_type and not bool(properties['handles_image']): + continue + elif 'time_series' in task_type and not bool(properties['handles_time_series']): + continue + + # target_type = dataset_properties['target_type'] + # Apply some automatic filtering here for + # backbones based on the dataset! 
+ # TODO: Think if there is any case where a backbone + # is not recommended for a certain dataset + + components_dict[name] = entry + + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on + default (Optional[str]): Default backbone to use + include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive + list, and will exclusively use this components. + exclude: Optional[Dict[str, Any]]: which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal preprocessors for this problem + available_encoders = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + available_decoders = self.get_available_components( + dataset_properties=dataset_properties, + include=None, exclude=None, + components=self.get_decoder_components()) + + if len(available_encoders) == 0: + raise ValueError("No Encoder found") + if len(available_decoders) == 0: + raise ValueError("No Decoder found") + + if default is None: + defaults = self._defaults_network + for default_ in defaults: + if default_ in available_encoders: + default = default_ + break + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_encoders): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_encoders, + choice_hyperparameter.value_range)) + hp_encoder = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + hp_encoder = CSH.CategoricalHyperparameter( + '__choice__', + list(available_encoders.keys()), + default_value=default + ) + cs.add_hyperparameter(hp_encoder) + + decoder2encoder = {key: [] for key in available_decoders.keys()} + encoder2decoder = {} + for encoder_name in hp_encoder.choices: + updates = self._get_search_space_updates(prefix=encoder_name) + config_space = available_encoders[encoder_name].get_hyperparameter_search_space(dataset_properties, # type: ignore + **updates) + parent_hyperparameter = {'parent': hp_encoder, 'value': encoder_name} + cs.add_configuration_space( + encoder_name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + allowed_decoders = available_encoders[encoder_name].allowed_decoders() + if len(allowed_decoders) > 1: + if 'decoder_type' not in config_space: + raise ValueError('When a specific encoder has more than one allowed decoder, its ConfigSpace' + 'must contain the hyperparameter "decoder_type" ! 
Please check your encoder ' + 'setting!') + hp_decoder_choice = config_space.get_hyperparameter('decoder_type').choices + if not set(hp_decoder_choice).issubset(allowed_decoders): + raise ValueError('The encoder hyperparameter decoder_type must be a subset of the allowed_decoders') + allowed_decoders = hp_decoder_choice + for decoder_name in allowed_decoders: + decoder2encoder[decoder_name].append(encoder_name) + encoder2decoder[encoder_name] = allowed_decoders + + for decoder_name in available_decoders.keys(): + updates = self._get_search_space_updates(prefix=decoder_name) + config_space = available_decoders[decoder_name].get_hyperparameter_search_space(dataset_properties, # type: ignore + **updates) + compatible_encoders = decoder2encoder[decoder_name] + encoders_with_multi_decoder = [] + encoder_with_uni_decoder = [] + for encoder in compatible_encoders: + if len(encoder2decoder[encoder]) > 1: + encoders_with_multi_decoder.append(encoder) + else: + encoder_with_uni_decoder.append(encoder) + + cs.add_configuration_space( + decoder_name, + config_space, + #parent_hyperparameter=parent_hyperparameter + ) + hps = cs.get_hyperparameters() # type: List[CSH.Hyperparameter] + conditions_to_add = [] + for hp in hps: + # TODO consider if this will raise any unexpected behavior + if hp.name.startswith(decoder_name): + # From the implementation of ConfigSpace + # Only add a condition if the parameter is a top-level + # parameter of the new configuration space (this will be some + # kind of tree structure). + if cs.get_parents_of(hp): + continue + or_cond = [] + for encoder_uni in encoder_with_uni_decoder: + or_cond.append(EqualsCondition(hp, + hp_encoder, + encoder_uni)) + for encode_multi in encoders_with_multi_decoder: + hp_decoder_type = cs.get_hyperparameter(f'{encode_multi}:decoder_type') + or_cond.append(EqualsCondition(hp, + hp_decoder_type, + decoder_name)) + if len(or_cond) > 1: + conditions_to_add.append(OrConjunction(*or_cond)) + else: + conditions_to_add.append(or_cond[0]) + cs.add_conditions(conditions_to_add) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def set_hyperparameters(self, + configuration: Configuration, + init_params: Optional[Dict[str, Any]] = None + ) -> 'autoPyTorchChoice': + """ + Applies a configuration to the given component. + This method translate a hierarchical configuration key, + to an actual parameter of the autoPyTorch component. 
+ + Args: + configuration (Configuration): + Which configuration to apply to the chosen component + init_params (Optional[Dict[str, any]]): + Optional arguments to initialize the chosen component + + Returns: + self: returns an instance of self + """ + new_params = {} + + params = configuration.get_dictionary() + choice = params['__choice__'] + del params['__choice__'] + + for param, value in params.items(): + param = param.replace(choice + ':', '') + new_params[param] = value + + if init_params is not None: + for param, value in init_params.items(): + param = param.replace(choice + ':', '') + new_params[param] = value + + decoder_components = self.get_decoder_components() + + decoder_type = None + + decoder_params = {} + decoder_params_names = [] + for param, value in new_params.items(): + if decoder_type is None: + for decoder_component in decoder_components.keys(): + if param.startswith(decoder_component): + decoder_type = decoder_component + decoder_params_names.append(param) + param = param.replace(decoder_type + ':', '') + decoder_params[param] = value + else: + if param.startswith(decoder_type): + decoder_params_names.append(param) + param = param.replace(decoder_type + ':', '') + decoder_params[param] = value + + for param_name in decoder_params_names: + del new_params[param_name] + + new_params['random_state'] = self.random_state + decoder_params['random_state'] = self.random_state + + self.new_params = new_params + self.choice = self.get_components()[choice](**new_params) + self.decoder_choice = decoder_components[decoder_type](**decoder_params) + from sklearn.pipeline import Pipeline + self.pipe = Pipeline([('encoder', self.choice), ('decoder', self.decoder_choice)]) + return self + + + @property + def _defaults_network(self): + return ['MLPEncoder', + 'RNNEncpder'] + + def fit(self, X: Dict[str, Any], y: Any) -> autoPyTorchComponent: + """Handy method to check if a component is fitted + + Args: + X (X: Dict[str, Any]): + Dependencies needed by current component to perform fit + y (Any): + not used. 
To comply with sklearn API + """ + # Allows to use check_is_fitted on the choice object + self.fitted_ = True + assert self.pipe is not None, "Cannot call fit without initializing the component" + return self.pipe.fit(X, y) + #self.choice.fit(X, y) + #self.choice.transform(X) + #return self.choice + + def transform(self, X: Dict) -> Dict: + assert self.pipe is not None, "Cannot call transform before the object is initialized" + return self.pipe.transform(X) + + @property + def _defaults_network(self): + return ['MLPEncoder'] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/base_forecasting_encoder.py similarity index 98% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/base_forecasting_encoder.py index a04f08a0a..755a68941 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/base_forecasting_encoder.py @@ -88,6 +88,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: return self + @staticmethod + def allowed_decoders(): + raise NotImplementedError + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X['dataset_properties'].update({'input_shape': self.input_shape}) X.update({'network_encoder': self.encoder}) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py similarity index 96% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index e20764437..4e37f241e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -1,17 +1,16 @@ -from abc import ABC -from typing import Dict, Optional, Tuple, Union, Any +from typing import Dict, Optional, Tuple, Union from torch import nn from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter -from ConfigSpace.conditions import GreaterThanCondition, EqualsCondition +from ConfigSpace.conditions import GreaterThanCondition from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_head.utils import _activations from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_decoder.base_forecasting_decoder import \ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.base_forecasting_decoder import \ BaseForecastingDecoder @@ -47,6 +46,10 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT 'handles_time_series': True, } + @property + def fitted_encoder(self): + return ['RNNEncoder', 'TCNEncoder', 'MLEncoder', 'NBEATSEncoder'] + @staticmethod def 
get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py similarity index 98% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/NBEATSDecoder.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 4cf6dd528..2300c67e2 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -1,10 +1,9 @@ -from typing import Any, Dict, List, Optional, Union, Tuple +from typing import List from ConfigSpace import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter, \ UniformFloatHyperparameter from ConfigSpace.conditions import GreaterThanCondition, InCondition, EqualsCondition, AndConjunction -from ConfigSpace.forbidden import ForbiddenEqualsClause, ForbiddenAndConjunction from typing import Dict, Optional, Tuple, Union, Any @@ -15,7 +14,7 @@ from autoPyTorch.pipeline.components.setup.network_head.utils import _activations from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_decoder.base_forecasting_decoder import \ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.base_forecasting_decoder import \ BaseForecastingDecoder # TODO we need to rewrite NBEATS part to make it neater!!! 
@@ -116,6 +115,10 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT 'handles_time_series': True, } + @property + def fitted_encoder(self): + return ['NBEATSEncoder'] + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'backcast_loss_ratio': self.config['backcast_loss_ratio']}) return super().transform(X) @@ -317,5 +320,4 @@ def get_hyperparameter_search_space( else: cs.add_condition(dropout_condition_1) - return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py similarity index 95% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index 8961cbb7e..a8ddee9f0 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -1,4 +1,3 @@ -from abc import ABC from typing import Any, Dict, Optional, Tuple, List, Union import warnings @@ -6,8 +5,7 @@ from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, - UniformFloatHyperparameter, - Constant + UniformFloatHyperparameter ) import torch @@ -18,11 +16,10 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_decoder.base_forecasting_decoder import \ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.base_forecasting_decoder import \ BaseForecastingDecoder -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter, FitRequirement +from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter, FitRequirement from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP @@ -93,6 +90,10 @@ def _build_decoder(self, ) return decoder, hidden_size + @property + def fitted_encoder(self): + return ['RNNEncoder'] + def decoder_properties(self): decoder_properties = super().decoder_properties() decoder_properties.update({'has_hidden_states': True, @@ -113,7 +114,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.lagged_value = [0] + get_lags_for_frequency(freq) except Exception: warnings.warn(f'cannot find the proper lagged value for {freq}, we use the default lagged value') - # If pass return super().fit(X, y) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py similarity index 90% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/__init__.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py index 1e1498fdd..7df7ac6a2 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/__init__.py +++ 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py @@ -5,21 +5,13 @@ import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace -import numpy as np - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.setup.network_head.base_network_head import ( - NetworkHeadComponent, ) from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_decoder.base_forecasting_decoder import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.base_forecasting_decoder import ( BaseForecastingDecoder, ) @@ -29,11 +21,15 @@ ) directory = os.path.split(__file__)[0] -_decoders = find_components(__package__, +decoders = find_components(__package__, directory, BaseForecastingDecoder) -_addons = ThirdPartyComponents(BaseForecastingDecoder) +decoder_addons = ThirdPartyComponents(BaseForecastingDecoder) + + +def add_decoder(encoder: BaseForecastingDecoder) -> None: + decoder_addons.add_component(encoder) class ForecastingDecoderChoice(NetworkBackboneChoice): @@ -49,8 +45,8 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: """ components = OrderedDict() - components.update(_decoders) - components.update(_addons.components) + components.update(decoders) + components.update(decoder_addons.components) return components @@ -173,20 +169,20 @@ def get_hyperparameter_search_space( "choices in {} got {}".format(self.__class__.__name__, available_heads, choice_hyperparameter.value_range)) - head = CSH.CategoricalHyperparameter('__choice__', + decoder = CSH.CategoricalHyperparameter('__choice__', choice_hyperparameter.value_range, default_value=choice_hyperparameter.default_value) else: - head = CSH.CategoricalHyperparameter( + decoder = CSH.CategoricalHyperparameter( '__choice__', list(available_heads.keys()), default_value=default) - cs.add_hyperparameter(head) - for name in head.choices: + cs.add_hyperparameter(decoder) + for name in decoder.choices: updates = self._get_search_space_updates(prefix=name) config_space = available_heads[name].get_hyperparameter_search_space(dataset_properties, # type: ignore **updates) - parent_hyperparameter = {'parent': head, 'value': name} + parent_hyperparameter = {'parent': decoder, 'value': name} cs.add_configuration_space( name, config_space, @@ -195,4 +191,6 @@ def get_hyperparameter_search_space( self.configuration_space_ = cs self.dataset_properties_ = dataset_properties + import pdb + pdb.set_trace() return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py similarity index 97% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index d84208544..0a74674aa 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_decoder/base_forecasting_decoder.py +++ 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -19,11 +19,10 @@ def __init__(self, **kwargs: Any): super().__init__() self.add_fit_requirements(self._required_fit_requirements) - self.head: Optional[nn.Module] = None self.auto_regressive = kwargs.get('auto_regressive', False) self.config = kwargs - self.decoder = None + self.decoder: Optional[nn.Module] = None self.n_decoder_output_features = None self.n_prediction_heads = 1 @@ -36,6 +35,10 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), ] + @property + def fitted_encoder(self): + return [] + def decoder_properties(self): decoder_properties = {'has_hidden_states': False, 'recurrent': False, @@ -46,7 +49,7 @@ def decoder_properties(self): def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: """ - Builds the head component and assigns it to self.head + Builds the head component and assigns it to self.decoder Args: X (X: Dict[str, Any]): Dependencies needed by current component to perform fit @@ -62,7 +65,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: X.update({"auto_regressive": auto_regressive}) - if auto_regressive: self.n_prediction_heads = 1 else: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/__init__.py deleted file mode 100644 index c6d10b546..000000000 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_encoder/__init__.py +++ /dev/null @@ -1,51 +0,0 @@ -import os -from collections import OrderedDict -from typing import Dict, List, Optional - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -import numpy as np - -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder.base_forecasting_encoder import ( - BaseForecastingEncoder, -) - -directory = os.path.split(__file__)[0] -_backbones = find_components(__package__, - directory, - BaseForecastingEncoder) -_addons = ThirdPartyComponents(BaseForecastingEncoder) - - -def add_encoder(encoder: BaseForecastingEncoder) -> None: - _addons.add_component(encoder) - - -class ForecastingEncoderChoice(NetworkBackboneChoice): - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available backbone components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all basebackbone components available - as choices for learning rate scheduling - """ - components = OrderedDict() - components.update(_backbones) - components.update(_addons.components) - return components - - @property - def _defaults_network(self): - return ['RNNBackbone', 'RNNPBackbone'] \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py index 7182e2c7f..a6b267432 100644 --- 
a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py @@ -6,7 +6,7 @@ import numpy as np from torch import nn -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_decoder.NBEATSDecoder import NBEATSBLock +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.NBEATSDecoder import NBEATSBLock class TransposeLinear(nn.Module): diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index fc20c30c6..ca0938257 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -52,6 +52,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: Returns: Self """ + + self.check_requirements(X, y) output_shape = X['dataset_properties']['output_shape'] diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 98a74c5fd..9611b3f32 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -19,7 +19,7 @@ TargetNoScaler import TargetNoScaler from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNet, ForecastingDeepARNet, \ - NBEATSNet + NBEATSNet, ForecastingSeq2SeqNet from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score @@ -213,16 +213,16 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, batch_size = past_target.shape[0] - if isinstance(self.model, ForecastingDeepARNet): + if isinstance(self.model, ForecastingDeepARNet) or isinstance(self.model, ForecastingSeq2SeqNet): future_targets = self.cast_targets(future_targets) past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) outputs = self.model(past_target) # DeepAR only generate sampled points, we replace log_prob loss with MSELoss - outputs = self.model.pred_from_net_output(outputs) - loss = F.mse_loss(outputs, future_targets) + # outputs = self.model.pred_from_net_output(outputs) outputs = outputs.detach().cpu() + loss = F.mse_loss(outputs, future_targets) else: # prepare future_targets = self.cast_targets(future_targets).to(self.device) @@ -235,7 +235,8 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, loss = torch.mean(torch.Tensor(loss)) else: loss = self.criterion(outputs, future_targets) - outputs = self.model.pred_from_net_output(outputs).detach().cpu() + outputs = self.model.pred_from_net_output(outputs) + outputs = outputs.detach().cpu() loss_sum += loss.item() * batch_size N += batch_size diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index b21f07a6c..4f1b2ce61 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -1,12 +1,9 @@ import copy import warnings -from collections 
import OrderedDict from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration, ConfigurationSpace -from ConfigSpace.hyperparameters import Constant from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause, ForbiddenInClause -from ConfigSpace.conditions import EqualsCondition, NotEqualsCondition import numpy as np @@ -28,9 +25,9 @@ from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent from autoPyTorch.pipeline.components.setup.network_embedding import NetworkEmbeddingChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_encoder import \ - ForecastingEncoderChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_decoder import ForecastingDecoderChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone import ForecastingBackboneChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import \ + ForecastingDecoderChoice from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead from autoPyTorch.pipeline.components.setup.network_initializer import ( NetworkInitializerChoice @@ -201,12 +198,12 @@ def _get_hyperparameter_search_space(self, """ # dist_cls and auto_regressive are only activate if the network outputs distribution - if 'loss' in self.named_steps.keys() and 'network_head' in self.named_steps.keys(): + if 'loss' in self.named_steps.keys() and 'network_backbone' in self.named_steps.keys(): hp_loss = cs.get_hyperparameter('loss:__choice__') hp_auto_regressive = [] for hp_name in cs.get_hyperparameter_names(): - if hp_name.startswith('network_decoder:'): + if hp_name.startswith('network_backbone:'): if hp_name.endswith(':auto_regressive'): hp_auto_regressive.append(cs.get_hyperparameter(hp_name)) @@ -241,7 +238,8 @@ def _get_hyperparameter_search_space(self, forbidden_hp_dist = ForbiddenAndConjunction(forbidden_hp_dist, forbidden_hp_loss) forbidden_losses_all.append(forbidden_hp_dist) - network_encoder_hp = cs.get_hyperparameter('network_encoder:__choice__') + network_encoder_hp = cs.get_hyperparameter('network_backbone:__choice__') + if 'MLPEncoder' or 'TCNEncoder' or 'InceptionTimeEncoder' in network_encoder_hp.choices: forbidden = ['MLPEncoder', 'TCNEncoder', 'InceptionTimeEncoder'] forbidden_deepAREncoder = [forbid for forbid in forbidden if forbid in network_encoder_hp.choices] @@ -251,6 +249,15 @@ def _get_hyperparameter_search_space(self, forbidden_hp_ar_mlp = ForbiddenAndConjunction(forbidden_hp_ar, forbidden_hp_mlpencoder) forbidden_losses_all.append(forbidden_hp_ar_mlp) + if 'MLPEncoder' in network_encoder_hp.choices: + forbidden = ['MLPEncoder'] + forbidden_deepAREncoder = [forbid for forbid in forbidden if forbid in network_encoder_hp.choices] + for hp_ar in hp_auto_regressive: + forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, True) + forbidden_hp_mlpencoder = ForbiddenInClause(network_encoder_hp, forbidden_deepAREncoder) + forbidden_hp_ar_mlp = ForbiddenAndConjunction(forbidden_hp_ar, forbidden_hp_mlpencoder) + forbidden_losses_all.append(forbidden_hp_ar_mlp) + forecast_strategy = cs.get_hyperparameter('network:forecast_strategy') if 'mean' in forecast_strategy.choices: for hp_ar in hp_auto_regressive: @@ -264,24 +271,18 @@ def _get_hyperparameter_search_space(self, # NBEATS 
forbidden_NBEATS = [] - network_decoder_hp = cs.get_hyperparameter('network_decoder:__choice__') encoder_non_BEATS = [choice for choice in network_encoder_hp.choices if choice != 'NBEATSEncoder'] - decoders_non_NBEATS = [choice for choice in network_decoder_hp.choices if choice != 'NBEATSDecoder'] loss_non_regression = [choice for choice in hp_loss.choices if choice != 'RegressionLoss'] data_loader_backcast = cs.get_hyperparameter('data_loader:backcast') forbidden_encoder_NBEATS = ForbiddenInClause(network_encoder_hp, encoder_non_BEATS) - forbidden_decoder_NBEATS = ForbiddenInClause(network_decoder_hp, decoders_non_NBEATS) forbidden_loss_non_regression = ForbiddenInClause(hp_loss, loss_non_regression) forbidden_backcast = ForbiddenEqualsClause(data_loader_backcast, True) forbidden_backcast_false = ForbiddenEqualsClause(data_loader_backcast, False) + # Ensure that NBEATS encoder only works with NBEATS decoder if 'NBEATSEncoder' in network_encoder_hp.choices: - forbidden_NBEATS.append(ForbiddenAndConjunction( - ForbiddenEqualsClause(network_encoder_hp, 'NBEATSEncoder'), - forbidden_decoder_NBEATS) - ) forbidden_NBEATS.append(ForbiddenAndConjunction( ForbiddenEqualsClause(network_encoder_hp, 'NBEATSEncoder'), forbidden_loss_non_regression) @@ -290,6 +291,7 @@ def _get_hyperparameter_search_space(self, ForbiddenEqualsClause(network_encoder_hp, 'NBEATSEncoder'), forbidden_backcast_false) ) + """ if 'NBEATSDecoder' in network_decoder_hp.choices: forbidden_NBEATS.append(ForbiddenAndConjunction( ForbiddenEqualsClause(network_decoder_hp, 'NBEATSDecoder'), @@ -305,15 +307,17 @@ def _get_hyperparameter_search_space(self, ) forbidden_NBEATS.append(ForbiddenAndConjunction( forbidden_backcast, - forbidden_encoder_NBEATS + forbidden_decoder_NBEATS )) + """ forbidden_NBEATS.append(ForbiddenAndConjunction( forbidden_backcast, - forbidden_decoder_NBEATS + forbidden_encoder_NBEATS )) cs.add_forbidden_clauses(forbidden_NBEATS) + """ # rnn head only allow rnn backbone if 'network_encoder' in self.named_steps.keys() and 'network_decoder' in self.named_steps.keys(): hp_encoder_choice = cs.get_hyperparameter('network_encoder:__choice__') @@ -329,6 +333,7 @@ def _get_hyperparameter_search_space(self, cs.add_forbidden_clause(ForbiddenAndConjunction(forbidden_clause_encoder, forbidden_clause_decoder)) cs.get_hyperparameter_names() + """ self.configuration_space = cs self.dataset_properties = dataset_properties @@ -362,10 +367,8 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L ("data_loader", TimeSeriesForecastingDataLoader(random_state=self.random_state)), ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties, random_state=self.random_state)), - ("network_encoder", ForecastingEncoderChoice(default_dataset_properties, - random_state=self.random_state)), - ("network_decoder", ForecastingDecoderChoice(default_dataset_properties, - random_state=self.random_state)), + ("network_backbone", ForecastingBackboneChoice(dataset_properties=default_dataset_properties, + random_state=self.random_state)), ("network_head", ForecastingHead(random_state=self.random_state)), ("network", ForecastingNetworkComponent(random_state=self.random_state)), ("network_init", NetworkInitializerChoice(default_dataset_properties, From 45d207803fb4f1b0d6696b3b741c2ac93f811749 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 3 Jan 2022 19:43:28 +0100 Subject: [PATCH 109/347] faster lag_seq builder --- .../setup/network/forecasting_network.py | 192 +++++++++++------- 
.../forecasting_backbone/RNNEncoder.py | 5 +- .../forecasting_decoder/MLPDecoder.py | 31 ++- .../forecasting_decoder/RNNDecoder.py | 5 +- .../forecasting_base_trainer.py | 3 +- .../pipeline/time_series_forecasting.py | 1 + 6 files changed, 152 insertions(+), 85 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 9668ad08e..f601f8bba 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -8,6 +8,7 @@ import torch from torch import nn +import warnings from torch.distributions import ( AffineTransform, @@ -30,6 +31,7 @@ class TransformedDistribution_(TransformedDistribution): """ We implement the mean function such that we do not need to enquire base mean every time """ + @property def mean(self): mean = self.base_dist.mean @@ -41,11 +43,13 @@ def mean(self): def get_lagged_subsequences( sequence: torch.Tensor, subsequences_length: int, - lags_seq: List[int] -) -> torch.Tensor: + lags_seq: Optional[List[int]] = None, + mask: Optional[torch.Tensor] = None +) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: """ - Returns lagged subsequences of a given sequence. This is similar to gluonTS's implementation the only difference - is that we pad the sequence that is not long enough + Returns lagged subsequences of a given sequence, this allows the model to receive the input from the past targets + outside the sliding windows. This implementation is similar to gluonTS's implementation + the only difference is that we pad the sequence that is not long enough Parameters ---------- @@ -54,37 +58,55 @@ def get_lagged_subsequences( Shape: (N, T, C). subsequences_length : int length of the subsequences to be extracted. + lags_seq: Optional[List[int]] + lags of the sequence, indicating the sequence that needs to be extracted + lag_mask: Optional[torch.Tensor] + a mask tensor indicating Returns -------- lagged : Tensor - a tensor of shape (N, S, C, I), where S = subsequences_length and - I = len(indices), containing lagged subsequences. Specifically, - lagged[i, j, :, k] = sequence[i, -indices[k]-S+j, :]. + a tensor of shape (N, S, I * C), where S = subsequences_length and + I = len(indices), containing lagged subsequences. 
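For illustration, a minimal sketch of what the rewritten helper returns; the toy input below is an assumption made only for this example, and get_lagged_subsequences is the function defined in this hunk:

    import torch

    # two series, ten past steps, one feature: batch 0 holds values 0..9, batch 1 holds 10..19
    sequence = torch.arange(20.).reshape(2, 10, 1)
    lagged, mask = get_lagged_subsequences(sequence, subsequences_length=3, lags_seq=[0, 2, 5])

    # lagged has shape (N, S, I * C) = (2, 3, 3): each window step j concatenates the values
    # lying 0, 2 and 5 steps further in the past, e.g. lagged[0] == [[7., 5., 2.],
    #                                                                [8., 6., 3.],
    #                                                                [9., 7., 4.]]
    # Positions that would reach before the start of the sequence are zero-padded.
    # mask is the boolean lag mask; passing it back in on the next call skips rebuilding it,
    # which is what the cached_lag_mask_* attributes further down are used for.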
""" batch_size = sequence.shape[0] - sequence_length = sequence.shape[1] - - lagged_values = [] - for lag_index in lags_seq: - begin_index = -lag_index - subsequences_length - end_index = -lag_index if lag_index > 0 else None - - if end_index is not None and end_index < -sequence_length: - lagged_values.append(torch.zeros([batch_size, subsequences_length, *sequence.shape[2:]])) - continue - if begin_index < -sequence_length: - if end_index is not None: - pad_shape = [batch_size, subsequences_length - sequence_length - end_index, *sequence.shape[2:]] - lagged_values.append(torch.cat([torch.zeros(pad_shape), sequence[:, :end_index, ...]], dim=1)) - else: - pad_shape = [batch_size, subsequences_length - sequence_length, *sequence.shape[2:]] - lagged_values.append(torch.cat([torch.zeros(pad_shape), sequence], dim=1)) - continue - else: - lagged_values.append(sequence[:, begin_index:end_index, ...]) - lagged_seq = torch.stack(lagged_values, -1).reshape(batch_size, subsequences_length, -1) - return lagged_seq + num_features = sequence.shape[2] + if mask is None: + if lags_seq is None: + warnings.warn('Neither lag_mask or lags_seq is given, we simply return the input value') + return sequence, None + # generate mask + num_lags = len(lags_seq) + + # build a mask + mask_length = max(lags_seq) + subsequences_length + mask = torch.zeros((num_lags, mask_length), dtype=torch.bool) + for i, lag_index in enumerate(lags_seq): + begin_index = -lag_index - subsequences_length + end_index = -lag_index if lag_index > 0 else None + mask[i, begin_index: end_index] = True + else: + num_lags = mask.shape[0] + mask_length = mask.shape[1] + + mask_extend = mask.clone() + + if mask_length > sequence.shape[1]: + sequence = torch.cat([sequence.new_zeros([batch_size, mask_length - sequence.shape[1], num_features]), + sequence], dim=1) + elif mask_length < sequence.shape[1]: + mask_extend = torch.cat([mask.new_zeros([num_lags, sequence.shape[1] - mask_length]), mask_extend], dim=1) + # (N, 1, T, C) + sequence = sequence.unsqueeze(1) + + # (I, T, 1) + mask_extend = mask_extend.unsqueeze(-1) + + # (N, I, S, C) + lagged_seq = torch.masked_select(sequence, mask_extend).reshape(batch_size, num_lags, subsequences_length, -1) + lagged_seq = torch.transpose(lagged_seq, 1, 2).reshape(batch_size, subsequences_length, -1) + + return lagged_seq, mask class ForecastingNet(nn.Module): @@ -135,7 +157,7 @@ def __init__(self, self.target_scaler = target_scaler - self.n_prediction_steps = dataset_properties['n_prediction_steps'] + self.n_prediction_steps = dataset_properties['n_prediction_steps'] # type: int self.window_size = window_size self.output_type = output_type @@ -154,6 +176,11 @@ def __init__(self, self.encoder_lagged_input = encoder_properties['lagged_input'] self.decoder_lagged_input = decoder_properties['lagged_input'] + if self.encoder_lagged_input: + self.cached_lag_mask_encoder = None + if self.decoder_lagged_input: + self.cached_lag_mask_decoder = None + @property def device(self): return self._device @@ -208,7 +235,10 @@ def forward(self, if self.encoder_lagged_input: targets_past[:, -self.window_size:], _, loc, scale = self.target_scaler(targets_past[:, -self.window_size:]) targets_past[:, :-self.window_size] = self.scale_value(targets_past[:, :-self.window_size], loc, scale) - x_past = get_lagged_subsequences(targets_past, self.window_size, self.encoder.lagged_value) + x_past, self.cached_lag_mask_encoder = get_lagged_subsequences(targets_past, + self.window_size, + self.encoder.lagged_value, + 
self.cached_lag_mask_encoder) else: if self.window_size < targets_past.shape[1]: targets_past = targets_past[:, -self.window_size:] @@ -216,7 +246,7 @@ def forward(self, x_past = targets_past if features_past is not None: - x_past = torch.cat([x_past, features_past], dim=1) + x_past = torch.cat([features_past, x_past], dim=1) x_past = x_past.to(device=self.device) x_past = self.embedding(x_past) @@ -284,14 +314,17 @@ def forward(self, if self.encoder_lagged_input: targets_past[:, -self.window_size:], _, loc, scale = self.target_scaler(targets_past[:, -self.window_size:]) targets_past[:, :-self.window_size] = self.scale_value(targets_past[:, :-self.window_size], loc, scale) - x_past = get_lagged_subsequences(targets_past, self.window_size, self.encoder.lagged_value) + x_past, self.cached_lag_mask_encoder = get_lagged_subsequences(targets_past, + self.window_size, + self.encoder.lagged_value, + self.cached_lag_mask_encoder) else: if self.window_size < targets_past.shape[1]: targets_past = targets_past[:, -self.window_size:] targets_past, _, loc, scale = self.target_scaler(targets_past) x_past = targets_past - x_past = x_past if features_past is None else torch.cat([x_past, features_past], dim=-1) + x_past = x_past if features_past is None else torch.cat([features_past, x_past], dim=-1) x_past = x_past.to(self.device) x_past = self.embedding(x_past) @@ -300,12 +333,14 @@ def forward(self, # we do one step ahead forecasting if self.decoder_lagged_input: targets_future = torch.cat([targets_past, targets_future[:, :-1, :]], dim=1) - targets_future = get_lagged_subsequences(targets_future, self.n_prediction_steps, - self.decoder.lagged_value) + targets_future, self.cached_lag_mask_decoder = get_lagged_subsequences(targets_future, + self.n_prediction_steps, + self.decoder.lagged_value, + self.cached_lag_mask_decoder) else: targets_future = torch.cat([targets_past[:, [-1], :], targets_future[:, :-1, :]], dim=1) - x_future = targets_future if features_future is None else torch.cat([targets_future, features_future], + x_future = targets_future if features_future is None else torch.cat([features_future, targets_future], dim=-1) x_future = x_future.to(self.device) @@ -325,12 +360,11 @@ def forward(self, for idx_pred in range(self.n_prediction_steps): if self.decoder_lagged_input: x_future = torch.cat([targets_past, predicted_target.cpu()], dim=1) - x_future = get_lagged_subsequences(x_future, 1, self.decoder.lagged_value) + x_future, _ = get_lagged_subsequences(x_future, 1, self.decoder.lagged_value) else: x_future = predicted_target[:, [-1]] x_future = x_future if features_future is None else torch.cat( - [x_future, features_future[:, [idx_pred], :]], - dim=-1) + [features_future[:, [idx_pred], :], x_future], dim=-1) x_future = x_future.to(self.device) x_future, hidden_states = self.decoder(x_future, hx=hidden_states) @@ -375,6 +409,8 @@ def __init__(self, # this determines the training targets self.encoder_bijective_seq_output = kwargs['encoder_properties']['bijective_seq_output'] + self.cached_lag_mask_encoder_test = None + def forward(self, targets_past: torch.Tensor, targets_future: Optional[torch.Tensor] = None, @@ -382,35 +418,33 @@ def forward(self, features_future: Optional[torch.Tensor] = None, features_static: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): - if self.encoder_lagged_input: - targets_past[:, -self.window_size:], _, loc, scale = self.target_scaler(targets_past[:, -self.window_size:]) - targets_past[:, :-self.window_size] = 
self.scale_value(targets_past[:, :-self.window_size], loc, scale) - x_past = get_lagged_subsequences(targets_past, self.window_size, self.encoder.lagged_value) - else: - if self.window_size < targets_past.shape[1]: - targets_past = targets_past[:, -self.window_size:] - - targets_past, _, loc, scale = self.target_scaler(targets_past) - x_past = targets_past - - x_past = x_past if features_past is None else torch.cat([x_past, features_past], dim=-1) - - x_past = x_past.to(self.device) - # TODO consider static features - x_past = self.embedding(x_past) - if self.training: if self.encoder_lagged_input: - targets_future = torch.cat([targets_past, targets_future], dim=1) - targets_future = get_lagged_subsequences(targets_future, self.n_prediction_steps, - self.encoder.lagged_value) + targets_past[:, -self.window_size:], _, loc, scale = self.target_scaler( + targets_past[:, -self.window_size:]) + targets_past[:, :-self.window_size] = self.scale_value(targets_past[:, :-self.window_size], loc, scale) + targets_future = self.scale_value(targets_future, loc, scale) + + targets_all = torch.cat([targets_past, targets_future[:, :-1]], dim=1) + seq_length = self.window_size + self.n_prediction_steps + targets_all, self.cached_lag_mask_encoder = get_lagged_subsequences(targets_all, + seq_length - 1, + self.encoder.lagged_value, + self.cached_lag_mask_encoder) + else: + if self.window_size < targets_past.shape[1]: + targets_past = targets_past[:, -self.window_size:] + targets_past, _, loc, scale = self.target_scaler(targets_past) + targets_future = self.scale_value(targets_future, loc, scale) + targets_all = torch.cat([targets_past, targets_future[:, :-1]], dim=1) - x_future = targets_future if features_future is None else torch.cat([targets_future, features_future], - dim=-1) - x_future = x_future.to(self.device) - x_future = self.embedding(x_future) + x_input = targets_all + if features_past is not None: + features_all = torch.cat([features_past, features_future], dim=1) + x_input = torch.cat([features_all, x_input], dim=-1) + x_input = x_input.to(self.device) - x_input = torch.cat([x_past, x_future[:, :-1]], dim=1) + x_input = self.embedding(x_input) if self.encoder_has_hidden_states: x_input, _ = self.encoder(x_input, output_seq=True) @@ -420,6 +454,27 @@ def forward(self, net_output = self.head(self.decoder(x_input)) return self.rescale_output(net_output, loc, scale, self.device) else: + if self.encoder_lagged_input: + targets_past[:, -self.window_size:], _, loc, scale = self.target_scaler( + targets_past[:, -self.window_size:]) + targets_past[:, :-self.window_size] = self.scale_value(targets_past[:, :-self.window_size], loc, scale) + x_past, self.cached_lag_mask_encoder_test = get_lagged_subsequences(targets_past, + self.window_size, + self.encoder.lagged_value, + self.cached_lag_mask_encoder_test) + else: + if self.window_size < targets_past.shape[1]: + targets_past = targets_past[:, -self.window_size:] + + targets_past, _, loc, scale = self.target_scaler(targets_past) + x_past = targets_past + + x_past = x_past if features_past is None else torch.cat([features_past, x_past], dim=-1) + + x_past = x_past.to(self.device) + # TODO consider static features + x_past = self.embedding(x_past) + all_samples = [] batch_size = targets_past.shape[0] @@ -461,13 +516,12 @@ def forward(self, for k in range(1, self.n_prediction_steps): if self.encoder_lagged_input: x_next = torch.cat([repeated_past_target, *all_samples], dim=1) - x_next = get_lagged_subsequences(x_next, 1, self.encoder.lagged_value) + x_next, _ = 
get_lagged_subsequences(x_next, 1, self.encoder.lagged_value) else: x_next = next_sample - x_next = x_next if repeated_time_feat is None else torch.cat([x_next, - repeated_time_feat[:, k:k + 1]], - dim=-1) + x_next = x_next if repeated_time_feat is None else torch.cat([repeated_time_feat[:, k:k + 1], + x_next], dim=-1) if self.encoder_has_hidden_states: x_next = x_next.to(self.device) encoder_output, repeated_state = self.encoder(x_next, hx=repeated_state) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py index d1ea423e7..202013701 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py @@ -77,7 +77,7 @@ class RNNEncoder(BaseForecastingEncoder): def __init__(self, **kwargs: Dict): super().__init__(**kwargs) - self.lagged_value = [0, 1, 2, 3, 4, 5, 6, 7] + self.lagged_value = [0, 1, 2, 3, 4, 5, 6] def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: encoder = _RNN(in_features=input_shape[-1], @@ -106,7 +106,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: if freq is not None: try: freq = FREQUENCY_MAP[freq] - self.lagged_value = [0] + get_lags_for_frequency(freq) + lagged_values = get_lags_for_frequency(freq) + self.lagged_value = [lag - 1 for lag in lagged_values] except Exception: warnings.warn(f'cannot find the proper lagged value for {freq}, we use the default lagged value') # If diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index 4e37f241e..48fb4e68c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -4,7 +4,7 @@ from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter -from ConfigSpace.conditions import GreaterThanCondition +from ConfigSpace.conditions import GreaterThanCondition, EqualsCondition from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_head.utils import _activations @@ -21,17 +21,20 @@ def _build_decoder(self, dataset_properties: Dict) -> Tuple[nn.Module, int]: layers = [] in_features = input_shape[-1] + num_decoder_output_features = in_features if self.config["num_layers"] > 0: for i in range(1, self.config["num_layers"]): layers.append(nn.Linear(in_features=in_features, out_features=self.config[f"units_layer_{i}"])) layers.append(_activations[self.config["activation"]]()) in_features = self.config[f"units_layer_{i}"] - layers.append(nn.Linear(in_features=in_features, - out_features=self.config['units_final_layer'] * n_prediction_heads)) - if 'activation' in self.config: - layers.append(_activations[self.config["activation"]]()) - num_decoder_output_features = self.config['units_final_layer'] + num_decoder_output_features = in_features + if 'units_final_layer' in self.config: + layers.append(nn.Linear(in_features=in_features, + out_features=self.config['units_final_layer'] * n_prediction_heads)) + if 
'activation' in self.config: + layers.append(_activations[self.config["activation"]]()) + num_decoder_output_features = self.config['units_final_layer'] return nn.Sequential(*layers), num_decoder_output_features @@ -64,13 +67,13 @@ def get_hyperparameter_search_space( value_range=tuple(_activations.keys()), default_value=list(_activations.keys())[ 0]), + auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="auto_regressive", + value_range=(True, False), + default_value=False), units_final_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="units_final_layer", value_range=(16, 128), default_value=32, log=True), - auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="auto_regressive", - value_range=(True, False), - default_value=False), ) -> ConfigurationSpace: """ Builds the mlp head layer. The decoder implementation follows the idea from: @@ -136,8 +139,14 @@ def get_hyperparameter_search_space( # hyperparameter. cs.add_condition(GreaterThanCondition(num_units_hp, num_layers_hp, i)) - add_hyperparameter(cs, units_final_layer, UniformIntegerHyperparameter) + # add_hyperparameter(cs, units_final_layer, UniformIntegerHyperparameter) # TODO let dataset_properties decide if auto_regressive models is applicable - add_hyperparameter(cs, auto_regressive, CategoricalHyperparameter) + auto_regressive = get_hyperparameter(auto_regressive, CategoricalHyperparameter) + units_final_layer = get_hyperparameter(units_final_layer, UniformIntegerHyperparameter) + + cond_units_final_layer = EqualsCondition(units_final_layer, auto_regressive, False) + cs.add_hyperparameters([auto_regressive, units_final_layer]) + cs.add_condition(cond_units_final_layer) + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index a8ddee9f0..9c869b408 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -64,7 +64,7 @@ def __init__(self, **kwargs: Dict): # RNN is naturally auto-regressive. 
However, we will not consider it as a decoder for deep AR model self.auto_regressive = True self.rnn_kwargs = None - self.lagged_value = [0, 1, 2, 3, 4, 5, 6, 7] + self.lagged_value = [0, 1, 2, 3, 4, 5, 6] @property def _required_fit_requirements(self) -> List[FitRequirement]: @@ -111,7 +111,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: if freq is not None: try: freq = FREQUENCY_MAP[freq] - self.lagged_value = [0] + get_lags_for_frequency(freq) + lagged_values = get_lags_for_frequency(freq) + self.lagged_value = [lag - 1 for lag in lagged_values] except Exception: warnings.warn(f'cannot find the proper lagged value for {freq}, we use the default lagged value') pass diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 9611b3f32..706532d6b 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -163,7 +163,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor outputs = forecast else: if isinstance(self.model, ForecastingDeepARNet) and self.model.encoder_bijective_seq_output: - if self.window_size> past_target.shape[1]: + if self.window_size > past_target.shape[1]: all_targets = torch.cat([past_target[:, 1:, ], future_targets], dim=1) else: all_targets = torch.cat([past_target[:, 1 - self.window_size:, ], future_targets], dim=1) @@ -174,6 +174,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor outputs = self.model(past_target, future_targets) loss_func = self.criterion_preparation(**criterion_kwargs) + loss = loss_func(self.criterion, outputs) loss.backward() diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 4f1b2ce61..993254b78 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -335,6 +335,7 @@ def _get_hyperparameter_search_space(self, cs.get_hyperparameter_names() """ + self.configuration_space = cs self.dataset_properties = dataset_properties return cs From e4c53582891de89679a244ce65abe670c02a2be2 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 4 Jan 2022 15:25:56 +0100 Subject: [PATCH 110/347] maint --- .../data_loader/time_series_forecasting_data_loader.py | 4 ++-- .../trainer/forecasting_trainer/forecasting_base_trainer.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 4e04436a1..a31c7db28 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -336,8 +336,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: seq_idx_inactivate = np.where(self.random_state.rand(seq_train_length.size) > fraction_seq) seq_train_length[seq_idx_inactivate] = 0 # this budget will reduce the number of samples inside each sequence, e.g., the samples becomes more sparse - num_instances_per_seqs = np.round(np.ceil(num_instances_train / num_instances_dataset * seq_train_length) * 
- fraction_samples_per_seq) + num_instances_per_seqs = np.ceil(np.ceil(num_instances_train / num_instances_dataset * seq_train_length) * + fraction_samples_per_seq) num_instances_per_seqs = num_instances_per_seqs.astype(seq_train_length.dtype) # at least one element of each sequence should be selected diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 706532d6b..4a2e635e9 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -166,7 +166,10 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor if self.window_size > past_target.shape[1]: all_targets = torch.cat([past_target[:, 1:, ], future_targets], dim=1) else: - all_targets = torch.cat([past_target[:, 1 - self.window_size:, ], future_targets], dim=1) + if self.window_size == 1: + all_targets = future_targets + else: + all_targets = torch.cat([past_target[:, 1 - self.window_size:, ], future_targets], dim=1) past_target, criterion_kwargs = self.data_preparation(past_target, all_targets.to(self.device)) else: past_target, criterion_kwargs = self.data_preparation(past_target, future_targets.to(self.device)) From a0a01b36e5ebac17e39c46c509d07af32eb9fb96 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 5 Jan 2022 15:16:57 +0100 Subject: [PATCH 111/347] new init, faster DeepAR inference in trainer --- autoPyTorch/api/base_task.py | 6 +- autoPyTorch/api/time_series_forecasting.py | 10 +++- .../configs/forecasting_init_cfgs.json | 59 ++++++++++++++++++ autoPyTorch/optimizer/smbo.py | 14 ++++- autoPyTorch/optimizer/utils.py | 60 +++++++++++++++++-- .../setup/network/forecasting_network.py | 16 ++++- .../forecasting_backbone/RNNEncoder.py | 3 +- .../forecasting_backbone/__init__.py | 1 - .../forecasting_decoder/MLPDecoder.py | 2 +- .../forecasting_decoder/RNNDecoder.py | 35 +++-------- .../forecasting_decoder/__init__.py | 2 - .../forecasting_base_trainer.py | 45 +++++++------- 12 files changed, 192 insertions(+), 61 deletions(-) create mode 100644 autoPyTorch/configs/forecasting_init_cfgs.json diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 5e4317736..8dcb72146 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -753,7 +753,8 @@ def _search( load_models: bool = True, portfolio_selection: Optional[str] = None, dask_client: Optional[dask.distributed.Client] = None, - time_series_forecasting: bool = False + time_series_forecasting: bool = False, + **kwargs: Dict[str, Any] ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -880,6 +881,8 @@ def _search( `AutoPyTorch Tabular `_ time_series_forecasting: bool if time series forecasting task is implemented. 
+ kwargs: Dict + additional arguments Returns: self @@ -1091,6 +1094,7 @@ def _search( portfolio_selection=portfolio_selection, pynisher_context=self._multiprocessing_context, time_series_forecasting=time_series_forecasting, + **kwargs, ) try: run_history, self._results_manager.trajectory, budget_type = \ diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 5c1c71829..0155dc1f6 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -140,6 +140,8 @@ def search( portfolio_selection: Optional[str] = None, shift_input_data: bool = True, normalize_y: bool = True, + suggested_init_models: Optional[List[str]] = ['MLP', 'DeepAR'], + custom_init_setting_path: Optional[str] = None, ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -243,6 +245,10 @@ def search( train_with_log_prob: bool if the network is trained with log_prob losses, this will create a network header that is different from the current version. + suggested_init_models: Optional[List[str]] + suggested initial models with their default configurations setting + custom_init_setting_path: Optional[str] + path to a json file that contains the initial configuration suggested by the users Returns: self @@ -322,7 +328,9 @@ def search( disable_file_output=disable_file_output, load_models=load_models, portfolio_selection=portfolio_selection, - time_series_forecasting=self.time_series_forecasting + time_series_forecasting=self.time_series_forecasting, + suggested_init_models=suggested_init_models, + custom_init_setting_path=custom_init_setting_path, ) def predict( diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json new file mode 100644 index 000000000..ac7421de2 --- /dev/null +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -0,0 +1,59 @@ +{ + "trainer": { + "data_loader:batch_size": 32, + "data_loader:backcast": false, + "data_loader:num_batches_per_epoch": 50, + "imputer:numerical_strategy": "median", + "lr_scheduler:__choice__": "ReduceLROnPlateau", + "lr_scheduler:ReduceLROnPlateau:mode": "min", + "lr_scheduler:ReduceLROnPlateau:factor": 0.5, + "lr_scheduler:ReduceLROnPlateau:patience": 10, + "optimizer:__choice__": "AdamOptimizer", + "optimizer:AdamOptimizer:lr": 0.001, + "optimizer:AdamOptimizer:weight_decay": 1e-08, + "optimizer:AdamOptimizer:beta1": 0.9, + "optimizer:AdamOptimizer:beta2": 0.999, + "network_init:__choice__": "XavierInit", + "network_init:XavierInit:bias_strategy": "Zero", + "target_scaler:__choice__": "TargetMeanAbsScaler", + "trainer:__choice__": "ForecastingStandardTrainer", + "network_embedding:__choice__": "NoEmbedding" + }, + "models": { + "MLP": { + "loss:__choice__": "DistributionLoss", + "network_backbone:__choice__": "MLPEncoder", + "network_backbone:MLPEncoder:num_groups": 1, + "network_backbone:MLPEncoder:num_units_1": 40, + "network_backbone:MLPEncoder:activation": "relu", + "network_backbone:MLPEncoder:use_dropout": false, + "network_backbone:MLPEncoder:normalization": "NoNorm", + "network_backbone:MLPDecoder:num_layers": 0, + "network_backbone:MLPDecoder:units_final_layer": 40, + "network_backbone:MLPDecoder:auto_regressive": false, + "network:forecast_strategy": "sample", + "network:net_out_type": "distribution", + "loss:DistributionLoss:dist_cls": "studentT", + "network:aggregation": "median", + "network:num_samples": 100 + }, + "DeepAR": { + "loss:__choice__": "DistributionLoss", + 
"network_backbone:__choice__": "RNNEncoder", + "network_backbone:RNNEncoder:num_layers": 2, + "network_backbone:RNNEncoder:hidden_size": 40, + "network_backbone:RNNEncoder:bidirectional": false, + "network_backbone:RNNEncoder:use_dropout": true, + "network_backbone:RNNEncoder:dropout": 0.1, + "network_backbone:RNNEncoder:cell_type": "lstm", + "network_backbone:RNNEncoder:decoder_type": "MLPDecoder", + "network_backbone:MLPDecoder:num_layers": 0, + "network_backbone:MLPDecoder:auto_regressive": true, + "network:forecast_strategy": "sample", + "network:net_out_type": "distribution", + "loss:DistributionLoss:dist_cls": "studentT", + "network:aggregation": "median", + "network:num_samples": 100 + } + } +} \ No newline at end of file diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 20c5cffd9..c9dcf5a68 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -27,7 +27,7 @@ from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator -from autoPyTorch.optimizer.utils import read_return_initial_configurations +from autoPyTorch.optimizer.utils import read_return_initial_configurations, read_forecasting_init_configurations from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -116,7 +116,8 @@ def __init__(self, pynisher_context: str = 'spawn', min_budget: int = 5, max_budget: int = 50, - time_series_forecasting: bool = False + time_series_forecasting: bool = False, + **kwargs: Dict[str, Any] ): """ Interface to SMAC. 
This method calls the SMAC optimize method, and allows @@ -194,6 +195,8 @@ def __init__(self, time_series_forecasting (bool): If we want to apply this optimizer to optimize time series prediction tasks (which has a different tae) + kwargs (Dict): + Additional Arguments for forecasting intialization tasks """ super(AutoMLSMBO, self).__init__() # data related @@ -254,6 +257,13 @@ def __init__(self, if portfolio_selection is not None: self.initial_configurations = read_return_initial_configurations(config_space=config_space, portfolio_selection=portfolio_selection) + suggested_init_models: Optional[List[str]] = kwargs.get('suggested_init_models', None) + custom_init_setting_path: Optional[str] = kwargs.get('custom_init_setting_path', None) + if suggested_init_models is not None or custom_init_setting_path is not None: + self.initial_configurations = read_forecasting_init_configurations( + config_space=config_space, + suggested_init_models=suggested_init_models, + custom_init_setting_path=custom_init_setting_path) def reset_data_manager(self) -> None: if self.datamanager is not None: diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index 6fb9d5024..5954327fe 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -1,16 +1,15 @@ import json import os import warnings -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional, Union from ConfigSpace.configuration_space import Configuration, ConfigurationSpace def read_return_initial_configurations( - config_space: ConfigurationSpace, - portfolio_selection: str + config_space: ConfigurationSpace, + portfolio_selection: str ) -> List[Configuration]: - # read and validate initial configurations portfolio_path = portfolio_selection if portfolio_selection != "greedy" else \ os.path.join(os.path.dirname(__file__), '../configs/greedy_portfolio.json') @@ -31,3 +30,56 @@ def read_return_initial_configurations( f"Therefore, it can't be used as an initial " f"configuration as it does not match the current config space. 
") return initial_configurations + + +def read_forecasting_init_configurations(config_space: ConfigurationSpace, + suggested_init_models: Optional[List[str]] = None, + custom_init_setting_path: Optional[str] = None, + ): + forecasting_init_path = os.path.join(os.path.dirname(__file__), '../configs/forecasting_init_cfgs.json') + initial_configurations_dict: List[Dict] = list() + initial_configurations = [] + + if suggested_init_models: + with open(forecasting_init_path, 'r') as f: + forecasting_init_dict: [Dict[str, Any]] = json.load(f) + cfg_trainer: Dict = forecasting_init_dict['trainer'] + models_name_to_cfgs: Dict = forecasting_init_dict['models'] + + window_size = config_space.get_default_configuration()["data_loader:window_size"] + for model_name in suggested_init_models: + cfg_tmp = cfg_trainer.copy() + if model_name != 'NBEATS': + cfg_tmp['data_loader:window_size'] = window_size + model_cfg = models_name_to_cfgs.get(model_name, None) + if model_cfg is None: + warnings.warn(f'Cannot to find the corresponding information of model {model_name} from,' + f' forecasting_init_cfgs, currently only {list(models_name_to_cfgs.keys())} are ' + f'supported') + + cfg_tmp.update(model_cfg) + initial_configurations_dict.append(cfg_tmp) + + if custom_init_setting_path is not None: + try: + with open(custom_init_setting_path, 'r') as f: + initial_configurations_custom_dict: Union[List[Dict[str, Any]], Dict] = json.load(f) + except FileNotFoundError: + raise FileNotFoundError("The path: {} provided for 'custome_setting_path' for " + "the file containing the custom initial configurations " + "does not exist. Please provide a valid path".format(custom_init_setting_path)) + if isinstance(initial_configurations_custom_dict, list): + initial_configurations_dict.extend(initial_configurations_custom_dict) + else: + initial_configurations_dict.append(initial_configurations_custom_dict) + + for configuration_dict in initial_configurations_dict: + try: + configuration = Configuration(config_space, configuration_dict) + initial_configurations.append(configuration) + except Exception as e: + warnings.warn(f"Failed to convert {configuration_dict} into" + f" a Configuration with error {e}. " + f"Therefore, it can't be used as an initial " + f"configuration as it does not match the current config space. 
") + return initial_configurations diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index f601f8bba..1404d1f02 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -410,6 +410,11 @@ def __init__(self, self.encoder_bijective_seq_output = kwargs['encoder_properties']['bijective_seq_output'] self.cached_lag_mask_encoder_test = None + self.only_generate_future_dist = False + + def train(self, mode: bool = True) -> nn.Module: + self.only_generate_future_dist = False + return super().train(mode=mode) def forward(self, targets_past: torch.Tensor, @@ -450,7 +455,8 @@ def forward(self, x_input, _ = self.encoder(x_input, output_seq=True) else: x_input = self.encoder(x_input, output_seq=True) - + if self.only_generate_future_dist: + x_input = x_input[:, -self.n_prediction_steps:] net_output = self.head(self.decoder(x_input)) return self.rescale_output(net_output, loc, scale, self.device) else: @@ -550,7 +556,13 @@ def forward(self, else: raise ValueError(f'Unknown aggregation: {self.aggregation}') - def pred_from_net_output(self, net_output: torch.Tensor): + def predict(self, + targets_past: torch.Tensor, + features_past: Optional[torch.Tensor] = None, + features_future: Optional[torch.Tensor] = None, + features_static: Optional[torch.Tensor] = None + ): + net_output = self(targets_past, features_past, features_future) return net_output diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py index 202013701..7746784dd 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py @@ -118,7 +118,8 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: rnn_kwargs = {'hidden_size': self.config['hidden_size'], 'num_layers': self.config['num_layers'], 'bidirectional': self.config['bidirectional'], - 'cell_type': self.config['cell_type']} # used for initialize + 'cell_type': self.config['cell_type'], + 'dropout': self.config.get('dropout', 0.0)} # used for initialize X.update({'rnn_kwargs': rnn_kwargs}) return super().transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index 7936d281d..419f0e2d7 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -274,7 +274,6 @@ def get_hyperparameter_search_space( else: conditions_to_add.append(or_cond[0]) cs.add_conditions(conditions_to_add) - self.configuration_space_ = cs self.dataset_properties_ = dataset_properties return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index 48fb4e68c..a29f807ff 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -14,7 +14,7 @@ BaseForecastingDecoder -class ForecastingMLPHeader(BaseForecastingDecoder): +class ForecastingMLPDecoder(BaseForecastingDecoder): def _build_decoder(self, input_shape: Tuple[int, ...], n_prediction_heads: int, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index 9c869b408..91d6425e5 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -3,10 +3,8 @@ import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter, - UniformFloatHyperparameter -) +from ConfigSpace.hyperparameters import Constant + import torch from torch import nn @@ -29,10 +27,9 @@ def __init__(self, hidden_size: int, num_layers: int, cell_type: str, - config: Dict[str, Any], + dropout: float, lagged_value: Optional[Union[List, np.ndarray]]=None): super().__init__() - self.config = config if cell_type == 'lstm': cell = nn.LSTM else: @@ -42,7 +39,7 @@ def __init__(self, self.lstm = cell(input_size=in_features, hidden_size=hidden_size, num_layers=num_layers, - dropout=config.get("dropout", 0.0), + dropout=dropout, bidirectional=False, batch_first=True) @@ -54,7 +51,7 @@ def forward(self, x: torch.Tensor, return outputs, hidden_state -class ForecastingRNNHeader(BaseForecastingDecoder): +class ForecastingRNNDecoder(BaseForecastingDecoder): """ Standard searchable RNN decoder for time series data, only works when the encoder is """ @@ -68,7 +65,7 @@ def __init__(self, **kwargs: Dict): @property def _required_fit_requirements(self) -> List[FitRequirement]: - fit_requirement = super(ForecastingRNNHeader, self)._required_fit_requirements + fit_requirement = super(ForecastingRNNDecoder, self)._required_fit_requirements fit_requirement.append(FitRequirement('rnn_kwargs', (Dict,), user_defined=False, dataset_property=False)) return fit_requirement @@ -81,11 +78,12 @@ def _build_decoder(self, num_layers = 2 * self.rnn_kwargs['num_layers'] if self.rnn_kwargs['bidirectional'] else self.rnn_kwargs[ 'num_layers'] cell_type = self.rnn_kwargs['cell_type'] + dropout = self.rnn_kwargs['dropout'] decoder = RNN_Module(in_features=dataset_properties['output_shape'][-1], hidden_size=hidden_size, num_layers=num_layers, cell_type=cell_type, - config=self.config, + dropout=dropout, lagged_value=self.lagged_value ) return decoder, hidden_size @@ -131,22 +129,7 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict] = None, - use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='use_dropout', - value_range=(True, False), - default_value=False), - dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='dropout', - value_range=(0., 0.5), - default_value=0.2), ) -> ConfigurationSpace: cs = CS.ConfigurationSpace() - - use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) - dropout = get_hyperparameter(dropout, UniformFloatHyperparameter) - - cs.add_hyperparameters([use_dropout, 
dropout]) - - # Add plain hyperparameters - # Hidden size is given by the encoder architecture - cs.add_condition(CS.EqualsCondition(dropout, use_dropout, True)) - + cs.add_hyperparameter(Constant('decoder_type', 'RNN')) # this helps the encoder to recognize the decoder. return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py index 7df7ac6a2..69178763f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py @@ -191,6 +191,4 @@ def get_hyperparameter_search_space( self.configuration_space_ = cs self.dataset_properties_ = dataset_properties - import pdb - pdb.set_trace() return cs diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 4a2e635e9..233e3072b 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -81,6 +81,8 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, float: training loss Dict[str, float]: scores for each desired metric """ + import time + time_start = time.time() loss_sum = 0.0 N = 0 self.model.train() @@ -111,6 +113,7 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, self._scheduler_step(step_interval=StepIntervalUnit.epoch, loss=loss_sum / N) + print(f'time used for trainging epoch: {time.time() - time_start}') if self.metrics_during_training: return loss_sum / N, self.compute_metrics(outputs_data, targets_data) else: @@ -200,7 +203,13 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, float: test loss Dict[str, float]: scores for each desired metric """ - self.model.eval() + import time + time_start = time.time() + if not isinstance(self.model, (ForecastingDeepARNet, ForecastingSeq2SeqNet)): + # To save time, we simply make one step prediction for DeepAR and Seq2Seq + self.model.eval() + if isinstance(self.model, ForecastingDeepARNet): + self.model.only_generate_future_dist = True loss_sum = 0.0 N = 0 @@ -217,30 +226,25 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, batch_size = past_target.shape[0] - if isinstance(self.model, ForecastingDeepARNet) or isinstance(self.model, ForecastingSeq2SeqNet): - future_targets = self.cast_targets(future_targets) + future_targets = self.cast_targets(future_targets) - past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) + past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) - outputs = self.model(past_target) - # DeepAR only generate sampled points, we replace log_prob loss with MSELoss - # outputs = self.model.pred_from_net_output(outputs) - outputs = outputs.detach().cpu() - loss = F.mse_loss(outputs, future_targets) + if isinstance(self.model, (ForecastingDeepARNet, ForecastingSeq2SeqNet)): + outputs = self.model(past_target, future_targets) else: - # prepare - future_targets = self.cast_targets(future_targets).to(self.device) + outputs = self.model(past_target) - past_target, 
criterion_kwargs = self.data_preparation(past_target, future_targets) + # prepare + future_targets = future_targets.to(self.device) - outputs = self.model(past_target) - if isinstance(outputs, list): - loss = [self.criterion(output, future_targets) for output in outputs] - loss = torch.mean(torch.Tensor(loss)) - else: - loss = self.criterion(outputs, future_targets) - outputs = self.model.pred_from_net_output(outputs) - outputs = outputs.detach().cpu() + if isinstance(outputs, list): + loss = [self.criterion(output, future_targets) for output in outputs] + loss = torch.mean(torch.Tensor(loss)) + else: + loss = self.criterion(outputs, future_targets) + outputs = self.model.pred_from_net_output(outputs) + outputs = outputs.detach().cpu() loss_sum += loss.item() * batch_size N += batch_size @@ -263,6 +267,7 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, self._scheduler_step(step_interval=StepIntervalUnit.valid, loss=loss_sum / N) self.model.train() + print(f'time for evaluation: {time.time() - time_start}') return loss_sum / N, self.compute_metrics(outputs_data, targets_data) def compute_metrics(self, outputs_data: List[torch.Tensor], targets_data: List[torch.Tensor] From 27e0eb0e14dfc71af28b8538a337ead453ede40d Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 6 Jan 2022 12:53:01 +0100 Subject: [PATCH 112/347] more losses types --- autoPyTorch/api/time_series_forecasting.py | 4 +- .../configs/forecasting_init_cfgs.json | 87 ++++++++++++++++++- autoPyTorch/datasets/time_series_dataset.py | 10 +-- autoPyTorch/optimizer/smbo.py | 5 +- autoPyTorch/optimizer/utils.py | 5 +- .../RegressionLoss.py | 8 +- .../setup/network/forecasting_network.py | 2 +- .../forecasting_backbone/TCNEncoder.py | 64 +++++++++----- .../forecasting_decoder/NBEATSDecoder.py | 28 +++--- .../forecasting_decoder/RNNDecoder.py | 2 +- .../time_series_forecasting_data_loader.py | 13 ++- .../pipeline/components/training/losses.py | 49 ++++++++++- .../trainer/forecasting_trainer/__init__.py | 2 +- .../forecasting_base_trainer.py | 12 ++- 14 files changed, 231 insertions(+), 60 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 0155dc1f6..5ee60736e 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -26,7 +26,7 @@ class TimeSeriesForecastingTask(BaseTask): """ - Time Series Forcasting API to the pipelines. + Time Series Forecasting API to the pipelines. Args: seed (int): seed to be used for reproducibility. n_jobs (int), (default=1): number of consecutive processes to spawn. 
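A usage note for the two initialization arguments wired into search() in the previous patch (their default is adjusted in the hunk just below). A minimal sketch only: everything except the two keyword arguments is elided, and my_forecasting_init.json is a hypothetical user file whose entries use the same "step:hyperparameter" keys as forecasting_init_cfgs.json above:

    from autoPyTorch.api.time_series_forecasting import TimeSeriesForecastingTask

    api = TimeSeriesForecastingTask()
    api.search(
        ...,  # data, budget and metric arguments as usual
        suggested_init_models=['MLP', 'DeepAR'],              # keys of the "models" section of forecasting_init_cfgs.json
        custom_init_setting_path='my_forecasting_init.json',  # optional extra initial configurations
    )

Custom entries that do not form a complete, valid configuration for the current search space are only warned about and dropped by read_forecasting_init_configurations above rather than raising.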
@@ -140,7 +140,7 @@ def search( portfolio_selection: Optional[str] = None, shift_input_data: bool = True, normalize_y: bool = True, - suggested_init_models: Optional[List[str]] = ['MLP', 'DeepAR'], + suggested_init_models: Optional[List[str]] = None, custom_init_setting_path: Optional[str] = None, ) -> 'BaseTask': """ diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index ac7421de2..f3b73fb8f 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -22,6 +22,8 @@ "models": { "MLP": { "loss:__choice__": "DistributionLoss", + "network:net_out_type": "distribution", + "loss:DistributionLoss:dist_cls": "studentT", "network_backbone:__choice__": "MLPEncoder", "network_backbone:MLPEncoder:num_groups": 1, "network_backbone:MLPEncoder:num_units_1": 40, @@ -32,28 +34,107 @@ "network_backbone:MLPDecoder:units_final_layer": 40, "network_backbone:MLPDecoder:auto_regressive": false, "network:forecast_strategy": "sample", - "network:net_out_type": "distribution", - "loss:DistributionLoss:dist_cls": "studentT", "network:aggregation": "median", "network:num_samples": 100 }, "DeepAR": { "loss:__choice__": "DistributionLoss", + "network:net_out_type": "distribution", + "loss:DistributionLoss:dist_cls": "studentT", "network_backbone:__choice__": "RNNEncoder", + "network_backbone:RNNEncoder:cell_type": "lstm", "network_backbone:RNNEncoder:num_layers": 2, "network_backbone:RNNEncoder:hidden_size": 40, "network_backbone:RNNEncoder:bidirectional": false, "network_backbone:RNNEncoder:use_dropout": true, "network_backbone:RNNEncoder:dropout": 0.1, - "network_backbone:RNNEncoder:cell_type": "lstm", "network_backbone:RNNEncoder:decoder_type": "MLPDecoder", "network_backbone:MLPDecoder:num_layers": 0, "network_backbone:MLPDecoder:auto_regressive": true, "network:forecast_strategy": "sample", + "network:aggregation": "median", + "network:num_samples": 100 + }, + "Seq2SeqRNN2MLP": { + "loss:__choice__": "DistributionLoss", "network:net_out_type": "distribution", "loss:DistributionLoss:dist_cls": "studentT", + "network_backbone:__choice__": "RNNEncoder", + "network_backbone:RNNEncoder:cell_type": "gru", + "network_backbone:RNNEncoder:num_layers": 1, + "network_backbone:RNNEncoder:hidden_size": 50, + "network_backbone:RNNEncoder:bidirectional": true, + "network_backbone:RNNEncoder:use_dropout": false, + "network_backbone:RNNEncoder:decoder_type": "MLPDecoder", + "network_backbone:MLPDecoder:num_layers": 0, + "network_backbone:MLPDecoder:auto_regressive": false, + "network_backbone:MLPDecoder:units_final_layer": 30, + "network:forecast_strategy": "sample", "network:aggregation": "median", "network:num_samples": 100 + }, + "Seq2SeqTCN2MLP": { + "loss:__choice__": "DistributionLoss", + "network:net_out_type": "distribution", + "loss:DistributionLoss:dist_cls": "studentT", + "network_backbone:__choice__": "TCNEncoder", + "network_backbone:TCNEncoder:use_dropout": false, + "network_backbone:TCNEncoder:num_blocks": 3, + "network_backbone:TCNEncoder:num_filters_1": 30, + "network_backbone:TCNEncoder:kernel_size_1": 7, + "network_backbone:TCNEncoder:num_filters_2": 30, + "network_backbone:TCNEncoder:kernel_size_2": 3, + "network_backbone:TCNEncoder:num_filters_3": 30, + "network_backbone:TCNEncoder:kernel_size_3": 3, + "network_backbone:MLPDecoder:num_layers": 0, + "network_backbone:MLPDecoder:auto_regressive": false, + "network_backbone:MLPDecoder:units_final_layer": 30, + "network:forecast_strategy": "sample", 
+ "network:aggregation": "median", + "network:num_samples": 100 + }, + "Seq2SeqRNN2RNN": { + "loss:__choice__": "DistributionLoss", + "network:net_out_type": "distribution", + "loss:DistributionLoss:dist_cls": "studentT", + "network_backbone:__choice__": "RNNEncoder", + "network_backbone:RNNEncoder:cell_type": "gru", + "network_backbone:RNNEncoder:num_layers": 3, + "network_backbone:RNNEncoder:hidden_size": 32, + "network_backbone:RNNEncoder:bidirectional": true, + "network_backbone:RNNEncoder:use_dropout": false, + "network_backbone:RNNEncoder:decoder_type": "RNNDecoder", + "network_backbone:RNNDecoder:decoder_type": "RNNDecoder", + "network:forecast_strategy": "sample", + "network:aggregation": "median", + "network:num_samples": 100 + }, + "NBEATS": { + "data_loader:backcast": true, + "data_loader:backcast_period": 2, + "loss:__choice__": "RegressionLoss", + "loss:RegressionLoss:loss_name": "mase", + "network:net_out_type": "regression", + "network_backbone:__choice__": "NBEATSEncoder", + "network_backbone:NBEATSDecoder:use_dropout": true, + "network_backbone:NBEATSDecoder:backcast_loss_ratio": 0.0, + "network_backbone:NBEATSDecoder:normalization": "NoNorm", + "network_backbone:NBEATSDecoder:activation": "relu", + "network_backbone:NBEATSDecoder:num_stacks": 2, + "network_backbone:NBEATSDecoder:num_blocks_1": 3, + "network_backbone:NBEATSDecoder:num_layers_1": 2, + "network_backbone:NBEATSDecoder:width_1": 256, + "network_backbone:NBEATSDecoder:weight_sharing_1": true, + "network_backbone:NBEATSDecoder:stack_type_1": "trend", + "network_backbone:NBEATSDecoder:expansion_coefficient_length_interpretable_1": 3, + "network_backbone:NBEATSDecoder:dropout_1": 0.1, + "network_backbone:NBEATSDecoder:num_blocks_2": 3, + "network_backbone:NBEATSDecoder:num_layers_2": 2, + "network_backbone:NBEATSDecoder:width_2": 512, + "network_backbone:NBEATSDecoder:weight_sharing_2": true, + "network_backbone:NBEATSDecoder:stack_type_2": "seasonality", + "network_backbone:NBEATSDecoder:expansion_coefficient_length_interpretable_2": 3, + "network_backbone:NBEATSDecoder:dropout_2": 0.1 } } } \ No newline at end of file diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 76ae91981..db81e5dc2 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -116,12 +116,9 @@ def __getitem__(self, index: int, train: bool = True) \ # Y_Past does not need to be fed to the network, we keep it as np array else: Y_future = None - if train: - # TODO consider static information and missing information - return {"past_target": torch.from_numpy(X)}, Y_future - else: - return {"past_target": torch.from_numpy(X), - "mase_coefficient": self.mase_coefficient}, Y_future + + return {"past_target": torch.from_numpy(X), + "mase_coefficient": self.mase_coefficient}, Y_future def __len__(self) -> int: return self.X.shape[0] @@ -331,7 +328,6 @@ def __init__(self, self.splits = self.get_splits_from_resampling_strategy() - def __getitem__(self, idx, train=True): if idx < 0: if -idx > len(self): diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index c9dcf5a68..d3dca4243 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -259,7 +259,10 @@ def __init__(self, portfolio_selection=portfolio_selection) suggested_init_models: Optional[List[str]] = kwargs.get('suggested_init_models', None) custom_init_setting_path: Optional[str] = kwargs.get('custom_init_setting_path', None) - if 
suggested_init_models is not None or custom_init_setting_path is not None: + + # if suggested_init_models is an empty list, and custom_init_setting_path is not provided, we + # do not provide any initial configurations + if suggested_init_models is None or suggested_init_models or custom_init_setting_path is not None: self.initial_configurations = read_forecasting_init_configurations( config_space=config_space, suggested_init_models=suggested_init_models, diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index 5954327fe..bd3f0b662 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -40,13 +40,16 @@ def read_forecasting_init_configurations(config_space: ConfigurationSpace, initial_configurations_dict: List[Dict] = list() initial_configurations = [] - if suggested_init_models: + if suggested_init_models or suggested_init_models is None: with open(forecasting_init_path, 'r') as f: forecasting_init_dict: [Dict[str, Any]] = json.load(f) cfg_trainer: Dict = forecasting_init_dict['trainer'] models_name_to_cfgs: Dict = forecasting_init_dict['models'] window_size = config_space.get_default_configuration()["data_loader:window_size"] + if suggested_init_models is None: + suggested_init_models = list(models_name_to_cfgs.keys()) + for model_name in suggested_init_models: cfg_tmp = cfg_trainer.copy() if model_name != 'NBEATS': diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py index c9fba041e..a21a10a04 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py @@ -11,7 +11,7 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import \ ForecastingLossComponents -from autoPyTorch.pipeline.components.training.losses import L1Loss, MSELoss +from autoPyTorch.pipeline.components.training.losses import L1Loss, MSELoss, MAPELoss, MASELoss class RegressionLoss(ForecastingLossComponents): @@ -26,6 +26,10 @@ def __init__(self, self.loss = L1Loss elif loss_name == 'mse': self.loss = MSELoss + elif loss_name == 'mase': + self.loss = MASELoss + elif loss_name == 'mape': + self.loss = MAPELoss else: raise ValueError(f"Unsupported loss type {loss_name}!") self.random_state = random_state @@ -47,7 +51,7 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, loss_name: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="loss_name", - value_range=('l1', 'mse'), + value_range=('l1', 'mse', 'mase', 'mape'), default_value='mse'), ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 1404d1f02..86c346d39 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -595,9 +595,9 @@ def forward(self, backcast = backcast.reshape(targets_past.shape) forecast = forecast.reshape(forcast_shape) - backcast = self.rescale_output(backcast, loc, scale, self.device) forecast = self.rescale_output(forecast, loc, 
scale, self.device) if self.training: + backcast = self.rescale_output(backcast, loc, scale, self.device) return backcast, forecast else: return forecast diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py index 7bae11c18..3bbb40c6c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py @@ -72,7 +72,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class _TemporalConvNet(EncoderNetwork): - def __init__(self, num_inputs: int, num_channels: List[int], kernel_size: int = 2, dropout: float = 0.2): + def __init__(self, num_inputs: int, num_channels: List[int], kernel_size: List[int], dropout: float = 0.2): super(_TemporalConvNet, self).__init__() layers: List[Any] = [] num_levels = len(num_channels) @@ -88,14 +88,14 @@ def __init__(self, num_inputs: int, num_channels: List[int], kernel_size: int = # stride_values.extend([stride, stride]) layers += [_TemporalBlock(in_channels, out_channels, - kernel_size, + kernel_size[i], stride=stride, dilation=dilation_size, - padding=(kernel_size - 1) * dilation_size, + padding=(kernel_size[i] - 1) * dilation_size, dropout=dropout)] # receptive_field_block = 1 + (kernel_size - 1) * dilation_size * \ # (int(np.prod(stride_values[:-2])) * (1 + stride_values[-2])) - receptive_field_block = 1 + 2 * (kernel_size - 1) * dilation_size # stride = 1, we ignore stide computation + receptive_field_block = 1 + 2 * (kernel_size[i] - 1) * dilation_size # stride = 1, we ignore stride computation receptive_field += receptive_field_block self.receptive_field = receptive_field self.network = nn.Sequential(*layers) @@ -116,13 +116,16 @@ class TCNEncoder(BaseForecastingEncoder): """ Temporal Convolutional Network backbone for time series data (see https://arxiv.org/pdf/1803.01271.pdf). 
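For orientation, the per-block formula above (receptive_field_block = 1 + 2 * (kernel_size[i] - 1) * dilation_size, with stride fixed at 1) can be checked with a small illustrative calculation. The kernel sizes (7, 3, 3) come from the Seq2Seq-TCN2MLP default configuration earlier in this patch; the doubling dilation (2 ** i) is the usual TCN choice and is assumed here, since dilation_size itself is not shown in this hunk:

    # Illustrative only: approximate receptive field of a 3-block TCN with kernel sizes (7, 3, 3),
    # assuming dilation 2**i per block and stride 1, using the same summation as _TemporalConvNet.
    kernel_sizes = [7, 3, 3]
    blocks = [1 + 2 * (k - 1) * 2 ** i for i, k in enumerate(kernel_sizes)]  # [13, 9, 17]
    receptive_field = sum(blocks)  # 39 past time steps (an approximation, as the code comment notes)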
""" + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: - num_channels = [self.config["num_filters_0"]] - for i in range(1, self.config["num_blocks"]): + num_channels = [self.config["num_filters_1"]] + kernel_size = [self.config["kernel_size_1"]] + for i in range(2, self.config["num_blocks"] + 1): num_channels.append(self.config[f"num_filters_{i}"]) + kernel_size.append(self.config[f"kernel_size_{i}"]) encoder = _TemporalConvNet(input_shape[-1], num_channels, - kernel_size=self.config["kernel_size"], + kernel_size=kernel_size, dropout=self.config["dropout"] if self.config["use_dropout"] else 0.0 ) self._receptive_field = encoder.receptive_field @@ -161,7 +164,7 @@ def get_hyperparameter_search_space( default_value=32, log=True), kernel_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="kernel_size", - value_range=(4, 64), + value_range=(2, 64), default_value=32, log=True), use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="use_dropout", @@ -174,27 +177,44 @@ def get_hyperparameter_search_space( cs = ConfigurationSpace() min_num_blocks, max_num_blocks = num_blocks.value_range - num_blocks_hp = get_hyperparameter(num_blocks, UniformIntegerHyperparameter) - cs.add_hyperparameter(num_blocks_hp) - - add_hyperparameter(cs, kernel_size, UniformIntegerHyperparameter) - - use_dropout_hp = get_hyperparameter(use_dropout, CategoricalHyperparameter) - cs.add_hyperparameter(use_dropout_hp) + num_blocks = get_hyperparameter(num_blocks, UniformIntegerHyperparameter) + cs.add_hyperparameter(num_blocks) - dropout_hp = get_hyperparameter(dropout, UniformFloatHyperparameter) - cs.add_hyperparameter(dropout_hp) - cs.add_condition(CS.EqualsCondition(dropout_hp, use_dropout_hp, True)) + use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) + cs.add_hyperparameter(use_dropout) - for i in range(0, int(max_num_blocks)): + for i in range(1, int(max_num_blocks) + 1): num_filter_search_space = HyperparameterSearchSpace(f"num_filters_{i}", value_range=num_filters.value_range, default_value=num_filters.default_value, log=num_filters.log) + kernel_size_search_space = HyperparameterSearchSpace(f"kernel_size_{i}", + value_range=kernel_size.value_range, + default_value=kernel_size.default_value, + log=kernel_size.log) num_filters_hp = get_hyperparameter(num_filter_search_space, UniformIntegerHyperparameter) + kernel_size_hp = get_hyperparameter(kernel_size_search_space, UniformIntegerHyperparameter) cs.add_hyperparameter(num_filters_hp) - if i >= int(min_num_blocks): - cs.add_condition(CS.GreaterThanCondition( - num_filters_hp, num_blocks_hp, i)) + cs.add_hyperparameter(kernel_size_hp) + if i > int(min_num_blocks): + cs.add_conditions([ + CS.GreaterThanCondition(num_filters_hp, num_blocks, i - 1), + CS.GreaterThanCondition(kernel_size_hp, num_blocks, i - 1) + ]) + + dropout_search_space = HyperparameterSearchSpace(hyperparameter='dropout_%d' % i, + value_range=dropout.value_range, + default_value=dropout.default_value, + log=dropout.log) + dropout_hp = get_hyperparameter(dropout_search_space, UniformFloatHyperparameter) + cs.add_hyperparameter(dropout_hp) + + dropout_condition_1 = CS.EqualsCondition(dropout_hp, use_dropout, True) + + if i > int(min_num_blocks): + dropout_condition_2 = CS.GreaterThanCondition(dropout_hp, num_blocks, i - 1) + cs.add_condition(CS.AndConjunction(dropout_condition_1, dropout_condition_2)) + else: + cs.add_condition(dropout_condition_1) return cs diff --git 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 2300c67e2..d55fc3d7d 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -9,7 +9,6 @@ from torch import nn - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_head.utils import _activations from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter @@ -17,6 +16,7 @@ from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.base_forecasting_decoder import \ BaseForecastingDecoder + # TODO we need to rewrite NBEATS part to make it neater!!! @@ -93,7 +93,7 @@ def decoder_properties(self): return decoder_properties def _build_decoder(self, input_shape: Tuple[int, ...], n_prediction_heads: int, - dataset_properties:Dict) -> Tuple[nn.Module, int]: + dataset_properties: Dict) -> Tuple[nn.Module, int]: in_features = input_shape[-1] stacks = [[] for _ in range(self.config['num_stacks'])] for stack_idx in range(1, self.config['num_stacks'] + 1): @@ -143,7 +143,7 @@ def get_hyperparameter_search_space( ), width: HyperparameterSearchSpace = HyperparameterSearchSpace( 'width', - value_range=(256, 2048), + value_range=(32, 1024), default_value=512, log=True ), @@ -158,15 +158,15 @@ def get_hyperparameter_search_space( default_value='generic'), expansion_coefficient_length_generic: HyperparameterSearchSpace = HyperparameterSearchSpace( 'expansion_coefficient_length_generic', - value_range=(1, 4), - default_value=3, - ), - expansion_coefficient_length_interpretable: HyperparameterSearchSpace = HyperparameterSearchSpace( - 'expansion_coefficient_length_interpretable', value_range=(16, 64), default_value=32, log=True ), + expansion_coefficient_length_interpretable: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'expansion_coefficient_length_interpretable', + value_range=(1, 4), + default_value=3, + ), activation: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="activation", value_range=tuple(_activations.keys()), @@ -289,20 +289,26 @@ def get_hyperparameter_search_space( cs.add_hyperparameters([*hps, expansion_coefficient_length_generic_hp, expansion_coefficient_length_interpretable_hp]) + cond_ecl_generic_cond_1 = EqualsCondition(expansion_coefficient_length_generic_hp, stack_type_hp, 'generic') + cond_ecl_interpretable_cond_1 = InCondition(expansion_coefficient_length_interpretable_hp, + stack_type_hp, ('seasonality', 'trend')) + if stack_idx > int(min_num_stacks): # The units of layer i should only exist # if there are at least i layers for hp in hps: cs.add_condition(GreaterThanCondition(hp, num_stacks, stack_idx - 1)) cond_ecl_generic = AndConjunction( - GreaterThanCondition(expansion_coefficient_length_generic_hp, num_stacks, stack_idx -1), - EqualsCondition(expansion_coefficient_length_generic_hp, stack_type_hp, 'generic') + GreaterThanCondition(expansion_coefficient_length_generic_hp, num_stacks, stack_idx - 1), + cond_ecl_generic_cond_1 ) cond_ecl_interpretable = AndConjunction( GreaterThanCondition(expansion_coefficient_length_interpretable_hp, num_stacks, stack_idx - 1), - 
InCondition(expansion_coefficient_length_interpretable_hp, stack_type_hp, ('seasonality', 'trend')) + cond_ecl_interpretable_cond_1 ) cs.add_conditions([cond_ecl_generic, cond_ecl_interpretable]) + else: + cs.add_conditions([cond_ecl_generic_cond_1, cond_ecl_interpretable_cond_1]) dropout_search_space = HyperparameterSearchSpace(hyperparameter='dropout_%d' % stack_idx, value_range=dropout.value_range, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index 91d6425e5..eabc04826 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -131,5 +131,5 @@ def get_hyperparameter_search_space( dataset_properties: Optional[Dict] = None, ) -> ConfigurationSpace: cs = CS.ConfigurationSpace() - cs.add_hyperparameter(Constant('decoder_type', 'RNN')) # this helps the encoder to recognize the decoder. + cs.add_hyperparameter(Constant('decoder_type', 'RNNDecoder')) # this helps the encoder to recognize the decoder. return cs diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index a31c7db28..4f659dc0e 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -276,6 +276,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: """ self.check_requirements(X, y) + # Incorporate the transform to the dataset + datamanager = X['backend'].load_datamanager() # type: TimeSeriesForcecastingDataset + + self.n_prediction_steps = datamanager.n_prediction_steps + if self.backcast: + self.window_size = self.backcast_period * self.n_prediction_steps + # this value corresponds to budget type resolution sample_interval = X.get('sample_interval', 1) padding_value = X.get('required_padding_value', 0.0) @@ -292,8 +299,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: fraction_samples_per_seq = X.get('fraction_samples_per_seq', 1.0) self.sample_interval = sample_interval - # Incorporate the transform to the dataset - datamanager = X['backend'].load_datamanager() # type: TimeSeriesForcecastingDataset + # TODO, consider bucket setting self.train_transform = self.build_transform(X, mode='train') @@ -315,7 +321,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: else: self.dataset_small_preprocess = False - self.n_prediction_steps = datamanager.n_prediction_steps train_dataset, val_dataset = datamanager.get_dataset_for_training(split_id=X['split_id']) train_split, test_split = datamanager.splits[X['split_id']] @@ -486,7 +491,7 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, default_value=False), backcast_period: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='backcast_period', - value_range=(2, 7), + value_range=(1, 7), default_value=2) ) -> ConfigurationSpace: """ diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index 1ae17ff95..d2123d6df 100644 --- 
a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -41,6 +41,47 @@ def forward(self, input_dist: torch.distributions.Distribution, target_tensor: t return -scores +class MAPELoss(Loss): + __constants__ = ['reduction'] + + def __init__(self, reduction: str = 'mean') -> None: + super(MAPELoss, self).__init__(reduction) + + def forward(self, input: torch.distributions.Distribution, target_tensor: torch.Tensor) -> torch.Tensor: + loss = torch.abs(input - target_tensor) / (torch.abs(target_tensor) + 1e-8) + if self.reduction == 'mean': + return loss.mean() + elif self.reduction == 'sum': + return loss.sum() + else: + return loss + + +class MASELoss(Loss): + __constants__ = ['reduction'] + + def __init__(self, reduction: str = 'mean') -> None: + super(MASELoss, self).__init__(reduction) + self._mase_coefficient = 1.0 + + def set_mase_coefficient(self, mase_coefficient: torch.Tensor) -> 'MASELoss': + if len(mase_coefficient.shape) == 2: + mase_coefficient = mase_coefficient.unsqueeze(1) + self._mase_coefficient = mase_coefficient + return self + + def forward(self, + input: torch.distributions.Distribution, + target_tensor: torch.Tensor) -> torch.Tensor: + loss = torch.abs(input - target_tensor) * self._mase_coefficient + if self.reduction == 'mean': + return loss.mean() + elif self.reduction == 'sum': + return loss.sum() + else: + return loss + + losses = dict( classification=dict( CrossEntropyLoss=dict( @@ -59,7 +100,12 @@ def forward(self, input_dist: torch.distributions.Distribution, target_tensor: t module=MSELoss, supported_output_types=[CONTINUOUS]), L1Loss=dict( module=L1Loss, supported_output_types=[CONTINUOUS]), - )) + MAPELoss=dict( + module=MAPELoss, supported_output_types=[CONTINUOUS]), + MASELoss=dict( + module=MASELoss, supported_output_types=[CONTINUOUS]), + ) +) default_losses: Dict[str, Type[Loss]] = dict(classification=CrossEntropyLoss, regression=MSELoss, @@ -67,6 +113,7 @@ def forward(self, input_dist: torch.distributions.Distribution, target_tensor: t LOSS_TYPES = ['regression', 'distribution'] + def get_default(task: int) -> Type[Loss]: """ Utility function to get default loss for the task diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index 574c0d3b9..c9ebaf835 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -57,7 +57,7 @@ def get_budget_tracker(self, X): else: max_epochs = None return BudgetTracker( - budget_type=X['budget_type'], + budget_type='epochs' if X['budget_type'] in FORECASTING_BUDGET_TYPE else X['budget_type'], max_runtime=X['runtime'] if 'runtime' in X else None, max_epochs=max_epochs, ) diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 233e3072b..18bd5b691 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -12,6 +12,7 @@ from torch.optim.lr_scheduler import _LRScheduler from torch.utils.tensorboard.writer import SummaryWriter + from autoPyTorch.constants import REGRESSION_TASKS, FORECASTING_TASKS from 
autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. \ base_target_scaler import BaseTargetScaler @@ -20,6 +21,8 @@ from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNet, ForecastingDeepARNet, \ NBEATSNet, ForecastingSeq2SeqNet +from autoPyTorch.pipeline.components.training.losses import MASELoss + from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score @@ -81,8 +84,6 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, float: training loss Dict[str, float]: scores for each desired metric """ - import time - time_start = time.time() loss_sum = 0.0 N = 0 self.model.train() @@ -113,7 +114,6 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, self._scheduler_step(step_interval=StepIntervalUnit.epoch, loss=loss_sum / N) - print(f'time used for trainging epoch: {time.time() - time_start}') if self.metrics_during_training: return loss_sum / N, self.compute_metrics(outputs_data, targets_data) else: @@ -146,6 +146,9 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor future_targets = self.cast_targets(future_targets) + if isinstance(self.criterion, MASELoss): + self.criterion.set_mase_coefficient(data['mase_coefficient'].float().to(self.device)) + # training self.optimizer.zero_grad() @@ -161,6 +164,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor loss_backcast = loss_func_backcast(self.criterion, backcast) loss_forecast = loss_func_forecast(self.criterion, forecast) + loss = loss_forecast + loss_backcast * self.backcast_loss_ratio outputs = forecast @@ -223,6 +227,8 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, past_target = data['past_target'].float() mase_coefficients.append(data['mase_coefficient']) + if isinstance(self.criterion, MASELoss): + self.criterion.set_mase_coefficient(data['mase_coefficient'].float().to(self.device)) batch_size = past_target.shape[0] From 3710cb224409a5557f3b4301ed9a99d1403cf019 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 6 Jan 2022 18:46:16 +0100 Subject: [PATCH 113/347] maint --- .../configs/forecasting_init_cfgs.json | 60 ++- autoPyTorch/optimizer/utils.py | 2 +- .../forecasting_backbone/RNNEncoder.py | 4 +- .../forecasting_decoder/NBEATSDecoder.py | 380 +++++++++++++----- .../forecasting_decoder/RNNDecoder.py | 4 +- .../time_series_forecasting_data_loader.py | 2 - .../pipeline/components/training/losses.py | 4 +- 7 files changed, 323 insertions(+), 133 deletions(-) diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index f3b73fb8f..671bbc90e 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -55,7 +55,7 @@ "network:aggregation": "median", "network:num_samples": 100 }, - "Seq2SeqRNN2MLP": { + "Seq2Seq-RNN2MLP": { "loss:__choice__": "DistributionLoss", "network:net_out_type": "distribution", "loss:DistributionLoss:dist_cls": "studentT", @@ -73,7 +73,7 @@ "network:aggregation": "median", "network:num_samples": 100 }, - "Seq2SeqTCN2MLP": { + "Seq2Seq-TCN2MLP": { "loss:__choice__": "DistributionLoss", "network:net_out_type": "distribution", "loss:DistributionLoss:dist_cls": "studentT", @@ -93,7 +93,7 @@ "network:aggregation": "median", "network:num_samples": 100 }, - "Seq2SeqRNN2RNN": { + 
"Seq2Seq-RNN2RNN": { "loss:__choice__": "DistributionLoss", "network:net_out_type": "distribution", "loss:DistributionLoss:dist_cls": "studentT", @@ -109,32 +109,52 @@ "network:aggregation": "median", "network:num_samples": 100 }, - "NBEATS": { + "NBEATS-I": { "data_loader:backcast": true, "data_loader:backcast_period": 2, "loss:__choice__": "RegressionLoss", "loss:RegressionLoss:loss_name": "mase", "network:net_out_type": "regression", "network_backbone:__choice__": "NBEATSEncoder", - "network_backbone:NBEATSDecoder:use_dropout": true, "network_backbone:NBEATSDecoder:backcast_loss_ratio": 0.0, "network_backbone:NBEATSDecoder:normalization": "NoNorm", "network_backbone:NBEATSDecoder:activation": "relu", - "network_backbone:NBEATSDecoder:num_stacks": 2, - "network_backbone:NBEATSDecoder:num_blocks_1": 3, - "network_backbone:NBEATSDecoder:num_layers_1": 2, - "network_backbone:NBEATSDecoder:width_1": 256, - "network_backbone:NBEATSDecoder:weight_sharing_1": true, - "network_backbone:NBEATSDecoder:stack_type_1": "trend", - "network_backbone:NBEATSDecoder:expansion_coefficient_length_interpretable_1": 3, - "network_backbone:NBEATSDecoder:dropout_1": 0.1, - "network_backbone:NBEATSDecoder:num_blocks_2": 3, - "network_backbone:NBEATSDecoder:num_layers_2": 2, - "network_backbone:NBEATSDecoder:width_2": 512, - "network_backbone:NBEATSDecoder:weight_sharing_2": true, - "network_backbone:NBEATSDecoder:stack_type_2": "seasonality", - "network_backbone:NBEATSDecoder:expansion_coefficient_length_interpretable_2": 3, - "network_backbone:NBEATSDecoder:dropout_2": 0.1 + "network_backbone:NBEATSDecoder:n_beats_type": "I", + "network_backbone:NBEATSDecoder:use_dropout_I": true, + "network_backbone:NBEATSDecoder:num_stacks_I": 2, + "network_backbone:NBEATSDecoder:num_blocks_I_1": 3, + "network_backbone:NBEATSDecoder:num_layers_I_1": 2, + "network_backbone:NBEATSDecoder:width_I_1": 256, + "network_backbone:NBEATSDecoder:weight_sharing_I_1": true, + "network_backbone:NBEATSDecoder:stack_type_I_1": "trend", + "network_backbone:NBEATSDecoder:expansion_coefficient_length_I_trend_1": 3, + "network_backbone:NBEATSDecoder:dropout_I_1": 0.1, + "network_backbone:NBEATSDecoder:num_blocks_I_2": 3, + "network_backbone:NBEATSDecoder:num_layers_I_2": 2, + "network_backbone:NBEATSDecoder:width_I_2": 512, + "network_backbone:NBEATSDecoder:weight_sharing_I_2": true, + "network_backbone:NBEATSDecoder:stack_type_I_2": "trend", + "network_backbone:NBEATSDecoder:expansion_coefficient_length_I_seasonality_2": 3, + "network_backbone:NBEATSDecoder:dropout_I_2": 0.1 + }, + "NBEATS-G": { + "data_loader:backcast": true, + "data_loader:backcast_period": 2, + "loss:__choice__": "RegressionLoss", + "loss:RegressionLoss:loss_name": "mape", + "network:net_out_type": "regression", + "network_backbone:__choice__": "NBEATSEncoder", + "network_backbone:NBEATSDecoder:backcast_loss_ratio": 0.0, + "network_backbone:NBEATSDecoder:normalization": "NoNorm", + "network_backbone:NBEATSDecoder:activation": "relu", + "network_backbone:NBEATSDecoder:n_beats_type": "G", + "network_backbone:NBEATSDecoder:use_dropout_G": false, + "network_backbone:NBEATSDecoder:num_stacks_G": 30, + "network_backbone:NBEATSDecoder:num_blocks_G": 1, + "network_backbone:NBEATSDecoder:num_layers_G": 4, + "network_backbone:NBEATSDecoder:width_G": 512, + "network_backbone:NBEATSDecoder:weight_sharing_G": false, + "network_backbone:NBEATSDecoder:expansion_coefficient_length_G": 32 } } } \ No newline at end of file diff --git a/autoPyTorch/optimizer/utils.py 
b/autoPyTorch/optimizer/utils.py index bd3f0b662..27b14fa3f 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -52,7 +52,7 @@ def read_forecasting_init_configurations(config_space: ConfigurationSpace, for model_name in suggested_init_models: cfg_tmp = cfg_trainer.copy() - if model_name != 'NBEATS': + if 'NBEATS' in model_name: cfg_tmp['data_loader:window_size'] = window_size model_cfg = models_name_to_cfgs.get(model_name, None) if model_cfg is None: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py index 7746784dd..2f6695536 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py @@ -77,7 +77,7 @@ class RNNEncoder(BaseForecastingEncoder): def __init__(self, **kwargs: Dict): super().__init__(**kwargs) - self.lagged_value = [0, 1, 2, 3, 4, 5, 6] + self.lagged_value = [1, 2, 3, 4, 5, 6, 7] def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: encoder = _RNN(in_features=input_shape[-1], @@ -107,7 +107,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: try: freq = FREQUENCY_MAP[freq] lagged_values = get_lags_for_frequency(freq) - self.lagged_value = [lag - 1 for lag in lagged_values] + self.lagged_value = [0] + lagged_values except Exception: warnings.warn(f'cannot find the proper lagged value for {freq}, we use the default lagged value') # If diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index d55fc3d7d..44c7f38e3 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -24,26 +24,33 @@ class NBEATSBLock(nn.Module): def __init__(self, n_in_features: int, stack_idx: int, - config: Dict, + stack_type: str, + num_blocks: int, + num_layers: int, + width: int, + normalization: str, + activation: str, + weight_sharing: bool, + expansion_coefficient_length: int, + use_dropout: bool, + dropout_rate: Optional[float] = None, ): super().__init__() self.n_in_features = n_in_features self.stack_idx = stack_idx + self.stack_type = stack_type - self.weight_sharing = config['weight_sharing_%d' % self.stack_idx] - self.num_blocks = config['num_blocks_%d' % self.stack_idx] - self.stack_type = config['stack_type_%d' % self.stack_idx] - if self.stack_type == 'generic': - self.expansion_coefficient_length = config['expansion_coefficient_length_generic_%d' % self.stack_idx] - else: - self.expansion_coefficient_length = config['expansion_coefficient_length_interpretable_%d' % self.stack_idx] + self.num_blocks = num_blocks + self.num_layers = num_layers + self.width = width + self.normalization = normalization + self.activation = activation + self.use_dropout = use_dropout + self.dropout_rate = dropout_rate + + self.expansion_coefficient_length = expansion_coefficient_length - self.num_layers = config['num_layers_%d' % self.stack_idx] - self.width = config['width_%d' % self.stack_idx] - self.normalization = config['normalization'] - self.activation = config['activation'] - self.use_dropout 
= config['use_dropout'] - self.dropout_rate = config.get('dropout_%d' % self.stack_idx, None) + self.weight_sharing = weight_sharing self.backbone = nn.Sequential(*self.build_backbone()) @@ -59,7 +66,7 @@ def build_backbone(self): return layers def _add_layer(self, layers: List[nn.Module], in_features: int) -> None: - layers.append(nn.Linear(in_features, self.width)) + layers.append(nn.Linear(in_features, self.width, bias=False)) if self.normalization == 'BN': layers.append(nn.BatchNorm1d(self.width)) elif self.normalization == 'LN': @@ -95,13 +102,63 @@ def decoder_properties(self): def _build_decoder(self, input_shape: Tuple[int, ...], n_prediction_heads: int, dataset_properties: Dict) -> Tuple[nn.Module, int]: in_features = input_shape[-1] - stacks = [[] for _ in range(self.config['num_stacks'])] - for stack_idx in range(1, self.config['num_stacks'] + 1): - for block_idx in range(self.config['num_blocks_%d' % stack_idx]): - if self.config['weight_sharing_%d' % stack_idx] and block_idx > 0: - # for weight sharing, we only create one instance - break - stacks[stack_idx - 1].append(NBEATSBLock(in_features, stack_idx=stack_idx, config=self.config)) + n_beats_type = self.config['n_beats_type'] + if n_beats_type == 'G': + stacks = [[] for _ in range(self.config['num_stacks_G'])] + for stack_idx in range(1, self.config['num_stacks_G'] + 1): + for block_idx in range(self.config['num_blocks_G']): + if self.config['weight_sharing_G'] and block_idx > 0: + # for weight sharing, we only create one instance + break + ecl = self.config['expansion_coefficient_length_G'] + stacks[stack_idx - 1].append(NBEATSBLock(in_features, + stack_idx=stack_idx, + stack_type='generic', + num_blocks=self.config['num_blocks_G'], + num_layers=self.config['num_layers_G'], + width=self.config['width_G'], + normalization=self.config['normalization'], + activation=self.config['activation'], + weight_sharing=self.config['weight_sharing_G'], + expansion_coefficient_length=ecl, + use_dropout=self.config['use_dropout_G'], + dropout_rate=self.config.get('dropout_G', None), + )) + + elif n_beats_type == 'I': + stacks = [[] for _ in range(self.config['num_stacks_I'])] + for stack_idx in range(1, self.config['num_stacks_I'] + 1): + for block_idx in range(self.config['num_blocks_I_%d' % stack_idx]): + if self.config['weight_sharing_I_%d' % stack_idx] and block_idx > 0: + # for weight sharing, we only create one instance + break + stack_type = self.config['stack_type_I_%d' % stack_idx] + if stack_type == 'generic': + ecl = self.config['expansion_coefficient_length_I_generic_%d' % stack_idx] + elif stack_type == 'trend': + ecl = self.config['expansion_coefficient_length_I_trend_%d' % stack_idx] + elif stack_type == 'seasonality': + ecl = self.config['expansion_coefficient_length_I_seasonality_%d' % stack_idx] + else: + raise ValueError(f"Unsupported stack_type {stack_type}") + + stacks[stack_idx - 1].append(NBEATSBLock(in_features, + stack_idx=stack_idx, + stack_type=stack_type, + num_blocks=self.config['num_blocks_I_%d' % stack_idx], + num_layers=self.config['num_layers_I_%d' % stack_idx], + width=self.config['width_I_%d' % stack_idx], + normalization=self.config['normalization'], + activation=self.config['activation'], + weight_sharing=self.config[f'weight_sharing_I_%d' % + stack_idx], + expansion_coefficient_length=ecl, + use_dropout=self.config['use_dropout_I'], + dropout_rate=self.config.get('dropout_I_%d' % + stack_idx, None), + )) + else: + raise ValueError(f"Unsupported n_beats_type: {n_beats_type}") return stacks, 
stacks[-1][-1].width @staticmethod @@ -126,24 +183,50 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - num_stacks: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter="num_stacks", + n_beats_type: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="n_beats_type", + value_range=('I', 'G'), + default_value='I' + ), + num_stacks_g: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="num_stacks_G", + value_range=(4, 32), + default_value=30 + ), + num_blocks_g: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'num_blocks_G', + value_range=(1, 3), + default_value=1 + ), + num_layers_g: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'num_layers_G', + value_range=(1, 5), + default_value=4 + ), + width_g: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'width_G', + value_range=(32, 512), + default_value=256, + log=True + ), + num_stacks_i: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="num_stacks_I", value_range=(1, 4), default_value=2 ), - num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace( - 'num_blocks', + num_blocks_i: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'num_blocks_I', value_range=(1, 5), default_value=3 ), - num_layers: HyperparameterSearchSpace = HyperparameterSearchSpace( - 'num_layers', + num_layers_i: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'num_layers_I', value_range=(1, 5), default_value=3 ), - width: HyperparameterSearchSpace = HyperparameterSearchSpace( - 'width', - value_range=(32, 1024), + width_i: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'width_I', + value_range=(32, 2048), default_value=512, log=True ), @@ -162,8 +245,13 @@ def get_hyperparameter_search_space( default_value=32, log=True ), - expansion_coefficient_length_interpretable: HyperparameterSearchSpace = HyperparameterSearchSpace( - 'expansion_coefficient_length_interpretable', + expansion_coefficient_length_seasonality: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'expansion_coefficient_length_seasonality', + value_range=(1, 8), + default_value=3, + ), + expansion_coefficient_length_trend: HyperparameterSearchSpace = HyperparameterSearchSpace( + 'expansion_coefficient_length_trend', value_range=(1, 4), default_value=3, ), @@ -199,18 +287,30 @@ def get_hyperparameter_search_space( width The design of the configuration space follows pytorch-forecasting: https://github.com/jdb78/pytorch-forecasting/tree/master/pytorch_forecasting/models/nbeats + Given that N-BEATS-I and N-BEATS-G have totally different default hyperparameter configurations, we consider + them as two separate configuration spaces: N-BEATS-G only contains generic blocks and can thus be scaled + up to 32 stacks, where all stacks share the same number of blocks, width and dropout rate, while N-BEATS-I + is restricted to a network with a much smaller number of stacks. 
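The two-branch space described above can be pictured with a minimal, self-contained ConfigSpace sketch (illustrative only; it reproduces just a few of the actual hyperparameters, using the value ranges from the signature above):

    # Minimal sketch of the two-branch N-BEATS space; not part of the patch.
    import ConfigSpace as CS
    from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter

    cs = CS.ConfigurationSpace()
    n_beats_type = CategoricalHyperparameter('n_beats_type', ['I', 'G'], default_value='I')
    num_stacks_g = UniformIntegerHyperparameter('num_stacks_G', 4, 32, default_value=30)
    num_stacks_i = UniformIntegerHyperparameter('num_stacks_I', 1, 4, default_value=2)
    cs.add_hyperparameters([n_beats_type, num_stacks_g, num_stacks_i])
    # *_G hyperparameters are active only for the generic network, *_I only for the interpretable one.
    cs.add_conditions([CS.EqualsCondition(num_stacks_g, n_beats_type, 'G'),
                       CS.EqualsCondition(num_stacks_i, n_beats_type, 'I')])
    # Any sample from cs is therefore either a pure N-BEATS-G or a pure N-BEATS-I configuration.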
However, the block type of N-BEATS-G at each + stack can be freely selected + freely selected Args: dataset_properties: - num_stacks: number of stacks - num_blocks: number of blocks per stack - num_layers: number of fc layers per block, this value is the same across all the blocks within one stack - width: fc layer width, this value is the same across all the blocks within one stack + n_beats_type: type of nbeats network, could be I (N-BEATS-I) or G (N-BEATS-G) + num_stacks_g: number of stacks + num_blocks_g: number of blocks per stack + num_layers_g: number of fc layers per block, this value is the same across all the blocks within one stack + width_g: fc layer width, this value is the same across all the blocks within one stack + num_stacks_i: number of stacks + num_blocks_i: number of blocks per stack + num_layers_i: number of fc layers per block, this value is the same across all the blocks within one stack + width_i: fc layer width, this value is the same across all the blocks within one stack weight_sharing: if weights are shared inside one block stack_type: stack type, used to define the final output expansion_coefficient_length_generic: expansion_coefficient_length, activate if stack_type is 'generic' - expansion_coefficient_length_interpretable: expansion_coefficient_length, activate if stack_type is 'trend' - or 'seasonality' (in this case n_dim is expansion_coefficient_length_interpretable * n_prediciton_steps) - the expansion coefficient) or trend (in this case, it corresponds to the degree of the polynomial) + expansion_coefficient_length_seasonality: expansion_coefficient_length, activate if stack_type is + 'seasonality' (n_dim = expansion_coefficient_length_interpretable * n_prediciton_steps) + expansion_coefficient_length_trend: expansion_coefficient_length, activate if stack_type is 'trend' (it + corresponds to the degree of the polynomial) activation: activation function across fc layers use_dropout: if dropout is applied normalization: if normalization is applied @@ -223,94 +323,163 @@ def get_hyperparameter_search_space( """ cs = ConfigurationSpace() - min_num_stacks, max_num_stacks = num_stacks.value_range - num_stacks = get_hyperparameter(num_stacks, UniformIntegerHyperparameter) + n_beats_type = get_hyperparameter(n_beats_type, CategoricalHyperparameter) + # General Hyperparameters add_hyperparameter(cs, activation, CategoricalHyperparameter) add_hyperparameter(cs, normalization, CategoricalHyperparameter) add_hyperparameter(cs, backcast_loss_ratio, UniformFloatHyperparameter) - # We can have dropout in the network for - # better generalization - use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) - cs.add_hyperparameters([num_stacks, use_dropout]) - - for stack_idx in range(1, int(max_num_stacks) + 1): - num_blocks_search_space = HyperparameterSearchSpace(hyperparameter='num_blocks_%d' % stack_idx, - value_range=num_blocks.value_range, - default_value=num_blocks.default_value, - log=num_blocks.log) - num_layers_search_space = HyperparameterSearchSpace(hyperparameter='num_layers_%d' % stack_idx, - value_range=num_layers.value_range, - default_value=num_layers.default_value, - log=num_layers.log) - width_search_space = HyperparameterSearchSpace(hyperparameter='width_%d' % stack_idx, - value_range=width.value_range, - default_value=width.default_value, - log=width.log) - weight_sharing_search_sapce = HyperparameterSearchSpace(hyperparameter='weight_sharing_%d' % stack_idx, - value_range=weight_sharing.value_range, - 
default_value=weight_sharing.default_value, - log=weight_sharing.log) - stack_type_search_space = HyperparameterSearchSpace(hyperparameter='stack_type_%d' % stack_idx, - value_range=stack_type.value_range, - default_value=stack_type.default_value, - log=stack_type.log) + cs.add_hyperparameter(n_beats_type) + # N-BEATS-G + + weight_sharing_g = HyperparameterSearchSpace(hyperparameter='weight_sharing_G', + value_range=weight_sharing.value_range, + default_value=weight_sharing.default_value, + log=weight_sharing.log) + use_dropout_g = HyperparameterSearchSpace(hyperparameter='use_dropout_G', + value_range=use_dropout.value_range, + default_value=use_dropout.default_value, + log=use_dropout.log) + dropout_g = HyperparameterSearchSpace(hyperparameter='dropout_G', + value_range=dropout.value_range, + default_value=dropout.default_value, + log=dropout.log) + ecl_g_search_space = HyperparameterSearchSpace( + hyperparameter='expansion_coefficient_length_G', + value_range=expansion_coefficient_length_generic.value_range, + default_value=expansion_coefficient_length_generic.default_value, + log=expansion_coefficient_length_generic.log + ) + + num_stacks_g = get_hyperparameter(num_stacks_g, UniformIntegerHyperparameter) + num_blocks_g = get_hyperparameter(num_blocks_g, UniformIntegerHyperparameter) + num_layers_g = get_hyperparameter(num_layers_g, UniformIntegerHyperparameter) + width_g = get_hyperparameter(width_g, UniformIntegerHyperparameter) + weight_sharing_g = get_hyperparameter(weight_sharing_g, CategoricalHyperparameter) + ecl_g = get_hyperparameter(ecl_g_search_space, UniformIntegerHyperparameter) + use_dropout_g = get_hyperparameter(use_dropout_g, CategoricalHyperparameter) + + dropout_g = get_hyperparameter(dropout_g, UniformFloatHyperparameter) + + n_beats_g_hps = [num_stacks_g, num_blocks_g, num_layers_g, width_g, weight_sharing_g, ecl_g, use_dropout_g] + n_beats_g_conds = [EqualsCondition(hp_nbeats_g, n_beats_type, 'G') for hp_nbeats_g in n_beats_g_hps] + cs.add_hyperparameters(n_beats_g_hps) + cs.add_hyperparameter(dropout_g) + cs.add_conditions(n_beats_g_conds) + cs.add_condition(AndConjunction(EqualsCondition(dropout_g, n_beats_type, 'G'), + EqualsCondition(dropout_g, use_dropout_g, True))) + + min_num_stacks_i, max_num_stacks_i = num_stacks_i.value_range + + use_dropout_i = HyperparameterSearchSpace(hyperparameter='use_dropout_I', + value_range=use_dropout.value_range, + default_value=use_dropout.default_value, + log=use_dropout.log) + + num_stacks_i = get_hyperparameter(num_stacks_i, UniformIntegerHyperparameter) + use_dropout_i = get_hyperparameter(use_dropout_i, CategoricalHyperparameter) + + cs.add_hyperparameters([num_stacks_i, use_dropout_i]) + cs.add_conditions([EqualsCondition(num_stacks_i, n_beats_type, 'I'), + EqualsCondition(use_dropout_i, n_beats_type, 'I') + ]) + + for stack_idx in range(1, int(max_num_stacks_i) + 1): + num_blocks_i_search_space = HyperparameterSearchSpace(hyperparameter='num_blocks_I_%d' % stack_idx, + value_range=num_blocks_i.value_range, + default_value=num_blocks_i.default_value, + log=num_blocks_i.log) + num_layers_i_search_space = HyperparameterSearchSpace(hyperparameter='num_layers_I_%d' % stack_idx, + value_range=num_layers_i.value_range, + default_value=num_layers_i.default_value, + log=num_layers_i.log) + width_i_search_space = HyperparameterSearchSpace(hyperparameter='width_I_%d' % stack_idx, + value_range=width_i.value_range, + default_value=width_i.default_value, + log=width_i.log) + weight_sharing_i_search_space = 
HyperparameterSearchSpace(hyperparameter='weight_sharing_I_%d' % stack_idx, + value_range=weight_sharing.value_range, + default_value=weight_sharing.default_value, + log=weight_sharing.log) + stack_type_i_search_space = HyperparameterSearchSpace(hyperparameter='stack_type_I_%d' % stack_idx, + value_range=stack_type.value_range, + default_value=stack_type.default_value, + log=stack_type.log) expansion_coefficient_length_generic_search_space = HyperparameterSearchSpace( - hyperparameter='expansion_coefficient_length_generic_%d' % stack_idx, + hyperparameter='expansion_coefficient_length_I_generic_%d' % stack_idx, value_range=expansion_coefficient_length_generic.value_range, default_value=expansion_coefficient_length_generic.default_value, log=expansion_coefficient_length_generic.log ) - expansion_coefficient_length_interpretable_search_space = HyperparameterSearchSpace( - hyperparameter='expansion_coefficient_length_interpretable_%d' % stack_idx, - value_range=expansion_coefficient_length_interpretable.value_range, - default_value=expansion_coefficient_length_interpretable.default_value, - log=expansion_coefficient_length_interpretable.log + expansion_coefficient_length_seasonality_search_space = HyperparameterSearchSpace( + hyperparameter='expansion_coefficient_length_I_seasonality_%d' % stack_idx, + value_range=expansion_coefficient_length_seasonality.value_range, + default_value=expansion_coefficient_length_seasonality.default_value, + log=expansion_coefficient_length_seasonality.log + ) + expansion_coefficient_length_trend_search_space = HyperparameterSearchSpace( + hyperparameter='expansion_coefficient_length_I_trend_%d' % stack_idx, + value_range=expansion_coefficient_length_trend.value_range, + default_value=expansion_coefficient_length_trend.default_value, + log=expansion_coefficient_length_trend.log ) - num_blocks_hp = get_hyperparameter(num_blocks_search_space, UniformIntegerHyperparameter) - num_layers_hp = get_hyperparameter(num_layers_search_space, UniformIntegerHyperparameter) - width_hp = get_hyperparameter(width_search_space, UniformIntegerHyperparameter) - weight_sharing_hp = get_hyperparameter(weight_sharing_search_sapce, CategoricalHyperparameter) - stack_type_hp = get_hyperparameter(stack_type_search_space, CategoricalHyperparameter) + num_blocks_i_hp = get_hyperparameter(num_blocks_i_search_space, UniformIntegerHyperparameter) + num_layers_i_hp = get_hyperparameter(num_layers_i_search_space, UniformIntegerHyperparameter) + width_i_hp = get_hyperparameter(width_i_search_space, UniformIntegerHyperparameter) + weight_sharing_i_hp = get_hyperparameter(weight_sharing_i_search_space, CategoricalHyperparameter) + stack_type_i_hp = get_hyperparameter(stack_type_i_search_space, CategoricalHyperparameter) expansion_coefficient_length_generic_hp = get_hyperparameter( expansion_coefficient_length_generic_search_space, UniformIntegerHyperparameter ) - expansion_coefficient_length_interpretable_hp = get_hyperparameter( - expansion_coefficient_length_interpretable_search_space, + expansion_coefficient_length_seasonality_hp = get_hyperparameter( + expansion_coefficient_length_seasonality_search_space, + UniformIntegerHyperparameter + ) + expansion_coefficient_length_trend_hp = get_hyperparameter( + expansion_coefficient_length_trend_search_space, UniformIntegerHyperparameter ) - hps = [num_blocks_hp, num_layers_hp, width_hp, stack_type_hp, weight_sharing_hp] - cs.add_hyperparameters([*hps, expansion_coefficient_length_generic_hp, - expansion_coefficient_length_interpretable_hp]) + hps = 
[num_blocks_i_hp, num_layers_i_hp, width_i_hp, stack_type_i_hp, weight_sharing_i_hp] + cs.add_hyperparameters([*hps, + expansion_coefficient_length_generic_hp, + expansion_coefficient_length_seasonality_hp, + expansion_coefficient_length_trend_hp]) - cond_ecl_generic_cond_1 = EqualsCondition(expansion_coefficient_length_generic_hp, stack_type_hp, 'generic') - cond_ecl_interpretable_cond_1 = InCondition(expansion_coefficient_length_interpretable_hp, - stack_type_hp, ('seasonality', 'trend')) + cond_ecls = [ + EqualsCondition(expansion_coefficient_length_generic_hp, stack_type_i_hp, 'generic'), + EqualsCondition(expansion_coefficient_length_seasonality_hp, stack_type_i_hp, 'seasonality'), + EqualsCondition(expansion_coefficient_length_trend_hp, stack_type_i_hp, 'trend'), + ] - if stack_idx > int(min_num_stacks): + if stack_idx > int(min_num_stacks_i): # The units of layer i should only exist # if there are at least i layers for hp in hps: - cs.add_condition(GreaterThanCondition(hp, num_stacks, stack_idx - 1)) - cond_ecl_generic = AndConjunction( - GreaterThanCondition(expansion_coefficient_length_generic_hp, num_stacks, stack_idx - 1), - cond_ecl_generic_cond_1 - ) - cond_ecl_interpretable = AndConjunction( - GreaterThanCondition(expansion_coefficient_length_interpretable_hp, num_stacks, stack_idx - 1), - cond_ecl_interpretable_cond_1 - ) - cs.add_conditions([cond_ecl_generic, cond_ecl_interpretable]) + cs.add_condition( + AndConjunction(GreaterThanCondition(hp, num_stacks_i, stack_idx - 1), + EqualsCondition(hp, n_beats_type, 'I')) + ) + for cond_ecl in cond_ecls: + cs.add_condition( + AndConjunction(cond_ecl, + GreaterThanCondition(cond_ecl.child, num_stacks_i, stack_idx - 1), + EqualsCondition(cond_ecl.child, n_beats_type, 'I')) + ) else: - cs.add_conditions([cond_ecl_generic_cond_1, cond_ecl_interpretable_cond_1]) + cs.add_conditions([EqualsCondition(hp, n_beats_type, 'I') for hp in hps]) + cs.add_conditions([ + AndConjunction(cond_ecl, + EqualsCondition(cond_ecl.child, n_beats_type, 'I')) for cond_ecl in cond_ecls + ] + ) - dropout_search_space = HyperparameterSearchSpace(hyperparameter='dropout_%d' % stack_idx, + dropout_search_space = HyperparameterSearchSpace(hyperparameter='dropout_I_%d' % stack_idx, value_range=dropout.value_range, default_value=dropout.default_value, log=dropout.log) @@ -318,12 +487,13 @@ def get_hyperparameter_search_space( dropout_hp = get_hyperparameter(dropout_search_space, UniformFloatHyperparameter) cs.add_hyperparameter(dropout_hp) - dropout_condition_1 = EqualsCondition(dropout_hp, use_dropout, True) + dropout_condition_1 = EqualsCondition(dropout_hp, use_dropout_i, True) + dropout_condition_2 = EqualsCondition(dropout_hp, n_beats_type, 'I') - if stack_idx > int(min_num_stacks): - dropout_condition_2 = GreaterThanCondition(dropout_hp, num_stacks, stack_idx - 1) - cs.add_condition(AndConjunction(dropout_condition_1, dropout_condition_2)) + if stack_idx > int(min_num_stacks_i): + dropout_condition_3 = GreaterThanCondition(dropout_hp, num_stacks_i, stack_idx - 1) + cs.add_condition(AndConjunction(dropout_condition_1, dropout_condition_2, dropout_condition_3)) else: - cs.add_condition(dropout_condition_1) + cs.add_condition(AndConjunction(dropout_condition_1, dropout_condition_2)) return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index eabc04826..a96a8fe08 100644 --- 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -61,7 +61,7 @@ def __init__(self, **kwargs: Dict): # RNN is naturally auto-regressive. However, we will not consider it as a decoder for deep AR model self.auto_regressive = True self.rnn_kwargs = None - self.lagged_value = [0, 1, 2, 3, 4, 5, 6] + self.lagged_value = [0, 1, 2, 3, 4, 5, 6, 7] @property def _required_fit_requirements(self) -> List[FitRequirement]: @@ -110,7 +110,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: try: freq = FREQUENCY_MAP[freq] lagged_values = get_lags_for_frequency(freq) - self.lagged_value = [lag - 1 for lag in lagged_values] + self.lagged_value = [0] + lagged_values except Exception: warnings.warn(f'cannot find the proper lagged value for {freq}, we use the default lagged value') pass diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 4f659dc0e..10dfadc23 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -299,8 +299,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: fraction_samples_per_seq = X.get('fraction_samples_per_seq', 1.0) self.sample_interval = sample_interval - - # TODO, consider bucket setting self.train_transform = self.build_transform(X, mode='train') self.val_transform = self.build_transform(X, mode='val') diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index d2123d6df..cdfe802cd 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -48,7 +48,9 @@ def __init__(self, reduction: str = 'mean') -> None: super(MAPELoss, self).__init__(reduction) def forward(self, input: torch.distributions.Distribution, target_tensor: torch.Tensor) -> torch.Tensor: - loss = torch.abs(input - target_tensor) / (torch.abs(target_tensor) + 1e-8) + target = torch.abs(target_tensor) + target[target == 0] = 1 + loss = torch.abs(input - target_tensor) / target if self.reduction == 'mean': return loss.mean() elif self.reduction == 'sum': From 70d82d9221b531fd76724f89ff63ef51df59a634 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 9 Jan 2022 18:44:34 +0100 Subject: [PATCH 114/347] new Transformer models, allow RNN to do deepAR inference --- autoPyTorch/api/base_task.py | 4 +- .../configs/forecasting_init_cfgs.json | 65 +++- autoPyTorch/datasets/time_series_dataset.py | 41 ++- autoPyTorch/optimizer/utils.py | 2 +- .../setup/network/forecasting_network.py | 165 +++++++++-- .../InceptionTimeEncoder.py | 4 +- .../forecasting_backbone/MLPEncoder.py | 4 +- .../forecasting_backbone/RNNEncoder.py | 4 +- .../TransformerEncoder.py | 278 ++++++++++++++++++ .../forecasting_decoder/RNNDecoder.py | 16 +- .../forecasting_decoder/TransformerDecoder.py | 278 ++++++++++++++++++ .../base_forecasting_decoder.py | 18 ++ .../forecasting_backbone/transformer_util.py | 68 +++++ .../time_series_forecasting_data_loader.py | 14 +- .../components/training/trainer/__init__.py | 2 + requirements.txt | 2 +- 16 files changed, 909 insertions(+), 56 deletions(-) create mode 100644 
autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 8dcb72146..4e9e6158a 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1173,8 +1173,8 @@ def _get_fit_dictionary( 'num_run': self._backend.get_next_num_run(), }) if self.time_series_forecasting: - warnings.WarningMessage("Currently Time Series Forecasting tasks do not allow computing metrics " - "during training. It will be automatically set as False") + warnings.warn("Currently Time Series Forecasting tasks do not allow computing metrics " + "during training. It will be automatically set as False") self.pipeline_options["metrics_during_training"] = False X.update(self.pipeline_options) return X diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index 671bbc90e..81bb88e42 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -105,6 +105,62 @@ "network_backbone:RNNEncoder:use_dropout": false, "network_backbone:RNNEncoder:decoder_type": "RNNDecoder", "network_backbone:RNNDecoder:decoder_type": "RNNDecoder", + "network:forecast_strategy": "mean" + }, + "Seq2Seq-Transformer2Transformer": { + "loss:__choice__": "DistributionLoss", + "network:net_out_type": "distribution", + "loss:DistributionLoss:dist_cls": "studentT", + "network_backbone:__choice__": "TransformerEncoder", + "network_backbone:TransformerEncoder:d_model_log": 5, + "network_backbone:TransformerEncoder:activation": "gelu", + "network_backbone:TransformerEncoder:num_layers": 1, + "network_backbone:TransformerEncoder:decoder_type": "TransformerDecoder", + "network_backbone:TransformerEncoder:use_dropout": true, + "network_backbone:TransformerEncoder:use_positional_encoder": true, + "network_backbone:TransformerEncoder:dropout_positional_encoder": 0.1, + "network_backbone:TransformerEncoder:d_feed_forward_log_1": 7, + "network_backbone:TransformerEncoder:num_head_log_1": 3, + "network_backbone:TransformerEncoder:layer_norm_eps_1": 1e-05, + "network_backbone:TransformerEncoder:dropout_1": 0.1, + "network_backbone:TransformerEncoder:use_layer_norm_output": true, + "network_backbone:TransformerEncoder:layer_norm_eps_output": 1e-05, + "network_backbone:TransformerDecoder:activation": "gelu", + "network_backbone:TransformerDecoder:num_layers": 1, + "network_backbone:TransformerDecoder:use_dropout": true, + "network_backbone:TransformerDecoder:use_positional_decoder": true, + "network_backbone:TransformerDecoder:dropout_positional_decoder": 0.1, + "network_backbone:TransformerDecoder:d_feed_forward_log_1": 7, + "network_backbone:TransformerDecoder:num_head_log_1": 3, + "network_backbone:TransformerDecoder:layer_norm_eps_1": 1e-05, + "network_backbone:TransformerDecoder:dropout_1": 0.1, + "network_backbone:TransformerDecoder:use_layer_norm_output": true, + "network_backbone:TransformerDecoder:layer_norm_eps_output": 1e-05, + "network:forecast_strategy": "sample", + "network:aggregation": "median", + "network:num_samples": 100 + }, + "Seq2Seq-Transformer2MLP": { + "loss:__choice__": "DistributionLoss", + "network:net_out_type": "distribution", + 
"loss:DistributionLoss:dist_cls": "studentT", + "network_backbone:__choice__": "TransformerEncoder", + "network_backbone:TransformerEncoder:d_model_log": 5, + "network_backbone:TransformerEncoder:activation": "gelu", + "network_backbone:TransformerEncoder:num_layers": 1, + "network_backbone:TransformerEncoder:decoder_type": "MLPDecoder", + "network_backbone:TransformerEncoder:use_dropout": true, + "network_backbone:TransformerEncoder:use_positional_encoder": true, + "network_backbone:TransformerEncoder:dropout_positional_encoder": 0.1, + "network_backbone:TransformerEncoder:d_feed_forward_log_1": 7, + "network_backbone:TransformerEncoder:num_head_log_1": 3, + "network_backbone:TransformerEncoder:layer_norm_eps_1": 1e-05, + "network_backbone:TransformerEncoder:dropout_1": 0.1, + "network_backbone:TransformerEncoder:use_layer_norm_output": true, + "network_backbone:TransformerEncoder:layer_norm_eps_output": 1e-05, + "network_backbone:MLPDecoder:num_layers": 0, + "network_backbone:MLPDecoder:auto_regressive": false, + "network_backbone:MLPDecoder:units_final_layer": 30, "network:forecast_strategy": "sample", "network:aggregation": "median", "network:num_samples": 100 @@ -133,8 +189,8 @@ "network_backbone:NBEATSDecoder:num_layers_I_2": 2, "network_backbone:NBEATSDecoder:width_I_2": 512, "network_backbone:NBEATSDecoder:weight_sharing_I_2": true, - "network_backbone:NBEATSDecoder:stack_type_I_2": "trend", - "network_backbone:NBEATSDecoder:expansion_coefficient_length_I_seasonality_2": 3, + "network_backbone:NBEATSDecoder:stack_type_I_2": "seasonality", + "network_backbone:NBEATSDecoder:expansion_coefficient_length_I_seasonality_2": 7, "network_backbone:NBEATSDecoder:dropout_I_2": 0.1 }, "NBEATS-G": { @@ -148,13 +204,14 @@ "network_backbone:NBEATSDecoder:normalization": "NoNorm", "network_backbone:NBEATSDecoder:activation": "relu", "network_backbone:NBEATSDecoder:n_beats_type": "G", - "network_backbone:NBEATSDecoder:use_dropout_G": false, + "network_backbone:NBEATSDecoder:use_dropout_G": true, "network_backbone:NBEATSDecoder:num_stacks_G": 30, "network_backbone:NBEATSDecoder:num_blocks_G": 1, "network_backbone:NBEATSDecoder:num_layers_G": 4, "network_backbone:NBEATSDecoder:width_G": 512, "network_backbone:NBEATSDecoder:weight_sharing_G": false, - "network_backbone:NBEATSDecoder:expansion_coefficient_length_G": 32 + "network_backbone:NBEATSDecoder:expansion_coefficient_length_G": 32, + "network_backbone:NBEATSDecoder:dropout_G": 0.1 } } } \ No newline at end of file diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index db81e5dc2..5fb43fc93 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -159,8 +159,8 @@ def __init__(self, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, target_variables: Optional[Union[Tuple[int], int]] = None, freq: Optional[Union[str, int, List[int]]] = None, - resampling_strategy: Union[ - CrossValTypes, HoldoutValTypes] = HoldoutValTypes.time_series_hold_out_validation, + resampling_strategy: Optional[Union[ + CrossValTypes, HoldoutValTypes]] = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, shuffle: Optional[bool] = True, seed: Optional[int] = 42, @@ -534,6 +534,9 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] num_splits=cast(int, num_splits), )) upper_window_size = (np.min(self.sequence_lengths_train) // num_splits) - self.n_prediction_steps + elif 
self.resampling_strategy is None: + splits.append(self.create_refit_split()) + upper_window_size = np.inf else: raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") @@ -663,6 +666,40 @@ def create_holdout_val_split( indices=np.arange( len(dataset) - self.n_prediction_steps), **kwargs) + + for idx_split in range(2): + splits[idx_split][idx_seq] = idx_start + split[idx_split] + idx_start += self.sequence_lengths_train[idx_seq] + + train_indices = np.hstack([sp for sp in splits[0]]) + test_indices = np.hstack([sp for sp in splits[1]]) + + return train_indices, test_indices + + def create_refit_split( + self, + ) -> Tuple[np.ndarray, np.ndarray]: + """ + This function creates the refit split for the given task. All the data in the dataset will be considered as + training sets + Args: + holdout_val_type (HoldoutValTypes): + val_share (float): share of the validation data + + Returns: + (Tuple[np.ndarray, np.ndarray]): Tuple containing (train_indices, val_indices) + """ + kwargs = {"n_prediction_steps": self.n_prediction_steps} + + splits = [[() for _ in range(len(self.datasets))] for _ in range(2)] + idx_start = 0 + for idx_seq, dataset in enumerate(self.datasets): + if self.shift_input_data: + split = [np.arange(len(dataset)), np.array([len(dataset) - 1])] + else: + last_idx = len(dataset) - self.n_prediction_steps -1 + split = [np.arange(len(dataset) - self.n_prediction_steps), np.array([last_idx])] + for idx_split in range(2): splits[idx_split][idx_seq] = idx_start + split[idx_split] idx_start += self.sequence_lengths_train[idx_seq] diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index 27b14fa3f..0910630e4 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -52,7 +52,7 @@ def read_forecasting_init_configurations(config_space: ConfigurationSpace, for model_name in suggested_init_models: cfg_tmp = cfg_trainer.copy() - if 'NBEATS' in model_name: + if 'NBEATS' not in model_name: cfg_tmp['data_loader:window_size'] = window_size model_cfg = models_name_to_cfgs.get(model_name, None) if model_cfg is None: diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 86c346d39..dce317bc5 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -171,6 +171,7 @@ def __init__(self, 'for decoder!') self.encoder_has_hidden_states = encoder_properties['has_hidden_states'] self.decoder_has_hidden_states = decoder_properties['has_hidden_states'] + # self.mask_futur_features = decoder_properties['mask_future_features'] self._device = torch.device('cpu') self.encoder_lagged_input = encoder_properties['lagged_input'] @@ -297,13 +298,22 @@ def predict(self, class ForecastingSeq2SeqNet(ForecastingNet): future_target_required = True """ - Forecasting network with Seq2Seq structure. + Forecasting network with Seq2Seq structure, Encoder/ Decoder need to be the same recurrent models while - This structure is activate when the decoder is recurrent (RNN). We train the network with teacher forcing, thus + This structure is activate when the decoder is recurrent (RNN or transformer). + We train the network with teacher forcing, thus targets_future is required for the network. 
To train the network, past targets and past features are fed to the - encoder to obtain the hidden states whereas future targets and future features + encoder to obtain the hidden states whereas future targets and future features. + When the output type is distribution and forecast_strategy is sampling, this model is equivalent to a deepAR model + during inference. """ + def __init__(self, **kwargs): + super(ForecastingSeq2SeqNet, self).__init__(**kwargs) + self.mask_on_future_target = kwargs['decoder_properties']['mask_on_future_target'] + if self.mask_on_future_target: + self.tgt_mask = nn.Transformer.generate_square_subsequent_mask(self.n_prediction_steps) + def forward(self, targets_past: torch.Tensor, targets_future: Optional[torch.Tensor] = None, @@ -344,41 +354,141 @@ def forward(self, dim=-1) x_future = x_future.to(self.device) - _, hidden_states = self.encoder(x_past) - x_future, _ = self.decoder(x_future, hidden_states) + if self.encoder_has_hidden_states: + # RNN + _, features_latent = self.encoder(x_past, output_seq=True) + x_future, _ = self.decoder(x_future, features_latent) + elif self.mask_on_future_target: + features_latent = self.encoder(x_past, output_seq=True) + x_future = self.decoder(x_future, features_latent, tgt_mask=self.tgt_mask.to(self.device)) + else: + raise NotImplementedError net_output = self.head(x_future) return self.rescale_output(net_output, loc, scale, self.device) else: - all_predictions = [] - predicted_target = targets_past[:, [-1]] + if self.encoder_has_hidden_states: + _, features_latent = self.encoder(x_past, output_seq=True) + else: + features_latent = self.encoder(x_past, output_seq=True) - _, hidden_states = self.encoder(x_past) if features_future is not None: features_future = features_future - for idx_pred in range(self.n_prediction_steps): - if self.decoder_lagged_input: - x_future = torch.cat([targets_past, predicted_target.cpu()], dim=1) - x_future, _ = get_lagged_subsequences(x_future, 1, self.decoder.lagged_value) + if self.forecast_strategy != 'sample': + all_predictions = [] + predicted_target = targets_past[:, [-1]] + targets_past = targets_past[:, :-1] + for idx_pred in range(self.n_prediction_steps): + if self.decoder_lagged_input: + x_future = torch.cat([targets_past, predicted_target.cpu()], dim=1) + if self.decoder_has_hidden_states: + x_future, _ = get_lagged_subsequences(x_future, 1, self.decoder.lagged_value) + else: + x_future, _ = get_lagged_subsequences(x_future, idx_pred + 1, self.decoder.lagged_value) + else: + if self.decoder_has_hidden_states: + x_future = predicted_target[:, [-1]] + else: + x_future = predicted_target + + if self.decoder_has_hidden_states: + x_future = x_future if features_future is None else torch.cat( + [features_future[:, [idx_pred], :], x_future], dim=-1) + else: + x_future = x_future if features_future is None else torch.cat( + [features_future[:, idx_pred + 1, :], x_future], dim=-1) + + x_future = x_future.to(self.device) + if self.decoder_has_hidden_states: + x_future, features_latent = self.decoder(x_future, features_latent=features_latent) + else: + x_future = self.decoder(x_future, features_latent) + + net_output = self.head(x_future[:, -1:, ]) + predicted_target = torch.cat([predicted_target, self.pred_from_net_output(net_output).cpu()], + dim=1) + + all_predictions.append(net_output) + + if self.output_type != 'distribution': + all_predictions = torch.cat(all_predictions, dim=1) else: - x_future = predicted_target[:, [-1]] - x_future = x_future if features_future is None else torch.cat( - 
[features_future[:, [idx_pred], :], x_future], dim=-1) - x_future = x_future.to(self.device) - - x_future, hidden_states = self.decoder(x_future, hx=hidden_states) - net_output = self.head(x_future[:, -1:, ]) - predicted_target = torch.cat([predicted_target, self.pred_from_net_output(net_output).cpu()], dim=1) + all_predictions = self.pred_from_net_output(all_predictions) - all_predictions.append(net_output) + return self.rescale_output(all_predictions, loc, scale, self.device) - if self.output_type != 'distribution': - all_predictions = torch.cat(all_predictions, dim=1) else: - all_predictions = self.pred_from_net_output(all_predictions) + # we follow the DeepAR implementation: + all_samples = [] + batch_size = targets_past.shape[0] - return self.rescale_output(all_predictions, loc, scale, self.device) + if self.encoder_has_hidden_states: + + if isinstance(features_latent, tuple): + repeated_state = [ + s.repeat_interleave(repeats=self.num_samples, dim=1) + for s in features_latent + ] + else: + repeated_state = features_latent.repeat_interleave(repeats=self.num_samples, dim=1) + else: + # Transformer's hidden states is of shape + repeated_state = features_latent.repeat_interleave(repeats=self.num_samples, dim=0) + + repeated_past_target = targets_past.repeat_interleave(repeats=self.num_samples, + dim=0).squeeze(1) + repeated_predicted_target = repeated_past_target[:, [-1]] + repeated_past_target = repeated_past_target[:, :-1, ] + + repeated_static_feat = features_static.repeat_interleave( + repeats=self.num_samples, dim=0 + ).unsqueeze(dim=1) if features_static is not None else None + + repeated_time_feat = features_future.repeat_interleave( + repeats=self.num_samples, dim=0 + ) if features_future is not None else None + + for idx_pred in range(self.n_prediction_steps): + if self.decoder_lagged_input: + x_future = torch.cat([repeated_past_target, repeated_predicted_target.cpu()], dim=1) + if self.decoder_has_hidden_states: + x_future, _ = get_lagged_subsequences(x_future, 1, self.decoder.lagged_value) + else: + x_future, _ = get_lagged_subsequences(x_future, idx_pred + 1, self.decoder.lagged_value) + else: + if self.decoder_has_hidden_states: + x_future = repeated_predicted_target[:, [-1]] + else: + x_future = repeated_predicted_target + + if self.decoder_has_hidden_states: + x_future = x_future if repeated_time_feat is None else torch.cat( + [repeated_time_feat[:, [idx_pred], :], x_future], dim=-1) + else: + x_future = x_future if repeated_time_feat is None else torch.cat( + [repeated_time_feat[:, idx_pred + 1, :], x_future], dim=-1) + + x_future = x_future.to(self.device) + if self.decoder_has_hidden_states: + x_future, repeated_state = self.decoder(x_future, features_latent=repeated_state) + else: + x_future = self.decoder(x_future, repeated_state) + net_output = self.head(x_future[:, -1:, ]) + samples = self.pred_from_net_output(net_output).cpu() + repeated_predicted_target = torch.cat([repeated_predicted_target, + samples], + dim=1) + all_samples.append(samples) + + all_predictions = torch.cat(all_samples, dim=1).unflatten(0, (batch_size, self.num_samples)) + + if self.aggregation == 'mean': + return self.rescale_output(torch.mean(all_predictions, dim=1), loc, scale) + elif self.aggregation == 'median': + return self.rescale_output(torch.median(all_predictions, dim=1)[0], loc, scale) + else: + raise ValueError(f'Unknown aggregation: {self.aggregation}') def predict(self, targets_past: torch.Tensor, @@ -494,7 +604,6 @@ def forward(self, ] else: repeated_state = 
hidden_states.repeat_interleave(repeats=self.num_samples, dim=1) - else: # For other models, the full past targets are passed to the network. encoder_output = self.encoder(x_past) @@ -662,8 +771,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: num_samples=self.num_samples, aggregation=self.aggregation, ) - if X['decoder_properties']['has_hidden_states']: - # decoder is RNN + if X['decoder_properties']['recurrent']: + # decoder is RNN or Transformer self.network = ForecastingSeq2SeqNet(**network_init_kwargs) elif X['decoder_properties']['multi_blocks']: self.network = NBEATSNet(**network_init_kwargs) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/InceptionTimeEncoder.py index ae48e1a9e..919c5d7b3 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/InceptionTimeEncoder.py @@ -153,8 +153,8 @@ def allowed_decoders(): @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: return { - 'shortname': 'InceptionTimeBackbone', - 'name': 'InceptionTimeBackbone', + 'shortname': 'InceptionTimeEncoder', + 'name': 'InceptionTimeEncoder', 'handles_tabular': False, 'handles_image': False, # TODO consider InceptionTime for forecasting diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py index 69f909c38..a859e4ec5 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py @@ -116,8 +116,8 @@ def _add_layer(self, layers: List[nn.Module], in_features: int, out_features: in def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None ) -> Dict[str, Union[str, bool]]: return { - 'shortname': 'TSMLPBackbone', - 'name': 'TimeSeriesMLPBackbone', + 'shortname': 'TSMLPEncoder', + 'name': 'TimeSeriesMLPEncoder', 'handles_tabular': False, 'handles_image': False, 'handles_time_series': True, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py index 2f6695536..1c9bef50e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py @@ -126,8 +126,8 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @staticmethod def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]: return { - 'shortname': 'RNNBackbone', - 'name': 'RNNBackbone', + 'shortname': 'RNNEncoder', + 'name': 'RNNEncoder', 'handles_tabular': False, 'handles_image': False, 'handles_time_series': True, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py new file mode 100644 index 000000000..95eccb28b --- /dev/null +++ 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py @@ -0,0 +1,278 @@ +from typing import Any, Dict, Optional, Tuple, List, Union +import warnings +import numpy as np + +import ConfigSpace as CS +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter +) + +import torch +from torch import nn +from gluonts.time_feature.lag import get_lags_for_frequency + +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder import ( + BaseForecastingEncoder, EncoderNetwork +) +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter +from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.transformer_util import \ + PositionalEncoding, build_transformer_layers + + +class _TransformerEncoder(EncoderNetwork): + def __init__(self, + in_features: int, + d_model: int, + transformer_encoder_layers: [nn.Module], + use_positional_encoder: bool, + use_layer_norm_output: bool, + dropout_pe: float = 0.0, + layer_norm_eps_output: Optional[float] = None, + lagged_value: Optional[Union[List, np.ndarray]] = None): + super().__init__() + self.lagged_value = lagged_value + in_features = in_features if self.lagged_value is None else len(self.lagged_value) * in_features + + self.input_layer = [nn.Linear(in_features, d_model, bias=False)] + if use_positional_encoder: + self.input_layer.append(PositionalEncoding(d_model, dropout_pe)) + self.input_layer = nn.Sequential(*self.input_layer) + + self.transformer_encoder_layers = nn.ModuleList(transformer_encoder_layers) + + self.use_layer_norm_output = use_layer_norm_output + if use_layer_norm_output: + self.norm_output = nn.LayerNorm(d_model, eps=layer_norm_eps_output) + + def forward(self, + x: torch.Tensor, + output_seq: bool = False, + mask: Optional[torch.Tensor] = None, + src_key_padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + x = self.input_layer(x) + + for encoder_layer in self.transformer_encoder_layers: + x = encoder_layer(x, mask, src_key_padding_mask) + if self.use_layer_norm_output: + x = self.norm_output(x) + if output_seq: + return x + else: + return x[:, -1, :] + + +class TransformerEncoder(BaseForecastingEncoder): + """ + Standard searchable Transformer Encoder for time series data + """ + _fixed_seq_length = False + + def __init__(self, **kwargs: Dict): + super().__init__(**kwargs) + self.lagged_value = [1, 2, 3, 4, 5, 6, 7] + + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: + d_model = 2 ** self.config['d_model_log'] + transformer_encoder_layers = [] + for layer_id in range(1, self.config['num_layers'] + 1): + new_layer = build_transformer_layers(d_model=d_model, config=self.config, + layer_id=layer_id, layer_type='encoder') + transformer_encoder_layers.append(new_layer) + + encoder = _TransformerEncoder(in_features=input_shape[-1], + d_model=d_model, + transformer_encoder_layers=transformer_encoder_layers, + use_positional_encoder=self.config['use_positional_encoder'], + use_layer_norm_output=self.config['use_layer_norm_output'], + dropout_pe=self.config.get('dropout_positional_encoder', 0.0), + layer_norm_eps_output=self.config.get('layer_norm_eps_output', None), + 
lagged_value=self.lagged_value) + return encoder + + @staticmethod + def allowed_decoders(): + """ + decoder that is compatible with the encoder + """ + return ['MLPDecoder', 'TransformerDecoder'] + + def encoder_properties(self): + encoder_properties = super().encoder_properties() + encoder_properties.update({'lagged_input': True, + }) + return encoder_properties + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + freq = X['dataset_properties'].get('freq', None) + if 'lagged_value' in X['dataset_properties']: + self.lagged_value = X['dataset_properties']['lagged_value'] + if freq is not None: + try: + freq = FREQUENCY_MAP[freq] + lagged_values = get_lags_for_frequency(freq) + self.lagged_value = [0] + lagged_values + except Exception: + warnings.warn(f'cannot find the proper lagged value for {freq}, we use the default lagged value') + # If + pass + return super().fit(X, y) + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + transformer_encoder_kwargs = {'d_model_log': self.config['d_model_log']} # used for initialize + X.update({'transformer_encoder_kwargs': transformer_encoder_kwargs}) + return super().transform(X) + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + return { + 'shortname': 'TransformerEncoder', + 'name': 'TransformerEncoder', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict] = None, + num_layers: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='num_layers', + value_range=(1, 8), + default_value=4), + n_head_log: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='n_head_log', + value_range=(1, 4), + default_value=3), + d_model_log: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='d_model_log', + value_range=(4, 10), + default_value=5), + d_feed_forward_log: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='d_feed_forward_log', + value_range=(6, 12), + default_value=7), + layer_norm_eps: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='layer_norm_eps', + value_range=(1e-7, 1e-3), + default_value=1e-5, + log=True), + use_positional_encoder: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='use_positional_encoder', + value_range=(True, False), + default_value=True), + use_layer_norm_output: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='use_layer_norm_output', + value_range=(True, False), + default_value=True), + activation: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="activation", + value_range=('relu', 'gelu'), + default_value='relu', + ), + use_dropout: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="use_dropout", + value_range=(True, False), + default_value=False, + ), + dropout: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="dropout", + value_range=(0, 0.8), + default_value=0.5, + ), + decoder_type: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='decoder_type', + value_range=('MLPDecoder', 'TransformerDecoder'), + default_value='TransformerDecoder') + ) -> ConfigurationSpace: + """ + get hyperparameter search space for Transformer, Given that d_model must be a multiple of n_head_log, we + consider their log value (with base 2) as the hyperparameters + + """ + cs = CS.ConfigurationSpace() + + 
add_hyperparameter(cs, activation, CategoricalHyperparameter) + add_hyperparameter(cs, d_model_log, UniformIntegerHyperparameter) + + min_transformer_layers, max_transformer_layers = num_layers.value_range + + num_layers = get_hyperparameter(num_layers, UniformIntegerHyperparameter) + use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) + + # We can have dropout in the network for + # better generalization + use_positional_encoder = get_hyperparameter(use_positional_encoder, CategoricalHyperparameter) + + dropout_pe = HyperparameterSearchSpace(hyperparameter='dropout_positional_encoder', + value_range=dropout.value_range, + default_value=dropout.default_value, + log=dropout.log) + dropout_pe = get_hyperparameter(dropout_pe, UniformFloatHyperparameter) + + cs.add_hyperparameters([num_layers, use_dropout, use_positional_encoder, dropout_pe]) + cs.add_condition(CS.AndConjunction( + CS.EqualsCondition(dropout_pe, use_dropout, True), + CS.EqualsCondition(dropout_pe, use_positional_encoder, True) + )) + + for i in range(1, int(max_transformer_layers) + 1): + n_head_log_search_space = HyperparameterSearchSpace(hyperparameter='num_head_log_%d' % i, + value_range=n_head_log.value_range, + default_value=n_head_log.default_value, + log=n_head_log.log) + d_feed_forward_log_search_space = HyperparameterSearchSpace(hyperparameter='d_feed_forward_log_%d' % i, + value_range=d_feed_forward_log.value_range, + default_value=d_feed_forward_log.default_value) + + layer_norm_eps_search_space = HyperparameterSearchSpace(hyperparameter='layer_norm_eps_%d' % i, + value_range=layer_norm_eps.value_range, + default_value=layer_norm_eps.default_value, + log=layer_norm_eps.log) + + n_head_log_hp = get_hyperparameter(n_head_log_search_space, UniformIntegerHyperparameter) + d_feed_forward_log_hp = get_hyperparameter(d_feed_forward_log_search_space, UniformIntegerHyperparameter) + layer_norm_eps_hp = get_hyperparameter(layer_norm_eps_search_space, UniformFloatHyperparameter) + + layers_dims = [n_head_log_hp, d_feed_forward_log_hp, layer_norm_eps_hp] + + cs.add_hyperparameters(layers_dims) + + if i > int(min_transformer_layers): + # The units of layer i should only exist + # if there are at least i layers + cs.add_conditions([ + CS.GreaterThanCondition(hp_layer, num_layers, i - 1) for hp_layer in layers_dims + ]) + dropout_search_space = HyperparameterSearchSpace(hyperparameter='dropout_%d' % i, + value_range=dropout.value_range, + default_value=dropout.default_value, + log=dropout.log) + dropout_hp = get_hyperparameter(dropout_search_space, UniformFloatHyperparameter) + cs.add_hyperparameter(dropout_hp) + + dropout_condition_1 = CS.EqualsCondition(dropout_hp, use_dropout, True) + + if i > int(min_transformer_layers): + dropout_condition_2 = CS.GreaterThanCondition(dropout_hp, num_layers, i - 1) + cs.add_condition(CS.AndConjunction(dropout_condition_1, dropout_condition_2)) + else: + cs.add_condition(dropout_condition_1) + + use_layer_norm_output = get_hyperparameter(use_layer_norm_output, CategoricalHyperparameter) + layer_norm_eps_output = HyperparameterSearchSpace(hyperparameter='layer_norm_eps_output', + value_range=layer_norm_eps.value_range, + default_value=layer_norm_eps.default_value, + log=layer_norm_eps.log) + + layer_norm_eps_output = get_hyperparameter(layer_norm_eps_output, UniformFloatHyperparameter) + cs.add_hyperparameters([use_layer_norm_output, layer_norm_eps_output]) + cs.add_condition(CS.EqualsCondition(layer_norm_eps_output, use_layer_norm_output, True)) + + add_hyperparameter(cs, 
decoder_type, CategoricalHyperparameter) + + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index a96a8fe08..59ec726d0 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -15,13 +15,13 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.base_forecasting_decoder import \ - BaseForecastingDecoder + BaseForecastingDecoder, RecurrentDecoderNetwork from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter, FitRequirement from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP -class RNN_Module(nn.Module): +class RNN_Module(RecurrentDecoderNetwork): def __init__(self, in_features: int, hidden_size: int, @@ -43,11 +43,11 @@ def __init__(self, bidirectional=False, batch_first=True) - def forward(self, x: torch.Tensor, - hx: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> Tuple[torch.Tensor, ...]: - if x.ndim == 2: - x = x.unsqueeze(1) - outputs, hidden_state, = self.lstm(x, hx) + def forward(self, x_future: torch.Tensor, + features_latent: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> Tuple[torch.Tensor, ...]: + if x_future.ndim == 2: + x_future = x_future.unsqueeze(1) + outputs, hidden_state, = self.lstm(x_future, features_latent) return outputs, hidden_state @@ -131,5 +131,5 @@ def get_hyperparameter_search_space( dataset_properties: Optional[Dict] = None, ) -> ConfigurationSpace: cs = CS.ConfigurationSpace() - cs.add_hyperparameter(Constant('decoder_type', 'RNNDecoder')) # this helps the encoder to recognize the decoder. 
+ cs.add_hyperparameter(Constant('decoder_type', 'RNNDecoder')) # this helps the encoder to recognize the decoder return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py new file mode 100644 index 000000000..df8421357 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -0,0 +1,278 @@ +from typing import Any, Dict, Optional, Tuple, List, Union + +import warnings +import torch +from torch import nn +import numpy as np +from gluonts.time_feature.lag import get_lags_for_frequency + +import ConfigSpace as CS +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformIntegerHyperparameter, + UniformFloatHyperparameter +) + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.base_forecasting_decoder import \ + BaseForecastingDecoder, RecurrentDecoderNetwork + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.transformer_util import \ + PositionalEncoding, build_transformer_layers + +from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter, FitRequirement +from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP + + +class _TransformerDecoder(RecurrentDecoderNetwork): + def __init__(self, + in_features: int, + d_model: int, + transformer_decoder_layers: [nn.Module], + use_positional_decoder: bool, + use_layer_norm_output: bool, + dropout_pd: float = 0.0, + layer_norm_eps_output: Optional[float] = None, + lagged_value: Optional[Union[List, np.ndarray]] = None): + super().__init__() + self.lagged_value = lagged_value + in_features = in_features if self.lagged_value is None else len(self.lagged_value) * in_features + + self.input_layer = [nn.Linear(in_features, d_model, bias=False)] + if use_positional_decoder: + self.input_layer.append(PositionalEncoding(d_model, dropout_pd)) + self.input_layer = nn.Sequential(*self.input_layer) + + self.transformer_decoder_layers = nn.ModuleList(transformer_decoder_layers) + + nn.TransformerDecoder + + self.use_layer_norm_output = use_layer_norm_output + if use_layer_norm_output: + self.norm_output = nn.LayerNorm(d_model, eps=layer_norm_eps_output) + + def forward(self, x_future: torch.Tensor, features_latent: torch.Tensor, + tgt_mask: Optional[torch.Tensor] = None, + memory_mask: Optional[torch.Tensor] = None, + tgt_key_padding_mask: Optional[torch.Tensor] = None, + memory_key_padding_mask: Optional[torch.Tensor] = None): + output = self.input_layer(x_future) + for decoder_layer in self.transformer_decoder_layers: + output = decoder_layer(output, features_latent, tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + + if self.use_layer_norm_output: + output = self.norm_output(output) + + return output + + +class ForecastingTransformerDecoder(BaseForecastingDecoder): + def __init__(self, **kwargs: Dict): + super().__init__(**kwargs) + # 
RNN is naturally auto-regressive. However, we will not consider it as a decoder for deep AR model + self.auto_regressive = True + self.transformer_encoder_kwargs = None + self.lagged_value = [0, 1, 2, 3, 4, 5, 6, 7] + + def _build_decoder(self, + input_shape: Tuple[int, ...], + n_prediction_heads: int, + dataset_properties: Dict) -> nn.Module: + d_model = 2 ** self.transformer_encoder_kwargs['d_model_log'] + transformer_decoder_layers = [] + for layer_id in range(1, self.config['num_layers'] + 1): + new_layer = build_transformer_layers(d_model=d_model, config=self.config, + layer_id=layer_id, layer_type='decoder') + transformer_decoder_layers.append(new_layer) + + decoder = _TransformerDecoder(in_features=dataset_properties['output_shape'][-1], + d_model=d_model, + transformer_decoder_layers=transformer_decoder_layers, + use_positional_decoder=self.config['use_positional_decoder'], + use_layer_norm_output=self.config['use_layer_norm_output'], + dropout_pd=self.config.get('dropout_positional_decoder', 0.0), + layer_norm_eps_output=self.config.get('layer_norm_eps_output', None), + lagged_value=self.lagged_value) + return decoder, d_model + + @property + def _required_fit_requirements(self) -> List[FitRequirement]: + fit_requirement = super(ForecastingTransformerDecoder, self)._required_fit_requirements + fit_requirement.append(FitRequirement('transformer_encoder_kwargs', (Dict,), user_defined=False, + dataset_property=False)) + return fit_requirement + + def decoder_properties(self): + decoder_properties = super().decoder_properties() + decoder_properties.update({'recurrent': True, + 'lagged_input': True, + 'mask_on_future_target': True, + }) + return decoder_properties + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + self.transformer_encoder_kwargs = X['transformer_encoder_kwargs'] + + freq = X['dataset_properties'].get('freq', None) + if 'lagged_value' in X['dataset_properties']: + self.lagged_value = X['dataset_properties']['lagged_value'] + if freq is not None: + try: + freq = FREQUENCY_MAP[freq] + lagged_values = get_lags_for_frequency(freq) + self.lagged_value = [0] + lagged_values + except Exception: + warnings.warn(f'cannot find the proper lagged value for {freq}, we use the default lagged value') + pass + return super().fit(X, y) + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'TransformerDecoder', + 'name': 'TransformerDecoder', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + @property + def fitted_encoder(self): + return ['TransformerEncoder'] + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict] = None, + num_layers: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='num_layers', + value_range=(1, 8), + default_value=4), + n_head_log: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='n_head_log', + value_range=(1, 4), + default_value=3), + d_feed_forward_log: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='d_feed_forward_log', + value_range=(6, 12), + default_value=7), + layer_norm_eps: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='layer_norm_eps', + value_range=(1e-7, 1e-3), + default_value=1e-5, + log=True), + use_positional_decoder: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='use_positional_decoder', + value_range=(True, 
False), + default_value=True), + use_layer_norm_output: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='use_layer_norm_output', + value_range=(True, False), + default_value=True), + activation: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="activation", + value_range=('relu', 'gelu'), + default_value='relu', + ), + use_dropout: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="use_dropout", + value_range=(True, False), + default_value=False, + ), + dropout: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="dropout", + value_range=(0, 0.8), + default_value=0.5, + ), + ) -> ConfigurationSpace: + """ + get hyperparameter search space for Transformer, Given that d_model must be a multiple of n_head_log, we + consider their log value (with base 2) as the hyperparameters + + """ + cs = CS.ConfigurationSpace() + + add_hyperparameter(cs, activation, CategoricalHyperparameter) + + min_transformer_layers, max_transformer_layers = num_layers.value_range + + num_layers = get_hyperparameter(num_layers, UniformIntegerHyperparameter) + use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) + + # We can have dropout in the network for + # better generalization + use_positional_decoder = get_hyperparameter(use_positional_decoder, CategoricalHyperparameter) + + dropout_pd = HyperparameterSearchSpace(hyperparameter='dropout_positional_decoder', + value_range=dropout.value_range, + default_value=dropout.default_value, + log=dropout.log) + dropout_pd = get_hyperparameter(dropout_pd, UniformFloatHyperparameter) + + cs.add_hyperparameters([num_layers, use_dropout, use_positional_decoder, dropout_pd]) + cs.add_condition(CS.AndConjunction( + CS.EqualsCondition(dropout_pd, use_dropout, True), + CS.EqualsCondition(dropout_pd, use_positional_decoder, True) + )) + + for i in range(1, int(max_transformer_layers) + 1): + n_head_log_search_space = HyperparameterSearchSpace(hyperparameter='num_head_log_%d' % i, + value_range=n_head_log.value_range, + default_value=n_head_log.default_value, + log=n_head_log.log) + d_feed_forward_log_search_space = HyperparameterSearchSpace(hyperparameter='d_feed_forward_log_%d' % i, + value_range=d_feed_forward_log.value_range, + default_value=d_feed_forward_log.default_value) + + layer_norm_eps_search_space = HyperparameterSearchSpace(hyperparameter='layer_norm_eps_%d' % i, + value_range=layer_norm_eps.value_range, + default_value=layer_norm_eps.default_value, + log=layer_norm_eps.log) + + n_head_log_hp = get_hyperparameter(n_head_log_search_space, UniformIntegerHyperparameter) + d_feed_forward_log_hp = get_hyperparameter(d_feed_forward_log_search_space, UniformIntegerHyperparameter) + layer_norm_eps_search_hp = get_hyperparameter(layer_norm_eps_search_space, UniformFloatHyperparameter) + + layers_dims = [n_head_log_hp, d_feed_forward_log_hp, layer_norm_eps_search_hp] + + cs.add_hyperparameters(layers_dims) + + if i > int(min_transformer_layers): + # The units of layer i should only exist + # if there are at least i layers + cs.add_conditions([ + CS.GreaterThanCondition(hp_layer, num_layers, i - 1) for hp_layer in layers_dims + ]) + dropout_search_space = HyperparameterSearchSpace(hyperparameter='dropout_%d' % i, + value_range=dropout.value_range, + default_value=dropout.default_value, + log=dropout.log) + dropout_hp = get_hyperparameter(dropout_search_space, UniformFloatHyperparameter) + cs.add_hyperparameter(dropout_hp) + + dropout_condition_1 = CS.EqualsCondition(dropout_hp, 
use_dropout, True) + + if i > int(min_transformer_layers): + dropout_condition_2 = CS.GreaterThanCondition(dropout_hp, num_layers, i - 1) + cs.add_condition(CS.AndConjunction(dropout_condition_1, dropout_condition_2)) + else: + cs.add_condition(dropout_condition_1) + + use_layer_norm_output = get_hyperparameter(use_layer_norm_output, CategoricalHyperparameter) + layer_norm_eps_output = HyperparameterSearchSpace(hyperparameter='layer_norm_eps_output', + value_range=layer_norm_eps.value_range, + default_value=layer_norm_eps.default_value, + log=layer_norm_eps.log) + layer_norm_eps_output = get_hyperparameter(layer_norm_eps_output, UniformFloatHyperparameter) + + cs.add_hyperparameters([use_layer_norm_output, layer_norm_eps_output]) + cs.add_condition(CS.EqualsCondition(layer_norm_eps_output, use_layer_norm_output, True)) + + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 0a74674aa..1e1a6ecaa 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -1,6 +1,7 @@ from abc import abstractmethod, ABC from typing import Any, Dict, Iterable, Tuple, List, Optional +import torch from torch import nn from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape @@ -8,6 +9,22 @@ from autoPyTorch.pipeline.components.base_component import BaseEstimator, autoPyTorchComponent +class RecurrentDecoderNetwork(nn.Module): + def forward(self, x_future: torch.Tensor, features_latent: torch.Tensor): + """ + Base forecasting Decoder Network, its output needs to be a 3-d Tensor: + + + Args: + x_future torch.Tensor(B, L_future, N_out), the future features + features_latent: torch.Tensor(B, L_encoder, N), output of the encoder network, or the hidden states + Returns: + net_output: torch.Tensor with shape either (B, L_future, N) + + """ + raise NotImplementedError + + class BaseForecastingDecoder(autoPyTorchComponent): """ Base class for network heads used for forecasting. 
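A minimal sketch of a concrete decoder following the contract described above (illustrative only: the GRU choice, the class name and the shape comments are assumptions; the actual implementations introduced by this patch are RNN_Module and _TransformerDecoder):

    import torch
    from torch import nn

    class SketchGRUDecoder(RecurrentDecoderNetwork):
        def __init__(self, in_features: int, hidden_size: int):
            super().__init__()
            self.gru = nn.GRU(input_size=in_features, hidden_size=hidden_size, batch_first=True)

        def forward(self, x_future: torch.Tensor, features_latent: torch.Tensor):
            # x_future: (B, L_future, N_out) future features; features_latent: hidden state handed over
            # by the encoder (a transformer decoder would instead receive the encoder output sequence).
            if x_future.ndim == 2:
                x_future = x_future.unsqueeze(1)
            outputs, hidden_state = self.gru(x_future, features_latent)
            # outputs: (B, L_future, hidden_size), the 3-d tensor the base class requires.
            return outputs, hidden_state

Written this way, ForecastingSeq2SeqNet can call decoder(x_future, features_latent=...) step by step during auto-regressive inference, exactly as the recurrent branch of its forward method does.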
@@ -44,6 +61,7 @@ def decoder_properties(self): 'recurrent': False, 'lagged_input': False, 'multi_blocks': False, + 'mask_on_future_target': False, } return decoder_properties diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py new file mode 100644 index 000000000..a43a5c033 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py @@ -0,0 +1,68 @@ +from typing import Dict, Any +import torch +from torch import nn +import math + + +def build_transformer_layers(d_model: int, config: Dict[str, Any], layer_id: int, layer_type='encoder'): + nhead = 2 ** config['num_head_log_%d' % layer_id] + dim_feedforward = 2 ** config['d_feed_forward_log_%d' % layer_id] + dropout = config['dropout_%d' % layer_id] + activation = config['activation'] + layer_norm_eps = config['layer_norm_eps_%d' % layer_id] + if layer_type == 'encoder': + return nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, + dropout=dropout, activation=activation, + layer_norm_eps=layer_norm_eps, batch_first=True) + elif layer_type == 'decoder': + return nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, + dropout=dropout, activation=activation, + layer_norm_eps=layer_norm_eps, batch_first=True) + else: + raise ValueError('layer_type must be encoder or decoder!') + + +# https://github.com/pytorch/examples/blob/master/word_language_model/model.py +class PositionalEncoding(nn.Module): + r""" + NOTE: different from the raw implementation, this model is designed for the batch_first inputs! + Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). + Examples: + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, d_model, dropout=0.1, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + r"""Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). 
+ Shape: + x: [batch size, sequence length embed dim] + output: [batch size, sequence length, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + x = x + self.pe[:, :x.size(1), :] + return self.dropout(x) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 10dfadc23..2e4da175c 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -8,8 +8,9 @@ import numpy as np import torch +import collections from torch.utils.data.sampler import SubsetRandomSampler -from torch._six import container_abcs, string_classes, int_classes +from torch._six import string_classes from torch.utils.data._utils.collate import np_str_obj_array_pattern, default_collate_err_msg_format, default_collate import torchvision @@ -91,11 +92,11 @@ def __call__(self, batch, padding_value=0.0): return torch.as_tensor(batch) elif isinstance(elem, float): return torch.tensor(batch, dtype=torch.float64) - elif isinstance(elem, int_classes): + elif isinstance(elem, int): return torch.tensor(batch) elif isinstance(elem, string_classes): return batch - elif isinstance(elem, container_abcs.Mapping): + elif isinstance(elem, collections.abc.Mapping): return {key: self([d[key] for d in batch]) if key != "past_target" else self([d[key] for d in batch], self.target_padding_value) for key in elem} raise TypeError(f"Unsupported data type {elem_type}") @@ -334,7 +335,12 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: # discontinuity where a new sequence is sampled: [0, 1, 2 ,3, 7 ,8 ]. # A new sequence must start from the index 7. 
We could then split each unique values to represent the length # of each split - _, seq_train_length = np.unique(train_split - np.arange(len(train_split)), return_counts=True) + dataset_seq_length_train_all = X['dataset_properties']['sequence_lengths_train'] + if np.sum(dataset_seq_length_train_all) == len(train_split): + # this works if we want to fit the entire datasets + seq_train_length = np.array(dataset_seq_length_train_all) + else: + _, seq_train_length = np.unique(train_split - np.arange(len(train_split)), return_counts=True) # create masks for masking seq_idx_inactivate = np.where(self.random_state.rand(seq_train_length.size) > fraction_seq) seq_train_length[seq_idx_inactivate] = 0 diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index 16ddf65a7..5fc2ef061 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -415,6 +415,8 @@ def early_stop_handler(self, X: Dict[str, Any]) -> bool: if self.checkpoint_dir is None: self.checkpoint_dir = tempfile.mkdtemp(dir=X['backend'].temporary_directory) + if not os.path.exists(self.checkpoint_dir): + os.makedirs(self.checkpoint_dir, exist_ok=True) epochs_since_best = self.run_summary.get_last_epoch() - self.run_summary.get_best_epoch() # Save the checkpoint if there is a new best epoch diff --git a/requirements.txt b/requirements.txt index b54faec45..94e368180 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ pandas -torch +torch==1.10.1 torchvision tensorboard scikit-learn>=0.24.0,<0.25.0 From aa0221a3667f8d1e1e2f86a812906c3e892c50af Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 12 Jan 2022 16:07:40 +0100 Subject: [PATCH 115/347] maint --- .../configs/forecasting_init_cfgs.json | 7 +- autoPyTorch/optimizer/utils.py | 6 +- .../forecasting_backbone/TCNEncoder.py | 2 +- .../TransformerEncoder.py | 4 +- .../forecasting_decoder/NBEATSDecoder.py | 2 +- .../forecasting_decoder/TransformerDecoder.py | 4 +- .../forecasting_backbone/transformer_util.py | 2 +- .../forecasting_network_head/NBEATS_head.py | 8 +- .../time_series_forecasting_data_loader.py | 98 +++++++++++++++---- .../forecasting_base_trainer.py | 4 + .../pipeline/time_series_forecasting.py | 6 +- 11 files changed, 108 insertions(+), 35 deletions(-) diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index 81bb88e42..cf6cda1e0 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -2,10 +2,11 @@ "trainer": { "data_loader:batch_size": 32, "data_loader:backcast": false, + "data_loader:sample_strategy": "seq_uniform", "data_loader:num_batches_per_epoch": 50, "imputer:numerical_strategy": "median", "lr_scheduler:__choice__": "ReduceLROnPlateau", - "lr_scheduler:ReduceLROnPlateau:mode": "min", + "lr_scheduler:ReduceLROnPlateau:mode": "max", "lr_scheduler:ReduceLROnPlateau:factor": 0.5, "lr_scheduler:ReduceLROnPlateau:patience": 10, "optimizer:__choice__": "AdamOptimizer", @@ -166,6 +167,7 @@ "network:num_samples": 100 }, "NBEATS-I": { + "target_scaler:__choice__": "TargetNoScaler", "data_loader:backcast": true, "data_loader:backcast_period": 2, "loss:__choice__": "RegressionLoss", @@ -194,8 +196,7 @@ "network_backbone:NBEATSDecoder:dropout_I_2": 0.1 }, "NBEATS-G": { - "data_loader:backcast": true, - "data_loader:backcast_period": 2, + "target_scaler:__choice__": "TargetNoScaler", 
"loss:__choice__": "RegressionLoss", "loss:RegressionLoss:loss_name": "mape", "network:net_out_type": "regression", diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index 0910630e4..58d8e57e0 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -52,13 +52,15 @@ def read_forecasting_init_configurations(config_space: ConfigurationSpace, for model_name in suggested_init_models: cfg_tmp = cfg_trainer.copy() - if 'NBEATS' not in model_name: - cfg_tmp['data_loader:window_size'] = window_size + model_cfg = models_name_to_cfgs.get(model_name, None) if model_cfg is None: warnings.warn(f'Cannot to find the corresponding information of model {model_name} from,' f' forecasting_init_cfgs, currently only {list(models_name_to_cfgs.keys())} are ' f'supported') + continue + if not model_cfg.get('data_loader:backcast', False): + cfg_tmp['data_loader:window_size'] = window_size cfg_tmp.update(model_cfg) initial_configurations_dict.append(cfg_tmp) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py index 3bbb40c6c..94e761ddb 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py @@ -126,7 +126,7 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: encoder = _TemporalConvNet(input_shape[-1], num_channels, kernel_size=kernel_size, - dropout=self.config["dropout"] if self.config["use_dropout"] else 0.0 + dropout=self.config[f"dropout_{i}"] if self.config["use_dropout"] else 0.0 ) self._receptive_field = encoder.receptive_field return encoder diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py index 95eccb28b..6094f0320 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py @@ -142,8 +142,8 @@ def get_hyperparameter_search_space( dataset_properties: Optional[Dict] = None, num_layers: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='num_layers', - value_range=(1, 8), - default_value=4), + value_range=(1, 4), + default_value=1), n_head_log: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_head_log', value_range=(1, 4), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 44c7f38e3..5cf944eec 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -66,7 +66,7 @@ def build_backbone(self): return layers def _add_layer(self, layers: List[nn.Module], in_features: int) -> None: - layers.append(nn.Linear(in_features, self.width, bias=False)) + layers.append(nn.Linear(in_features, self.width)) if self.normalization == 'BN': layers.append(nn.BatchNorm1d(self.width)) elif self.normalization == 'LN': diff --git 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index df8421357..e35164b01 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -153,8 +153,8 @@ def get_hyperparameter_search_space( dataset_properties: Optional[Dict] = None, num_layers: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='num_layers', - value_range=(1, 8), - default_value=4), + value_range=(1, 4), + default_value=1), n_head_log: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_head_log', value_range=(1, 4), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py index a43a5c033..fad9dfec4 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py @@ -7,7 +7,7 @@ def build_transformer_layers(d_model: int, config: Dict[str, Any], layer_id: int, layer_type='encoder'): nhead = 2 ** config['num_head_log_%d' % layer_id] dim_feedforward = 2 ** config['d_feed_forward_log_%d' % layer_id] - dropout = config['dropout_%d' % layer_id] + dropout = config.get('dropout_%d' % layer_id, 0.0) activation = config['activation'] layer_norm_eps = config['layer_norm_eps_%d' % layer_id] if layer_type == 'encoder': diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py index a6b267432..3e07ec5db 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py @@ -34,15 +34,15 @@ def linspace(backcast_length: int, forecast_length: int, centered: bool = False) def get_generic_heads(block_width: int, thetas_dim: int, forecast_length: int, backcast_length: int): - backcast_head = nn.Sequential(nn.Linear(block_width, thetas_dim, bias=False), + backcast_head = nn.Sequential(nn.Linear(block_width, thetas_dim), nn.Linear(thetas_dim, backcast_length)) - forecast_head = nn.Sequential(nn.Linear(block_width, thetas_dim, bias=False), + forecast_head = nn.Sequential(nn.Linear(block_width, thetas_dim), nn.Linear(thetas_dim, forecast_length)) return backcast_head, forecast_head def get_trend_heads(block_width: int, thetas_dim: int, forecast_length: int, backcast_length: int): - base_layer = nn.Linear(block_width, thetas_dim, bias=False) + base_layer = nn.Linear(block_width, thetas_dim) backcast_linspace, forecast_linspace = linspace(backcast_length, forecast_length, centered=True) norm = np.sqrt(forecast_length / thetas_dim) # ensure range of predictions is comparable to input @@ -60,7 +60,7 @@ def get_trend_heads(block_width: int, thetas_dim: int, forecast_length: int, bac def get_seasonality_heads(block_width: int, thetas_dim: int, forecast_length: int, backcast_length: int): - base_layer = nn.Linear(block_width, forecast_length, bias=False) + base_layer = 
nn.Linear(block_width, forecast_length) backcast_linspace, forecast_linspace = linspace(backcast_length, forecast_length, centered=False) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 2e4da175c..10d86f54f 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -106,14 +106,18 @@ class TimeSeriesSampler(SubsetRandomSampler): def __init__(self, indices: Sequence[int], seq_lengths: Sequence[int], - num_instances_per_seqs: Optional[List[int]] = None, + num_instances_per_seqs: Optional[List[float]] = None, min_start: int = 0, generator: Optional[torch.Generator] = None) -> None: """ A sampler designed for time series sequence. For the sake of efficiency, it will not sample each possible sequences from indices. Instead, it samples 'num_instances_per_seqs' for each sequence. This sampler samples the instances in a Latin-Hypercube likewise way: we divide each sequence in to num_instances_per_seqs interval - and randomly sample one instance from each interval. + and randomly sample one instance from each interval. If num_instances_per_seqs is not an integral, then the + first interval is selected with a certain probability: + for instance, if we want to sample 1.3 instance from a sequence [0,1,2,3,4,5], then we first divide the seuqence + into two parts: [0, 3] and [3, 6], one sample is sampled from the second part, while an expected value of 0.3 is + sampled from the first part (This part will be sampled in the very end with torch.multinomial) Parameters ---------- @@ -121,8 +125,9 @@ def __init__(self, The set of all the possible indices that can be sampled from seq_lengths: Sequence[int] lengths of each sequence, applied to unsqueeze indices - num_instances_per_seqs: OPtional[List[int]]=None - how many instances are sampled in each sequence, if it is None, all the sequences are sampled + num_instances_per_seqs: Optional[List[int]]=None + expected number of instances to be sampled in each sequence, if it is None, all the sequences will be + sampled min_start: int the how many first instances we want to skip (the first few sequences need to be padded with 0) generator: Optional[torch.Generator] @@ -136,24 +141,43 @@ def __init__(self, if len(seq_lengths) != len(num_instances_per_seqs): raise ValueError(f'the lengths of seq_lengths must equal the lengths of num_instances_per_seqs.' 
f'However, they are {len(seq_lengths)} versus {len(num_instances_per_seqs)}') - seq_intervals = [] + seq_intervals_int = [] + seq_intervals_decimal = [] + # seq_intervals_decimal_length = [] + num_expected_ins_decimal = [] idx_tracker = 0 for seq_idx, (num_instances, seq_length) in enumerate(zip(num_instances_per_seqs, seq_lengths)): idx_end = idx_tracker + seq_length idx_start = idx_tracker + min_start - interval = np.linspace(idx_start, idx_end, num_instances + 1, endpoint=True, dtype=np.int) - seq_intervals.append(interval) + if idx_start > idx_end: + idx_start = idx_tracker + + num_interval = int(np.ceil(num_instances)) + interval = np.linspace(idx_start, idx_end, num_interval + 1, endpoint=True, dtype=np.int) + + num_expected_ins_decimal.append(np.modf(num_instances)[0]) + seq_intervals_decimal.append(interval[:2]) + + seq_intervals_int.append(interval[1:]) + idx_tracker += seq_length + + num_expected_ins_decimal = np.stack(num_expected_ins_decimal) + # seq_intervals_decimal_length = np.stack(seq_intervals_decimal_length) self.seq_lengths = seq_lengths - self.num_instances = np.sum(num_instances_per_seqs) - self.seq_intervals = seq_intervals + self.num_instances = int(np.round(np.sum(num_instances_per_seqs))) + + self.seq_intervals_decimal = torch.from_numpy(np.stack(seq_intervals_decimal)) + self.seq_intervals_int = seq_intervals_int + + self.num_expected_ins_decimal = torch.from_numpy(num_expected_ins_decimal) def __iter__(self): if self.iter_all_seqs: return super().__iter__() samples = torch.ones(self.num_instances, dtype=torch.int) idx_samples_start = 0 - idx_seq_tracker = 0 - for idx_seq, (interval, seq_length) in enumerate(zip(self.seq_intervals, self.seq_lengths)): + idx_samples_end = 0 + for idx_seq, (interval, seq_length) in enumerate(zip(self.seq_intervals_int, self.seq_lengths)): if len(interval) == 1: continue @@ -161,11 +185,19 @@ def __iter__(self): idx_samples_end = idx_samples_start + num_samples samples_shift = torch.rand(num_samples, generator=self.generator) * (interval[1:] - interval[:-1]) - samples_seq = torch.floor(samples_shift + interval[:-1]).int() + idx_seq_tracker + samples_seq = torch.floor(samples_shift + interval[:-1]).int() samples[idx_samples_start: idx_samples_end] = samples_seq idx_samples_start = idx_samples_end - idx_seq_tracker += seq_length + num_samples_remain = self.num_instances - idx_samples_end + if num_samples_remain > 0: + samples_idx = torch.multinomial(self.num_expected_ins_decimal, num_samples_remain) + seq_interval = self.seq_intervals_decimal[samples_idx] + + samples_shift = torch.rand(num_samples_remain, generator=self.generator) + samples_shift *= (seq_interval[:, 1] - seq_interval[:, 0]) + samples_seq_remain = torch.floor(samples_shift + seq_interval[:, 0]).int() + samples[-num_samples_remain:] = samples_seq_remain return (samples[i] for i in torch.randperm(self.num_instances, generator=self.generator)) @@ -237,6 +269,7 @@ def __init__(self, window_size: int = 1, num_batches_per_epoch: Optional[int] = 50, n_prediction_steps: int = 1, + sample_strategy= 'seq_uniform', random_state: Optional[np.random.RandomState] = None) -> None: """ initialize a dataloader @@ -260,6 +293,7 @@ def __init__(self, # length of the tail, for instance if a sequence_length = 2, sample_interval =2, n_prediction = 2, # the time sequence should look like: [X, y, X, y, y] [test_data](values in tail is marked with X) # self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 + self.sample_strategy = sample_strategy self.subseq_length = 
self.window_size self.num_batches_per_epoch = num_batches_per_epoch if num_batches_per_epoch is not None else np.inf self.padding_collector = None @@ -284,6 +318,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: if self.backcast: self.window_size = self.backcast_period * self.n_prediction_steps + # this value corresponds to budget type resolution sample_interval = X.get('sample_interval', 1) padding_value = X.get('required_padding_value', 0.0) @@ -335,6 +370,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: # discontinuity where a new sequence is sampled: [0, 1, 2 ,3, 7 ,8 ]. # A new sequence must start from the index 7. We could then split each unique values to represent the length # of each split + + # TODO consider min_starrt as a hp (multiple of self.n_prediction_steps?) + min_start = self.n_prediction_steps + dataset_seq_length_train_all = X['dataset_properties']['sequence_lengths_train'] if np.sum(dataset_seq_length_train_all) == len(train_split): # this works if we want to fit the entire datasets @@ -345,9 +384,24 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: seq_idx_inactivate = np.where(self.random_state.rand(seq_train_length.size) > fraction_seq) seq_train_length[seq_idx_inactivate] = 0 # this budget will reduce the number of samples inside each sequence, e.g., the samples becomes more sparse - num_instances_per_seqs = np.ceil(np.ceil(num_instances_train / num_instances_dataset * seq_train_length) * - fraction_samples_per_seq) - num_instances_per_seqs = num_instances_per_seqs.astype(seq_train_length.dtype) + """ + num_instances_per_seqs = np.ceil( + np.ceil(num_instances_train / (num_instances_dataset - min_start) * seq_train_length) * + fraction_samples_per_seq + ) + """ + if self.sample_strategy == 'length_uniform': + available_seq_length = seq_train_length - min_start + available_seq_length = np.where(available_seq_length <= 1, 1, available_seq_length) + num_instances_per_seqs = num_instances_train / num_instances_dataset * available_seq_length + elif self.sample_strategy == 'seq_uniform': + num_seq_train = len(seq_train_length) + num_instances_per_seqs = np.repeat(num_instances_train / num_seq_train, num_seq_train) + else: + raise NotImplementedError(f'Unsupported sample strategy: {self.sample_strategy}') + num_instances_per_seqs *= fraction_samples_per_seq + + #num_instances_per_seqs = num_instances_per_seqs.astype(seq_train_length.dtype) # at least one element of each sequence should be selected # TODO consider the case where num_instances_train is greater than num_instances_dataset, @@ -356,7 +410,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: sampler_indices_train = np.arange(num_instances_dataset) self.sampler_train = TimeSeriesSampler(indices=sampler_indices_train, seq_lengths=seq_train_length, - num_instances_per_seqs=num_instances_per_seqs) + num_instances_per_seqs=num_instances_per_seqs, + min_start=min_start) self.train_data_loader = torch.utils.data.DataLoader( train_dataset, @@ -489,6 +544,10 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, HyperparameterSearchSpace(hyperparameter="num_batches_per_epoch", value_range=(30, 100), default_value=50), + sample_strategy: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="sample_strategy", + value_range=('length_uniform', 'seq_uniform'), + default_value='seq_uniform'), backcast: HyperparameterSearchSpace = 
HyperparameterSearchSpace(hyperparameter='backcast', value_range=(True, False), @@ -512,6 +571,10 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, window_size (int): window size, (if activate) this value directly determines the window_size of the data loader num_batch_per_epoch (int): how many batches are trained at each iteration + sample_strategy(str): how samples are distributed. if it is length_uniform, then every single data point + has the same probability to be sampled, in which case longer sequence will occupy more + samples. If it is seq_uniform, then every sequence has the same probability to be + sampled regardless of their length backcast (bool): if back_cast module is activate (in which case window size is a multiple of n_prediction_steps) backcast_period (int): activate if backcast is activate, the window size is then computed with @@ -524,6 +587,7 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, cs = ConfigurationSpace() add_hyperparameter(cs, batch_size, UniformIntegerHyperparameter) add_hyperparameter(cs, num_batch_per_epoch, UniformIntegerHyperparameter) + add_hyperparameter(cs, sample_strategy, CategoricalHyperparameter) window_size = get_hyperparameter(window_size, UniformIntegerHyperparameter) backcast = get_hyperparameter(backcast, CategoricalHyperparameter) diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 18bd5b691..080dcd247 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -84,6 +84,8 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, float: training loss Dict[str, float]: scores for each desired metric """ + import time + time_start = time.time() loss_sum = 0.0 N = 0 self.model.train() @@ -114,6 +116,8 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, self._scheduler_step(step_interval=StepIntervalUnit.epoch, loss=loss_sum / N) + print(f'time used for trainging epoch {epoch}: {time.time() - time_start}') + print(f'Loss for epoch {epoch}: {loss_sum/N}') if self.metrics_during_training: return loss_sum / N, self.compute_metrics(outputs_data, targets_data) else: diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 993254b78..f8b703a76 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -240,8 +240,8 @@ def _get_hyperparameter_search_space(self, network_encoder_hp = cs.get_hyperparameter('network_backbone:__choice__') - if 'MLPEncoder' or 'TCNEncoder' or 'InceptionTimeEncoder' in network_encoder_hp.choices: - forbidden = ['MLPEncoder', 'TCNEncoder', 'InceptionTimeEncoder'] + if 'MLPEncoder' in network_encoder_hp.choices: + forbidden = ['MLPEncoder'] forbidden_deepAREncoder = [forbid for forbid in forbidden if forbid in network_encoder_hp.choices] for hp_ar in hp_auto_regressive: forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, True) @@ -287,10 +287,12 @@ def _get_hyperparameter_search_space(self, ForbiddenEqualsClause(network_encoder_hp, 'NBEATSEncoder'), forbidden_loss_non_regression) ) + """ forbidden_NBEATS.append(ForbiddenAndConjunction( ForbiddenEqualsClause(network_encoder_hp, 
'NBEATSEncoder'), forbidden_backcast_false) ) + """ """ if 'NBEATSDecoder' in network_decoder_hp.choices: forbidden_NBEATS.append(ForbiddenAndConjunction( From 0a8b5f2da0b11df2f02dd5e018dd65f857af8e85 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 12 Jan 2022 17:01:08 +0100 Subject: [PATCH 116/347] maint --- .../forecasting_backbone/TransformerEncoder.py | 2 +- .../trainer/forecasting_trainer/forecasting_base_trainer.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py index 6094f0320..aebd32135 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py @@ -150,7 +150,7 @@ def get_hyperparameter_search_space( default_value=3), d_model_log: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='d_model_log', - value_range=(4, 10), + value_range=(4, 9), default_value=5), d_feed_forward_log: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='d_feed_forward_log', diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 080dcd247..18bd5b691 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -84,8 +84,6 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, float: training loss Dict[str, float]: scores for each desired metric """ - import time - time_start = time.time() loss_sum = 0.0 N = 0 self.model.train() @@ -116,8 +114,6 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, self._scheduler_step(step_interval=StepIntervalUnit.epoch, loss=loss_sum / N) - print(f'time used for trainging epoch {epoch}: {time.time() - time_start}') - print(f'Loss for epoch {epoch}: {loss_sum/N}') if self.metrics_during_training: return loss_sum / N, self.compute_metrics(outputs_data, targets_data) else: From 7a7b68d49029c01e1a6862bb69de8414fd81bd15 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 13 Jan 2022 19:02:11 +0100 Subject: [PATCH 117/347] maint --- .../network_backbone/forecasting_backbone/TCNEncoder.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py index 94e761ddb..0f17a4210 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py @@ -92,7 +92,7 @@ def __init__(self, num_inputs: int, num_channels: List[int], kernel_size: List[i stride=stride, dilation=dilation_size, padding=(kernel_size[i] - 1) * dilation_size, - dropout=dropout)] + dropout=dropout[i])] # receptive_field_block = 1 + (kernel_size - 1) * dilation_size * \ # (int(np.prod(stride_values[:-2])) * (1 + stride_values[-2])) receptive_field_block = 1 + 2 * (kernel_size[i] - 1) * dilation_size # stride = 1, we ignore stide 
computation @@ -120,13 +120,15 @@ class TCNEncoder(BaseForecastingEncoder): def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: num_channels = [self.config["num_filters_1"]] kernel_size = [self.config["kernel_size_1"]] + dropout = [self.config[f"dropout_1"] if self.config["use_dropout"] else 0.0] for i in range(2, self.config["num_blocks"] + 1): num_channels.append(self.config[f"num_filters_{i}"]) kernel_size.append(self.config[f"kernel_size_{i}"]) + dropout.append(self.config[f"dropout_{i}"] if self.config["use_dropout"] else 0.0) encoder = _TemporalConvNet(input_shape[-1], num_channels, kernel_size=kernel_size, - dropout=self.config[f"dropout_{i}"] if self.config["use_dropout"] else 0.0 + dropout=dropout ) self._receptive_field = encoder.receptive_field return encoder From eeeda0255cee7d15521c166557dfce9b82e34243 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 13 Jan 2022 21:53:23 +0100 Subject: [PATCH 118/347] maint --- .../network_backbone/forecasting_backbone/TCNEncoder.py | 7 ++++--- .../data_loader/time_series_forecasting_data_loader.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py index 0f17a4210..ba6c39aeb 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py @@ -95,7 +95,8 @@ def __init__(self, num_inputs: int, num_channels: List[int], kernel_size: List[i dropout=dropout[i])] # receptive_field_block = 1 + (kernel_size - 1) * dilation_size * \ # (int(np.prod(stride_values[:-2])) * (1 + stride_values[-2])) - receptive_field_block = 1 + 2 * (kernel_size[i] - 1) * dilation_size # stride = 1, we ignore stide computation + # stride = 1, we ignore stride computation + receptive_field_block = 1 + 2 * (kernel_size[i] - 1) * dilation_size receptive_field += receptive_field_block self.receptive_field = receptive_field self.network = nn.Sequential(*layers) @@ -159,8 +160,8 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", - value_range=(1, 10), - default_value=5), + value_range=(1, 6), + default_value=3), num_filters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_filters", value_range=(4, 64), default_value=32, diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 10d86f54f..0a1c62de4 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -169,7 +169,7 @@ def __init__(self, self.seq_intervals_decimal = torch.from_numpy(np.stack(seq_intervals_decimal)) self.seq_intervals_int = seq_intervals_int - self.num_expected_ins_decimal = torch.from_numpy(num_expected_ins_decimal) + self.num_expected_ins_decimal = torch.from_numpy(num_expected_ins_decimal) + 1e-8 def __iter__(self): if self.iter_all_seqs: From 738f18d7ec9d1cb4ff7d6ca6d15f47ed81ed3e55 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 
13 Jan 2022 21:53:51 +0100 Subject: [PATCH 119/347] reduced search space for Transformer --- .../TransformerEncoder.py | 72 ++++------------ .../forecasting_decoder/TransformerDecoder.py | 85 +++++-------------- .../forecasting_backbone/transformer_util.py | 10 +-- 3 files changed, 45 insertions(+), 122 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py index aebd32135..8c6de0668 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py @@ -28,6 +28,7 @@ class _TransformerEncoder(EncoderNetwork): def __init__(self, in_features: int, d_model: int, + num_layers: int, transformer_encoder_layers: [nn.Module], use_positional_encoder: bool, use_layer_norm_output: bool, @@ -43,11 +44,14 @@ def __init__(self, self.input_layer.append(PositionalEncoding(d_model, dropout_pe)) self.input_layer = nn.Sequential(*self.input_layer) - self.transformer_encoder_layers = nn.ModuleList(transformer_encoder_layers) - self.use_layer_norm_output = use_layer_norm_output if use_layer_norm_output: - self.norm_output = nn.LayerNorm(d_model, eps=layer_norm_eps_output) + norm = nn.LayerNorm(d_model, eps=layer_norm_eps_output) + else: + norm = None + self.transformer_encoder_layers = nn.TransformerEncoder(encoder_layer=transformer_encoder_layers, + num_layers=num_layers, + norm=norm) def forward(self, x: torch.Tensor, @@ -55,11 +59,7 @@ def forward(self, mask: Optional[torch.Tensor] = None, src_key_padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor: x = self.input_layer(x) - - for encoder_layer in self.transformer_encoder_layers: - x = encoder_layer(x, mask, src_key_padding_mask) - if self.use_layer_norm_output: - x = self.norm_output(x) + x = self.transformer_encoder_layers(x) if output_seq: return x else: @@ -78,14 +78,11 @@ def __init__(self, **kwargs: Dict): def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: d_model = 2 ** self.config['d_model_log'] - transformer_encoder_layers = [] - for layer_id in range(1, self.config['num_layers'] + 1): - new_layer = build_transformer_layers(d_model=d_model, config=self.config, - layer_id=layer_id, layer_type='encoder') - transformer_encoder_layers.append(new_layer) + transformer_encoder_layers = build_transformer_layers(d_model=d_model, config=self.config, layer_type='encoder') encoder = _TransformerEncoder(in_features=input_shape[-1], d_model=d_model, + num_layers=self.config['num_layers'], transformer_encoder_layers=transformer_encoder_layers, use_positional_encoder=self.config['use_positional_encoder'], use_layer_norm_output=self.config['use_layer_norm_output'], @@ -220,48 +217,13 @@ def get_hyperparameter_search_space( CS.EqualsCondition(dropout_pe, use_positional_encoder, True) )) - for i in range(1, int(max_transformer_layers) + 1): - n_head_log_search_space = HyperparameterSearchSpace(hyperparameter='num_head_log_%d' % i, - value_range=n_head_log.value_range, - default_value=n_head_log.default_value, - log=n_head_log.log) - d_feed_forward_log_search_space = HyperparameterSearchSpace(hyperparameter='d_feed_forward_log_%d' % i, - value_range=d_feed_forward_log.value_range, - default_value=d_feed_forward_log.default_value) - - layer_norm_eps_search_space = HyperparameterSearchSpace(hyperparameter='layer_norm_eps_%d' % i, - 
value_range=layer_norm_eps.value_range, - default_value=layer_norm_eps.default_value, - log=layer_norm_eps.log) - - n_head_log_hp = get_hyperparameter(n_head_log_search_space, UniformIntegerHyperparameter) - d_feed_forward_log_hp = get_hyperparameter(d_feed_forward_log_search_space, UniformIntegerHyperparameter) - layer_norm_eps_hp = get_hyperparameter(layer_norm_eps_search_space, UniformFloatHyperparameter) - - layers_dims = [n_head_log_hp, d_feed_forward_log_hp, layer_norm_eps_hp] - - cs.add_hyperparameters(layers_dims) - - if i > int(min_transformer_layers): - # The units of layer i should only exist - # if there are at least i layers - cs.add_conditions([ - CS.GreaterThanCondition(hp_layer, num_layers, i - 1) for hp_layer in layers_dims - ]) - dropout_search_space = HyperparameterSearchSpace(hyperparameter='dropout_%d' % i, - value_range=dropout.value_range, - default_value=dropout.default_value, - log=dropout.log) - dropout_hp = get_hyperparameter(dropout_search_space, UniformFloatHyperparameter) - cs.add_hyperparameter(dropout_hp) - - dropout_condition_1 = CS.EqualsCondition(dropout_hp, use_dropout, True) - - if i > int(min_transformer_layers): - dropout_condition_2 = CS.GreaterThanCondition(dropout_hp, num_layers, i - 1) - cs.add_condition(CS.AndConjunction(dropout_condition_1, dropout_condition_2)) - else: - cs.add_condition(dropout_condition_1) + add_hyperparameter(cs, n_head_log, UniformIntegerHyperparameter) + add_hyperparameter(cs, d_feed_forward_log, UniformIntegerHyperparameter) + add_hyperparameter(cs, layer_norm_eps, UniformFloatHyperparameter) + + dropout = get_hyperparameter(dropout, UniformFloatHyperparameter) + cs.add_hyperparameter(dropout) + cs.add_condition(CS.EqualsCondition(dropout, use_dropout, True)) use_layer_norm_output = get_hyperparameter(use_layer_norm_output, CategoricalHyperparameter) layer_norm_eps_output = HyperparameterSearchSpace(hyperparameter='layer_norm_eps_output', diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index e35164b01..d6d0a501f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -32,7 +32,8 @@ class _TransformerDecoder(RecurrentDecoderNetwork): def __init__(self, in_features: int, d_model: int, - transformer_decoder_layers: [nn.Module], + num_layers: int, + transformer_decoder_layers: nn.Module, use_positional_decoder: bool, use_layer_norm_output: bool, dropout_pd: float = 0.0, @@ -47,13 +48,15 @@ def __init__(self, self.input_layer.append(PositionalEncoding(d_model, dropout_pd)) self.input_layer = nn.Sequential(*self.input_layer) - self.transformer_decoder_layers = nn.ModuleList(transformer_decoder_layers) - - nn.TransformerDecoder - self.use_layer_norm_output = use_layer_norm_output + if use_layer_norm_output: - self.norm_output = nn.LayerNorm(d_model, eps=layer_norm_eps_output) + norm = nn.LayerNorm(d_model, eps=layer_norm_eps_output) + else: + norm = None + self.transformer_decoder_layers = nn.TransformerDecoder(decoder_layer=transformer_decoder_layers, + num_layers=num_layers, + norm=norm) def forward(self, x_future: torch.Tensor, features_latent: torch.Tensor, tgt_mask: Optional[torch.Tensor] = None, @@ -61,15 +64,10 @@ def 
forward(self, x_future: torch.Tensor, features_latent: torch.Tensor, tgt_key_padding_mask: Optional[torch.Tensor] = None, memory_key_padding_mask: Optional[torch.Tensor] = None): output = self.input_layer(x_future) - for decoder_layer in self.transformer_decoder_layers: - output = decoder_layer(output, features_latent, tgt_mask=tgt_mask, - memory_mask=memory_mask, - tgt_key_padding_mask=tgt_key_padding_mask, - memory_key_padding_mask=memory_key_padding_mask) - - if self.use_layer_norm_output: - output = self.norm_output(output) - + output = self.transformer_decoder_layers(output, features_latent, tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) return output @@ -86,20 +84,18 @@ def _build_decoder(self, n_prediction_heads: int, dataset_properties: Dict) -> nn.Module: d_model = 2 ** self.transformer_encoder_kwargs['d_model_log'] - transformer_decoder_layers = [] - for layer_id in range(1, self.config['num_layers'] + 1): - new_layer = build_transformer_layers(d_model=d_model, config=self.config, - layer_id=layer_id, layer_type='decoder') - transformer_decoder_layers.append(new_layer) + transformer_decoder_layers = build_transformer_layers(d_model=d_model, config=self.config, layer_type='decoder') decoder = _TransformerDecoder(in_features=dataset_properties['output_shape'][-1], d_model=d_model, + num_layers=self.config['num_layers'], transformer_decoder_layers=transformer_decoder_layers, use_positional_decoder=self.config['use_positional_decoder'], use_layer_norm_output=self.config['use_layer_norm_output'], dropout_pd=self.config.get('dropout_positional_decoder', 0.0), layer_norm_eps_output=self.config.get('layer_norm_eps_output', None), lagged_value=self.lagged_value) + return decoder, d_model @property @@ -222,48 +218,13 @@ def get_hyperparameter_search_space( CS.EqualsCondition(dropout_pd, use_positional_decoder, True) )) - for i in range(1, int(max_transformer_layers) + 1): - n_head_log_search_space = HyperparameterSearchSpace(hyperparameter='num_head_log_%d' % i, - value_range=n_head_log.value_range, - default_value=n_head_log.default_value, - log=n_head_log.log) - d_feed_forward_log_search_space = HyperparameterSearchSpace(hyperparameter='d_feed_forward_log_%d' % i, - value_range=d_feed_forward_log.value_range, - default_value=d_feed_forward_log.default_value) - - layer_norm_eps_search_space = HyperparameterSearchSpace(hyperparameter='layer_norm_eps_%d' % i, - value_range=layer_norm_eps.value_range, - default_value=layer_norm_eps.default_value, - log=layer_norm_eps.log) - - n_head_log_hp = get_hyperparameter(n_head_log_search_space, UniformIntegerHyperparameter) - d_feed_forward_log_hp = get_hyperparameter(d_feed_forward_log_search_space, UniformIntegerHyperparameter) - layer_norm_eps_search_hp = get_hyperparameter(layer_norm_eps_search_space, UniformFloatHyperparameter) - - layers_dims = [n_head_log_hp, d_feed_forward_log_hp, layer_norm_eps_search_hp] - - cs.add_hyperparameters(layers_dims) - - if i > int(min_transformer_layers): - # The units of layer i should only exist - # if there are at least i layers - cs.add_conditions([ - CS.GreaterThanCondition(hp_layer, num_layers, i - 1) for hp_layer in layers_dims - ]) - dropout_search_space = HyperparameterSearchSpace(hyperparameter='dropout_%d' % i, - value_range=dropout.value_range, - default_value=dropout.default_value, - log=dropout.log) - dropout_hp = get_hyperparameter(dropout_search_space, UniformFloatHyperparameter) - 
cs.add_hyperparameter(dropout_hp) - - dropout_condition_1 = CS.EqualsCondition(dropout_hp, use_dropout, True) - - if i > int(min_transformer_layers): - dropout_condition_2 = CS.GreaterThanCondition(dropout_hp, num_layers, i - 1) - cs.add_condition(CS.AndConjunction(dropout_condition_1, dropout_condition_2)) - else: - cs.add_condition(dropout_condition_1) + add_hyperparameter(cs, n_head_log, UniformIntegerHyperparameter) + add_hyperparameter(cs, d_feed_forward_log, UniformIntegerHyperparameter) + add_hyperparameter(cs, layer_norm_eps, UniformFloatHyperparameter) + + dropout = get_hyperparameter(dropout, UniformFloatHyperparameter) + cs.add_hyperparameter(dropout) + cs.add_condition(CS.EqualsCondition(dropout, use_dropout, True)) use_layer_norm_output = get_hyperparameter(use_layer_norm_output, CategoricalHyperparameter) layer_norm_eps_output = HyperparameterSearchSpace(hyperparameter='layer_norm_eps_output', diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py index fad9dfec4..f1f972919 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py @@ -4,12 +4,12 @@ import math -def build_transformer_layers(d_model: int, config: Dict[str, Any], layer_id: int, layer_type='encoder'): - nhead = 2 ** config['num_head_log_%d' % layer_id] - dim_feedforward = 2 ** config['d_feed_forward_log_%d' % layer_id] - dropout = config.get('dropout_%d' % layer_id, 0.0) +def build_transformer_layers(d_model: int, config: Dict[str, Any], layer_type='encoder'): + nhead = 2 ** config['n_head_log'] + dim_feedforward = 2 ** config['d_feed_forward_log'] + dropout = config.get('dropout') activation = config['activation'] - layer_norm_eps = config['layer_norm_eps_%d' % layer_id] + layer_norm_eps = config['layer_norm_eps'] if layer_type == 'encoder': return nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, activation=activation, From 353b5c589b9ac0929d135cb8fe92900069c72a5a Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 13 Jan 2022 21:57:33 +0100 Subject: [PATCH 120/347] reduced init design --- .../configs/forecasting_init_cfgs.json | 41 ++++--------------- 1 file changed, 8 insertions(+), 33 deletions(-) diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index cf6cda1e0..1d7ec95ec 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -120,10 +120,10 @@ "network_backbone:TransformerEncoder:use_dropout": true, "network_backbone:TransformerEncoder:use_positional_encoder": true, "network_backbone:TransformerEncoder:dropout_positional_encoder": 0.1, - "network_backbone:TransformerEncoder:d_feed_forward_log_1": 7, - "network_backbone:TransformerEncoder:num_head_log_1": 3, - "network_backbone:TransformerEncoder:layer_norm_eps_1": 1e-05, - "network_backbone:TransformerEncoder:dropout_1": 0.1, + "network_backbone:TransformerEncoder:d_feed_forward_log": 7, + "network_backbone:TransformerEncoder:n_head_log": 3, + "network_backbone:TransformerEncoder:layer_norm_eps": 1e-05, + "network_backbone:TransformerEncoder:dropout": 0.1, "network_backbone:TransformerEncoder:use_layer_norm_output": true, "network_backbone:TransformerEncoder:layer_norm_eps_output": 
1e-05, "network_backbone:TransformerDecoder:activation": "gelu", @@ -131,41 +131,16 @@ "network_backbone:TransformerDecoder:use_dropout": true, "network_backbone:TransformerDecoder:use_positional_decoder": true, "network_backbone:TransformerDecoder:dropout_positional_decoder": 0.1, - "network_backbone:TransformerDecoder:d_feed_forward_log_1": 7, - "network_backbone:TransformerDecoder:num_head_log_1": 3, - "network_backbone:TransformerDecoder:layer_norm_eps_1": 1e-05, - "network_backbone:TransformerDecoder:dropout_1": 0.1, + "network_backbone:TransformerDecoder:d_feed_forward_log": 7, + "network_backbone:TransformerDecoder:n_head_log": 3, + "network_backbone:TransformerDecoder:layer_norm_eps": 1e-05, + "network_backbone:TransformerDecoder:dropout": 0.1, "network_backbone:TransformerDecoder:use_layer_norm_output": true, "network_backbone:TransformerDecoder:layer_norm_eps_output": 1e-05, "network:forecast_strategy": "sample", "network:aggregation": "median", "network:num_samples": 100 }, - "Seq2Seq-Transformer2MLP": { - "loss:__choice__": "DistributionLoss", - "network:net_out_type": "distribution", - "loss:DistributionLoss:dist_cls": "studentT", - "network_backbone:__choice__": "TransformerEncoder", - "network_backbone:TransformerEncoder:d_model_log": 5, - "network_backbone:TransformerEncoder:activation": "gelu", - "network_backbone:TransformerEncoder:num_layers": 1, - "network_backbone:TransformerEncoder:decoder_type": "MLPDecoder", - "network_backbone:TransformerEncoder:use_dropout": true, - "network_backbone:TransformerEncoder:use_positional_encoder": true, - "network_backbone:TransformerEncoder:dropout_positional_encoder": 0.1, - "network_backbone:TransformerEncoder:d_feed_forward_log_1": 7, - "network_backbone:TransformerEncoder:num_head_log_1": 3, - "network_backbone:TransformerEncoder:layer_norm_eps_1": 1e-05, - "network_backbone:TransformerEncoder:dropout_1": 0.1, - "network_backbone:TransformerEncoder:use_layer_norm_output": true, - "network_backbone:TransformerEncoder:layer_norm_eps_output": 1e-05, - "network_backbone:MLPDecoder:num_layers": 0, - "network_backbone:MLPDecoder:auto_regressive": false, - "network_backbone:MLPDecoder:units_final_layer": 30, - "network:forecast_strategy": "sample", - "network:aggregation": "median", - "network:num_samples": 100 - }, "NBEATS-I": { "target_scaler:__choice__": "TargetNoScaler", "data_loader:backcast": true, From e8db57b15f9bd8d881b11b7b877c1a1b8f7e715f Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 13 Jan 2022 22:32:53 +0100 Subject: [PATCH 121/347] maint --- .../network_backbone/forecasting_backbone/transformer_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py index f1f972919..538f5ed9c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py @@ -7,7 +7,7 @@ def build_transformer_layers(d_model: int, config: Dict[str, Any], layer_type='encoder'): nhead = 2 ** config['n_head_log'] dim_feedforward = 2 ** config['d_feed_forward_log'] - dropout = config.get('dropout') + dropout = config.get('dropout', 0.0) activation = config['activation'] layer_norm_eps = config['layer_norm_eps'] if layer_type == 'encoder': From 8dec08cc3b56597db1ada032309d6648c7e2da71 Mon Sep 17 00:00:00 
2001 From: dengdifan Date: Fri, 14 Jan 2022 19:42:33 +0100 Subject: [PATCH 122/347] maint --- .../configs/forecasting_init_cfgs.json | 5 ++-- .../forecasting_network_head/NBEATS_head.py | 12 ++++----- .../time_series_forecasting_data_loader.py | 25 +++++++++++-------- .../components/training/metrics/metrics.py | 8 +++--- 4 files changed, 28 insertions(+), 22 deletions(-) diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index 1d7ec95ec..ef5f9a0c1 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -180,14 +180,13 @@ "network_backbone:NBEATSDecoder:normalization": "NoNorm", "network_backbone:NBEATSDecoder:activation": "relu", "network_backbone:NBEATSDecoder:n_beats_type": "G", - "network_backbone:NBEATSDecoder:use_dropout_G": true, + "network_backbone:NBEATSDecoder:use_dropout_G": false, "network_backbone:NBEATSDecoder:num_stacks_G": 30, "network_backbone:NBEATSDecoder:num_blocks_G": 1, "network_backbone:NBEATSDecoder:num_layers_G": 4, "network_backbone:NBEATSDecoder:width_G": 512, "network_backbone:NBEATSDecoder:weight_sharing_G": false, - "network_backbone:NBEATSDecoder:expansion_coefficient_length_G": 32, - "network_backbone:NBEATSDecoder:dropout_G": 0.1 + "network_backbone:NBEATSDecoder:expansion_coefficient_length_G": 32 } } } \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py index 3e07ec5db..b4b9214ab 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py @@ -34,15 +34,15 @@ def linspace(backcast_length: int, forecast_length: int, centered: bool = False) def get_generic_heads(block_width: int, thetas_dim: int, forecast_length: int, backcast_length: int): - backcast_head = nn.Sequential(nn.Linear(block_width, thetas_dim), - nn.Linear(thetas_dim, backcast_length)) - forecast_head = nn.Sequential(nn.Linear(block_width, thetas_dim), - nn.Linear(thetas_dim, forecast_length)) + backcast_head = nn.Sequential(nn.Linear(block_width, thetas_dim, bias=False), + nn.Linear(thetas_dim, backcast_length, bias=False)) + forecast_head = nn.Sequential(nn.Linear(block_width, thetas_dim, bias=False), + nn.Linear(thetas_dim, forecast_length, bias=False)) return backcast_head, forecast_head def get_trend_heads(block_width: int, thetas_dim: int, forecast_length: int, backcast_length: int): - base_layer = nn.Linear(block_width, thetas_dim) + base_layer = nn.Linear(block_width, thetas_dim, bias=False) backcast_linspace, forecast_linspace = linspace(backcast_length, forecast_length, centered=True) norm = np.sqrt(forecast_length / thetas_dim) # ensure range of predictions is comparable to input @@ -60,7 +60,7 @@ def get_trend_heads(block_width: int, thetas_dim: int, forecast_length: int, bac def get_seasonality_heads(block_width: int, thetas_dim: int, forecast_length: int, backcast_length: int): - base_layer = nn.Linear(block_width, forecast_length) + base_layer = nn.Linear(block_width, forecast_length, bias=False) backcast_linspace, forecast_linspace = linspace(backcast_length, forecast_length, centered=False) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py 
b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 0a1c62de4..f8b0fe8c0 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -153,12 +153,20 @@ def __init__(self, idx_start = idx_tracker num_interval = int(np.ceil(num_instances)) - interval = np.linspace(idx_start, idx_end, num_interval + 1, endpoint=True, dtype=np.int) - - num_expected_ins_decimal.append(np.modf(num_instances)[0]) - seq_intervals_decimal.append(interval[:2]) - - seq_intervals_int.append(interval[1:]) + if num_interval > idx_end - idx_start: + num_expected_ins_decimal.append(np.modf(num_instances)[0]) + interval = np.linspace(idx_start, idx_end, 2, endpoint=True, dtype=np.int) + # we consider + num_expected_ins_decimal.append(num_instances) + seq_intervals_decimal.append(interval[:2]) + seq_intervals_int.append(interval[1:]) + else: + interval = np.linspace(idx_start, idx_end, num_interval + 1, endpoint=True, dtype=np.int) + + num_expected_ins_decimal.append(np.modf(num_instances)[0]) + seq_intervals_decimal.append(interval[:2]) + + seq_intervals_int.append(interval[1:]) idx_tracker += seq_length num_expected_ins_decimal = np.stack(num_expected_ins_decimal) @@ -199,12 +207,11 @@ def __iter__(self): samples_seq_remain = torch.floor(samples_shift + seq_interval[:, 0]).int() samples[-num_samples_remain:] = samples_seq_remain - return (samples[i] for i in torch.randperm(self.num_instances, generator=self.generator)) + yield from (samples[i] for i in torch.randperm(self.num_instances, generator=self.generator)) def __len__(self): return self.num_instances - class ExpandTransformTimeSeries(object): """Expand Dimensionality so tabular transformations see a 2d Array, unlike the ExpandTransform defined under tabular dataset, the dimension is expanded @@ -362,8 +369,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: num_instances_dataset = np.size(train_split) num_instances_train = self.num_batches_per_epoch * self.batch_size - if num_instances_train > num_instances_dataset: - num_instances_train = num_instances_dataset # get the length of each sequence of training data (after split) # as we already know that the elements in 'train_split' increases consecutively with a certain number of diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index 7780c81f5..fe43e033b 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -69,16 +69,18 @@ def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> f Returns: mase_coefficient: inverse of mase_denominator """ - if sp > len(past_target): + if sp >= len(past_target): # in this case, we simply consider the mean value of the entire sequence # TODO condsider if there is a better way of handling this mase_denominator = forecasting_metrics.mean_absolute_error(past_target, np.zeros_like(past_target), multioutput="raw_values") else: + mase_denominator = forecasting_metrics.mean_absolute_error(past_target[sp:], - past_target[:-sp], - multioutput="raw_values") + past_target[:-sp], + multioutput="raw_values") + return 1.0 / np.maximum(mase_denominator, forecasting_metrics._functions.EPS) From 7008cec79552de254cd8b1b1768c316c5c6e7403 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 
14 Jan 2022 21:39:01 +0100 Subject: [PATCH 123/347] maint --- autoPyTorch/datasets/time_series_dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 5fb43fc93..76c002c2d 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -202,6 +202,9 @@ def __init__(self, if isinstance(freq, list): tmp_freq = min([freq_value for freq_value in freq if freq_value > n_prediction_steps]) freq_value = tmp_freq + if isinstance(freq_value, list): + tmp_freq = min([freq_value_item for freq_value_item in freq_value if freq_value_item > n_prediction_steps]) + freq_value = tmp_freq seasonality = SEASONALITY_MAP.get(freq, 1) if isinstance(seasonality, list): From c7d401e6833e3411f4ab81f9f80ff2eae8fe13c6 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 14 Jan 2022 21:49:03 +0100 Subject: [PATCH 124/347] maint --- .../data_loader/time_series_forecasting_data_loader.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index f8b0fe8c0..49e50e517 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -154,7 +154,6 @@ def __init__(self, num_interval = int(np.ceil(num_instances)) if num_interval > idx_end - idx_start: - num_expected_ins_decimal.append(np.modf(num_instances)[0]) interval = np.linspace(idx_start, idx_end, 2, endpoint=True, dtype=np.int) # we consider num_expected_ins_decimal.append(num_instances) @@ -199,7 +198,11 @@ def __iter__(self): idx_samples_start = idx_samples_end num_samples_remain = self.num_instances - idx_samples_end if num_samples_remain > 0: - samples_idx = torch.multinomial(self.num_expected_ins_decimal, num_samples_remain) + if num_samples_remain > self.num_expected_ins_decimal[-1]: + replacement = True + else: + replacement = False + samples_idx = torch.multinomial(self.num_expected_ins_decimal, num_samples_remain, replacement) seq_interval = self.seq_intervals_decimal[samples_idx] samples_shift = torch.rand(num_samples_remain, generator=self.generator) From df15a2b42fa05fd7de758b1b8504c97852d6037f Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 16 Jan 2022 18:03:04 +0100 Subject: [PATCH 125/347] faster forecasting --- autoPyTorch/datasets/time_series_dataset.py | 30 ++++++++- autoPyTorch/evaluation/abstract_evaluator.py | 8 +-- ...time_series_forecasting_train_evaluator.py | 26 ++++---- .../time_series_forecasting_data_loader.py | 65 ++++++++++++++----- 4 files changed, 95 insertions(+), 34 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 76c002c2d..d7a315ceb 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union, cast import uuid import bisect +import copy import numpy as np @@ -93,7 +94,7 @@ def __getitem__(self, index: int, train: bool = True) \ features from past, targets from past and future """ if index < 0: - index = self.__len__() + 1 - index + index = self.__len__() + index if hasattr(self.X, 'loc'): X = self.X.iloc[:index + 1] @@ -147,6 +148,21 @@ def 
update_transform(self, transform: Optional[torchvision.transforms.Compose], self.val_transform = transform return self + def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": + if index < 0: + index = self.__len__() + index + if index == self.__len__() - 1: + return copy.copy(self) + else: + return TimeSeriesSequence(self.X[:index + 1], + self.Y[:index + 1 + self.n_prediction_steps], + train_transforms=self.train_transform, + val_transforms=self.val_transform, + n_prediction_steps=self.n_prediction_steps, + sp=self.sp) + + + class TimeSeriesForecastingDataset(BaseDataset, ConcatDataset): datasets: List[TimeSeriesSequence] @@ -343,6 +359,18 @@ def __getitem__(self, idx, train=True): sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] return self.datasets[dataset_idx].__getitem__(sample_idx, train) + def get_validation_set(self, idx): + if idx < 0: + if -idx > len(self): + raise ValueError("absolute value of index should not exceed dataset length") + idx = len(self) + idx + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return self.datasets[dataset_idx].get_val_seq_set(sample_idx) + def make_sequences_datasets(self, X: np.ndarray, Y: np.ndarray, diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 9a598626a..2c4b361e4 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -329,13 +329,13 @@ def fit(self, X: Dict[str, Any], y: Any, def predict_proba(self, X: Union[np.ndarray, pd.DataFrame], batch_size: int = 1000) -> np.array: - new_X = X[-self.n_prediction_steps:] - return super(DummyTimeSeriesForecastingPipeline, self).predict_proba(new_X) + new_X = [x.X[-1] for x in X] + return np.tile(new_X, (1, self.n_prediction_steps)).astype(np.float32) def predict(self, X: Union[np.ndarray, pd.DataFrame], batch_size: int = 1000) -> np.array: - new_X = X[-self.n_prediction_steps:] - return super(DummyTimeSeriesForecastingPipeline, self).predict(new_X).astype(np.float32) + new_X = [x.X[-1]for x in X] + return np.tile(new_X, (1, self.n_prediction_steps)).astype(np.float32) @staticmethod def get_default_pipeline_options() -> Dict[str, Any]: diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 6cc1f6f2e..739d32239 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -306,28 +306,30 @@ def _predict(self, pipeline: BaseEstimator, test_indices: Union[np.ndarray, List], ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: # TODO consider multile outputs - opt_pred = np.ones([len(test_indices), self.n_prediction_steps, self.num_targets]) - for seq_idx, test_idx in enumerate(test_indices): - opt_pred[seq_idx] = self.predict_function(self.datamanager[test_idx][0]['past_target'], pipeline) + val_sets = [] + for test_idx in test_indices: + val_sets.append(self.datamanager.get_validation_set(test_idx)) + opt_pred = self.predict_function(val_sets, pipeline) opt_pred = opt_pred.reshape(-1, self.num_targets) #TODO we consider X_valid and X_test as a multiple sequences??? 
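# A standalone sketch (not part of this patch) of the lookup that
# get_validation_set / get_val_seq_set perform for the evaluator above:
# a flat validation index over the concatenated dataset is mapped back to
# its owning sequence with a bisect over the cumulative sequence lengths,
# mirroring the ConcatDataset convention the forecasting dataset inherits.
# `cumulative_sizes` here is a toy stand-in for the real dataset attribute.
import bisect

cumulative_sizes = [10, 25, 40]  # e.g. three sequences of lengths 10, 15 and 15

def locate(idx: int):
    # map a flat sample index to (sequence index, position within that sequence)
    dataset_idx = bisect.bisect_right(cumulative_sizes, idx)
    sample_idx = idx if dataset_idx == 0 else idx - cumulative_sizes[dataset_idx - 1]
    return dataset_idx, sample_idx

assert locate(3) == (0, 3)    # fourth step of the first sequence
assert locate(10) == (1, 0)   # first step of the second sequence
assert locate(39) == (2, 14)  # last step of the third sequence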
if self.X_valid is not None: - valid_pred = np.ones([len(test_indices), self.n_prediction_steps]) - for seq_idx, val_seq in enumerate(self.datamanager.datasets): - valid_pred[seq_idx] = self.predict_function(val_seq.X, pipeline).flatten() + valid_sets = [] + for val_seq in enumerate(self.datamanager.datasets): + valid_sets.append(val_seq.X_val) + valid_pred = self.predict_function(valid_sets, pipeline).flatten() - valid_pred = valid_pred.flatten() + valid_pred = valid_pred.squeeze(-1) else: valid_pred = None if self.X_test is not None: - test_pred = np.ones([len(test_indices), self.n_prediction_steps]) - for seq_idx, test_seq in enumerate(self.datamanager.datasets): - test_pred[seq_idx] = self.predict_function(test_seq.X, pipeline) - - test_pred = test_pred.flatten() + test_sets = [] + for test_seq in enumerate(self.datamanager.datasets): + test_sets.append(test_seq.X_test) + test_pred = self.predict_function(valid_sets, pipeline).flatten() + test_pred = test_pred.squeeze(-1) else: test_pred = None diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 49e50e517..b1f3bd11d 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -27,6 +27,17 @@ from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader +class TestSequenceDataset(TransformSubset): + def __init__(self, dataset: List[TimeSeriesSequence], train: bool = False) -> None: + self.dataset = dataset + self.indices = torch.arange(len(dataset)) + self.train = train + + def __getitem__(self, idx: int) -> np.ndarray: + # we only consider the entire sequence + seq = self.dataset[idx] + return seq.__getitem__(len(seq) - 1, self.train) + def pad_sequence_from_start(sequences: List[torch.Tensor], seq_minimal_length: int, batch_first=True, @@ -486,28 +497,48 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd """ # TODO more supported inputs if isinstance(X, (np.ndarray, torch.Tensor)): - dataset = TimeSeriesSequence( - X=X, Y=y, - # This dataset is used for loading test data in a batched format - train_transforms=self.test_transform, - val_transforms=self.test_transform, - n_prediction_steps=0, - ) - - elif isinstance(X, TimeSeriesSequence): - dataset = X - dataset.update_transform(self.test_transform, train=False) + if isinstance(X, torch.Tensor): + X = X.numpy() + if X.ndim == 1: + X = [X] + if isinstance(X, TimeSeriesSequence): + X.update_transform(self.test_transform, train=False) + dataset = [X] + elif isinstance(X, Sequence): + dataset = [] + if isinstance(X[0], TimeSeriesSequence): + for X_seq in X: + X_seq.update_transform(self.test_transform, train=False) + dataset.append(X_seq) + else: + if y is None: + for X_seq in X: + seq = TimeSeriesSequence( + X=X_seq, Y=y, + # This dataset is used for loading test data in a batched format + train_transforms=self.test_transform, + val_transforms=self.test_transform, + n_prediction_steps=0, + ) + dataset.append(seq) + else: + for X_seq, y_seq in zip(X, y): + seq = TimeSeriesSequence( + X=X_seq, Y=y_seq, + # This dataset is used for loading test data in a batched format + train_transforms=self.test_transform, + val_transforms=self.test_transform, + n_prediction_steps=0, + ) + dataset.append(seq) else: - raise 
ValueError(f"Unsupported type of input X: {type(X)}") - - # we only consider the last sequence as validation set - test_seq_indices = [len(dataset) - 1] + raise NotImplementedError(f"Unsupported type of input X: {type(X)}") - dataset_test = TransformSubset(dataset, indices=test_seq_indices, train=False) + dataset_test = TestSequenceDataset(dataset, train=False) return torch.utils.data.DataLoader( dataset_test, - batch_size=min(batch_size, len(dataset)), + batch_size=min(batch_size, len(dataset), self.batch_size), shuffle=False, collate_fn=partial(custom_collate_fn, x_collector=self.padding_collector), ) From ca6a47d5977afcf1b2db47e6aefef9ba361705b2 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 16 Jan 2022 19:32:12 +0100 Subject: [PATCH 126/347] maint --- autoPyTorch/datasets/time_series_dataset.py | 10 +++++++++- .../time_series_forecasting_train_evaluator.py | 14 ++++---------- .../components/training/metrics/metrics.py | 14 +++++++------- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index d7a315ceb..8507c0038 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -78,7 +78,7 @@ def __init__(self, self.val_transform = val_transforms self.sp = sp - self.mase_coefficient = compute_mase_coefficient(self.X, sp=self.sp) + self.mase_coefficient = compute_mase_coefficient(self.X, sp=self.sp, n_prediction_steps=n_prediction_steps) def __getitem__(self, index: int, train: bool = True) \ -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, torch.Tensor]]]: @@ -371,6 +371,14 @@ def get_validation_set(self, idx): sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] return self.datasets[dataset_idx].get_val_seq_set(sample_idx) + def get_time_series_seq(self, idx) -> TimeSeriesSequence: + if idx < 0: + if -idx > len(self): + raise ValueError("absolute value of index should not exceed dataset length") + idx = len(self) + idx + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + return self.datasets[dataset_idx] + def make_sequences_datasets(self, X: np.ndarray, Y: np.ndarray, diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 739d32239..d4df9d999 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -16,7 +16,7 @@ from smac.tae import StatusType from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric -from autoPyTorch.pipeline.components.training.metrics.metrics import MASE_LOSSES, compute_mase_coefficient +from autoPyTorch.pipeline.components.training.metrics.metrics import MASE_LOSSES from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.utils.common import subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -291,12 +291,7 @@ def generate_mase_coefficient_for_validation(self, test_split: Sequence) -> np.n mase_coefficient = np.ones([len(test_split), self.num_targets]) if any(mase_loss in self.additional_metrics for mase_loss in MASE_LOSSES) or self.metric in MASE_LOSSES: for seq_idx, test_idx in enumerate(test_split): - seq = self.datamanager[test_idx][0]['past_target'] - if seq.shape[-1] > 1: - seq = seq[self.datamanager.target_variables].squeeze() - else: - seq = seq.squeeze() - 
mase_coefficient[seq_idx] = compute_mase_coefficient(seq, self.seasonality) + mase_coefficient[seq_idx] = self.datamanager.get_time_series_seq(test_idx).mase_coefficient mase_coefficient = np.repeat(mase_coefficient, self.n_prediction_steps, axis=0) return mase_coefficient @@ -305,7 +300,6 @@ def _predict(self, pipeline: BaseEstimator, train_indices: Union[np.ndarray, List], test_indices: Union[np.ndarray, List], ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: - # TODO consider multile outputs val_sets = [] for test_idx in test_indices: val_sets.append(self.datamanager.get_validation_set(test_idx)) @@ -319,7 +313,7 @@ def _predict(self, pipeline: BaseEstimator, valid_sets.append(val_seq.X_val) valid_pred = self.predict_function(valid_sets, pipeline).flatten() - valid_pred = valid_pred.squeeze(-1) + valid_pred = valid_pred.reshape(-1, self.num_targets) else: valid_pred = None @@ -329,7 +323,7 @@ def _predict(self, pipeline: BaseEstimator, for test_seq in enumerate(self.datamanager.datasets): test_sets.append(test_seq.X_test) test_pred = self.predict_function(valid_sets, pipeline).flatten() - test_pred = test_pred.squeeze(-1) + test_pred = test_pred.reshape(-1, self.num_targets) else: test_pred = None diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index fe43e033b..c8e2caecd 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -10,8 +10,6 @@ from autoPyTorch.pipeline.components.training.metrics.base import make_metric - - # Standard regression scores mean_absolute_error = make_metric('mean_absolute_error', sklearn.metrics.mean_absolute_error, @@ -52,13 +50,14 @@ f1 = make_metric('f1', sklearn.metrics.f1_score) + # Standard Forecasting Scores # To avoid storing unnecessary scale values here, we scale all the values under # AutoPytorch.evaluation.time_series_forecasting_train_evaluator -def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> float: +def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int, n_prediction_steps: int) -> float: """ compute mase coefficient, then mase value is computed as mase_coefficient * mse_error, this function aims at reducing the memroy requirement @@ -76,10 +75,12 @@ def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> f np.zeros_like(past_target), multioutput="raw_values") else: - mase_denominator = forecasting_metrics.mean_absolute_error(past_target[sp:], - past_target[:-sp], - multioutput="raw_values") + past_target[:-sp], + multioutput="raw_values") + if mase_denominator == 0.0: + # they will not be counter when computing MASE + return np.zeros_like(mase_denominator) return 1.0 / np.maximum(mase_denominator, forecasting_metrics._functions.EPS) @@ -104,7 +105,6 @@ def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> f MASE_LOSSES = [mean_MASE_forecasting, median_MASE_forecasting] - mean_MAE_forecasting = make_metric('mean_MAE_forecasting', forecasting_metrics.mean_absolute_error, optimum=0, From 5c9b10c409b0b1b7f1df0bf43a3cc9d268f9f58e Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 16 Jan 2022 19:48:24 +0100 Subject: [PATCH 127/347] allow singel fidelity --- autoPyTorch/optimizer/smbo.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 
d3dca4243..5208db842 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -10,6 +10,7 @@ from smac.facade.smac_ac_facade import SMAC4AC from smac.intensification.hyperband import Hyperband +from smac.intensification.intensification import Intensifier from smac.runhistory.runhistory import RunHistory from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost from smac.scenario.scenario import Scenario @@ -38,15 +39,15 @@ def get_smac_object( - scenario_dict: Dict[str, Any], - seed: int, - ta: Callable, - ta_kwargs: Dict[str, Any], - n_jobs: int, - initial_budget: int, - max_budget: Union[int, float], - dask_client: Optional[dask.distributed.Client], - initial_configurations: Optional[List[Configuration]] = None, + scenario_dict: Dict[str, Any], + seed: int, + ta: Callable, + ta_kwargs: Dict[str, Any], + n_jobs: int, + initial_budget: int, + max_budget: Union[int, float], + dask_client: Optional[dask.distributed.Client], + initial_configurations: Optional[List[Configuration]] = None, ) -> SMAC4AC: """ This function returns an SMAC object that is gonna be used as @@ -67,7 +68,13 @@ def get_smac_object( (SMAC4AC): sequential model algorithm configuration object """ - intensifier = Hyperband + if initial_budget == max_budget: + intensifier = Intensifier + intensifier_kwargs = {'deterministic': True, } + else: + intensifier = Hyperband + intensifier_kwargs = {'initial_budget': initial_budget, 'max_budget': max_budget, + 'eta': 3, 'min_chall': 1, 'instance_order': 'shuffle_once'} rh2EPM = RunHistory2EPM4LogCost return SMAC4AC( @@ -79,8 +86,7 @@ def get_smac_object( initial_configurations=initial_configurations, run_id=seed, intensifier=intensifier, - intensifier_kwargs={'initial_budget': initial_budget, 'max_budget': max_budget, - 'eta': 3, 'min_chall': 1, 'instance_order': 'shuffle_once'}, + intensifier_kwargs=intensifier_kwargs, dask_client=dask_client, n_jobs=n_jobs, ) From 64580f744024cee351758835a2d3735cfc7db373 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 17 Jan 2022 12:38:19 +0100 Subject: [PATCH 128/347] maint --- autoPyTorch/configs/forecasting_init_cfgs.json | 2 +- .../components/setup/network/forecasting_network.py | 11 +++++++---- .../forecasting_backbone/MLPEncoder.py | 4 ++-- .../time_series_forecasting_data_loader.py | 12 ++++++------ 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index ef5f9a0c1..155bf6e14 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -2,7 +2,7 @@ "trainer": { "data_loader:batch_size": 32, "data_loader:backcast": false, - "data_loader:sample_strategy": "seq_uniform", + "data_loader:sample_strategy": "SeqUniform", "data_loader:num_batches_per_epoch": 50, "imputer:numerical_strategy": "median", "lr_scheduler:__choice__": "ReduceLROnPlateau", diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index dce317bc5..961c8b487 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -436,8 +436,9 @@ def forward(self, # Transformer's hidden states is of shape repeated_state = features_latent.repeat_interleave(repeats=self.num_samples, dim=0) - repeated_past_target = targets_past.repeat_interleave(repeats=self.num_samples, - dim=0).squeeze(1) + 
max_lag_seq_length = max(self.decoder.lagged_value) + 1 + repeated_past_target = targets_past[:, -max_lag_seq_length:].repeat_interleave(repeats=self.num_samples, + dim=0).squeeze(1) repeated_predicted_target = repeated_past_target[:, [-1]] repeated_past_target = repeated_past_target[:, :-1, ] @@ -607,8 +608,10 @@ def forward(self, else: # For other models, the full past targets are passed to the network. encoder_output = self.encoder(x_past) - repeated_past_target = targets_past.repeat_interleave(repeats=self.num_samples, - dim=0).squeeze(1) + max_lag_seq_length = max(max(self.encoder.lagged_value), self.window_size) + 1 + repeated_past_target = targets_past[:, -max_lag_seq_length - 1:, ].repeat_interleave( + repeats=self.num_samples, + dim=0).squeeze(1) repeated_static_feat = features_static.repeat_interleave( repeats=self.num_samples, dim=0 diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py index a859e4ec5..95457bdc2 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py @@ -127,8 +127,8 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_groups: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_groups", - value_range=(1, 15), - default_value=5, + value_range=(1, 5), + default_value=3, ), activation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="activation", value_range=tuple(_activations.keys()), diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index b1f3bd11d..91c9cf103 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -409,11 +409,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: fraction_samples_per_seq ) """ - if self.sample_strategy == 'length_uniform': + if self.sample_strategy == 'LengthUniform': available_seq_length = seq_train_length - min_start available_seq_length = np.where(available_seq_length <= 1, 1, available_seq_length) num_instances_per_seqs = num_instances_train / num_instances_dataset * available_seq_length - elif self.sample_strategy == 'seq_uniform': + elif self.sample_strategy == 'SeqUniform': num_seq_train = len(seq_train_length) num_instances_per_seqs = np.repeat(num_instances_train / num_seq_train, num_seq_train) else: @@ -585,8 +585,8 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, default_value=50), sample_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="sample_strategy", - value_range=('length_uniform', 'seq_uniform'), - default_value='seq_uniform'), + value_range=('LengthUniform', 'SeqUniform'), + default_value='SeqUniform'), backcast: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='backcast', value_range=(True, False), @@ -610,9 +610,9 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, window_size (int): window size, (if 
activate) this value directly determines the window_size of the data loader num_batch_per_epoch (int): how many batches are trained at each iteration - sample_strategy(str): how samples are distributed. if it is length_uniform, then every single data point + sample_strategy(str): how samples are distributed. if it is LengthUnifrom, then every single data point has the same probability to be sampled, in which case longer sequence will occupy more - samples. If it is seq_uniform, then every sequence has the same probability to be + samples. If it is SeqUniform, then every sequence has the same probability to be sampled regardless of their length backcast (bool): if back_cast module is activate (in which case window size is a multiple of n_prediction_steps) From df69c79fa000393a770cb07525b946c76790ee05 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 17 Jan 2022 13:33:48 +0100 Subject: [PATCH 129/347] fix budget num_seq --- .../data_loader/time_series_forecasting_data_loader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 91c9cf103..dc0ab7158 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -164,7 +164,7 @@ def __init__(self, idx_start = idx_tracker num_interval = int(np.ceil(num_instances)) - if num_interval > idx_end - idx_start: + if num_interval > idx_end - idx_start or num_interval == 0: interval = np.linspace(idx_start, idx_end, 2, endpoint=True, dtype=np.int) # we consider num_expected_ins_decimal.append(num_instances) @@ -401,7 +401,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: _, seq_train_length = np.unique(train_split - np.arange(len(train_split)), return_counts=True) # create masks for masking seq_idx_inactivate = np.where(self.random_state.rand(seq_train_length.size) > fraction_seq) - seq_train_length[seq_idx_inactivate] = 0 # this budget will reduce the number of samples inside each sequence, e.g., the samples becomes more sparse """ num_instances_per_seqs = np.ceil( @@ -418,6 +417,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: num_instances_per_seqs = np.repeat(num_instances_train / num_seq_train, num_seq_train) else: raise NotImplementedError(f'Unsupported sample strategy: {self.sample_strategy}') + + num_instances_per_seqs[seq_idx_inactivate] = 0 num_instances_per_seqs *= fraction_samples_per_seq #num_instances_per_seqs = num_instances_per_seqs.astype(seq_train_length.dtype) From 91acd5b449e6b7d39287ca0e023ec6124f11cb17 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 17 Jan 2022 17:20:02 +0100 Subject: [PATCH 130/347] faster sampler and lagger --- autoPyTorch/api/time_series_forecasting.py | 20 ++++---- autoPyTorch/constants_forecasting.py | 2 + autoPyTorch/datasets/time_series_dataset.py | 25 +++++++--- .../setup/network/forecasting_network.py | 47 +++++++++++++++++-- .../forecasting_backbone/RNNEncoder.py | 12 ----- .../TransformerEncoder.py | 12 ----- .../forecasting_decoder/RNNDecoder.py | 12 ----- .../forecasting_decoder/TransformerDecoder.py | 12 ----- .../time_series_forecasting_data_loader.py | 38 +++++++++------ 9 files changed, 95 insertions(+), 85 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py 
b/autoPyTorch/api/time_series_forecasting.py index 5ee60736e..65135f7f0 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -7,10 +7,7 @@ import pandas as pd from autoPyTorch.api.base_task import BaseTask -from autoPyTorch.constants import ( - TASK_TYPES_TO_STRING, - TIMESERIES_FORECASTING, -) +from autoPyTorch.constants import TASK_TYPES_TO_STRING, TIMESERIES_FORECASTING from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator from autoPyTorch.datasets.base_dataset import BaseDataset from autoPyTorch.datasets.resampling_strategy import ( @@ -21,7 +18,7 @@ from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates -from autoPyTorch.constants_forecasting import MAX_WINDOW_SIZE_BASE, SEASONALITY_MAP +from autoPyTorch.constants_forecasting import MAX_WINDOW_SIZE_BASE class TimeSeriesForecastingTask(BaseTask): @@ -288,11 +285,13 @@ def search( base_window_size = int(np.ceil(self.dataset.freq_value)) # we don't want base window size to large, which might cause a too long computation time, in which case # we will use n_prediction_step instead (which is normally smaller than base_window_size) - if base_window_size > self.dataset.upper_window_size or base_window_size > MAX_WINDOW_SIZE_BASE: - # TODO considering padding to allow larger upper_window_size !!! - base_window_size = int(np.ceil(min(n_prediction_steps, self.dataset.upper_window_size))) if base_window_size > MAX_WINDOW_SIZE_BASE: - base_window_size = 50 # TODO this value comes from setting of solar dataset, do we have a better choice? + # TODO considering padding to allow larger upper_window_size !!! 
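# Editor's note: a hypothetical worked example (not part of the patch) of how the window_size search
# space sketched around this hunk follows from the dataset frequency; the concrete numbers are
# assumptions chosen purely for illustration.
import numpy as np

MAX_WINDOW_SIZE_BASE = 500
n_prediction_steps = 12
freq_value = 24                                          # e.g. hourly data with daily seasonality

base_window_size = int(np.ceil(freq_value))              # 24
if base_window_size > MAX_WINDOW_SIZE_BASE:
    base_window_size = n_prediction_steps if n_prediction_steps <= MAX_WINDOW_SIZE_BASE else 50

value_range = [1, int(base_window_size * 3)]             # [1, 72]
default_value = int(np.ceil(1.25 * base_window_size))    # 30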
+ if n_prediction_steps > MAX_WINDOW_SIZE_BASE: + base_window_size = 50 + else: + base_window_size = n_prediction_steps + if self.search_space_updates is None: self.search_space_updates = HyperparameterSearchSpaceUpdates() @@ -305,9 +304,6 @@ def search( default_value=int(np.ceil(1.25 * base_window_size)), ) - seasonality = SEASONALITY_MAP.get(self.dataset.freq, 1) - if isinstance(seasonality, list): - seasonality = min(seasonality) # Use to calculate MASE self._metrics_kwargs = {'sp': self.dataset.seasonality, 'n_prediction_steps': n_prediction_steps} diff --git a/autoPyTorch/constants_forecasting.py b/autoPyTorch/constants_forecasting.py index 3b5a355ca..e1d27f70e 100644 --- a/autoPyTorch/constants_forecasting.py +++ b/autoPyTorch/constants_forecasting.py @@ -17,4 +17,6 @@ } MAX_WINDOW_SIZE_BASE = 500 +MIN_LONG_WINDOW_SIZE = 100 +MIN_LONG_FORECASTING_HORIZON = 100 diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 8507c0038..f46a728a2 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -29,6 +29,10 @@ HoldoutValTypes ) +from gluonts.time_feature.lag import get_lags_for_frequency +from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP + + from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ TimeSeriesTransformer @@ -183,6 +187,7 @@ def __init__(self, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, validator: Optional[TimeSeriesForecastingInputValidator] = None, + lagged_value: Optional[List[int]] = None, n_prediction_steps: int = 1, dataset_name: Optional[str] = None, shift_input_data: bool = True, @@ -194,6 +199,7 @@ def __init__(self, TODO add supports on X for pandas and target variables can be str or Tuple[str] :param freq: Optional[Union[str, int]] frequency of the series sequences, used to determine the (possible) period + :param lagged_value: lagged values applied to RNN and Transformer that allows them to use previous data :param n_prediction_steps: The number of steps you want to forecast into the future :param shift_input_data: bool if the input X and targets needs to be shifted to be aligned: @@ -216,10 +222,10 @@ def __init__(self, "you could pass freq with a numerical value") freq_value = SEASONALITY_MAP.get(freq, None) if isinstance(freq, list): - tmp_freq = min([freq_value for freq_value in freq if freq_value > n_prediction_steps]) + tmp_freq = min([freq_value for freq_value in freq if freq_value >= n_prediction_steps]) freq_value = tmp_freq if isinstance(freq_value, list): - tmp_freq = min([freq_value_item for freq_value_item in freq_value if freq_value_item > n_prediction_steps]) + tmp_freq = min([freq_value_item for freq_value_item in freq_value if freq_value_item >= n_prediction_steps]) freq_value = tmp_freq seasonality = SEASONALITY_MAP.get(freq, 1) @@ -347,6 +353,13 @@ def __init__(self, self.splits = self.get_splits_from_resampling_strategy() + # TODO doing experiments to give the most proper way of defining these two values + if lagged_value is None: + freq = FREQUENCY_MAP[self.freq] + lagged_value = [0] + get_lags_for_frequency(freq) + + self.lagged_value = lagged_value + def __getitem__(self, idx, train=True): if idx < 0: if -idx > len(self): @@ -560,7 +573,7 @@ def 
get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] if self.val_tensors is not None: upper_window_size = np.min(self.sequence_lengths_train) - self.n_prediction_steps else: - upper_window_size = int(np.min(self.sequence_lengths_train) * 1 - val_share) - self.n_prediction_steps + upper_window_size = int(np.min(self.sequence_lengths_train) * 1 - val_share) elif isinstance(self.resampling_strategy, CrossValTypes): num_splits = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( @@ -572,13 +585,12 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] cross_val_type=self.resampling_strategy, num_splits=cast(int, num_splits), )) - upper_window_size = (np.min(self.sequence_lengths_train) // num_splits) - self.n_prediction_steps + upper_window_size = (np.min(self.sequence_lengths_train)) - self.n_prediction_steps * (num_splits - 1) elif self.resampling_strategy is None: splits.append(self.create_refit_split()) upper_window_size = np.inf else: raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") - self.upper_window_size = upper_window_size return splits @@ -604,7 +616,8 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> 'upper_window_size': self.upper_window_size, 'sp': self.seasonality, # For metric computation, 'freq': self.freq, - 'sequence_lengths_train': self.sequence_lengths_train}) + 'sequence_lengths_train': self.sequence_lengths_train, + 'lagged_value': self.lagged_value}) return dataset_properties def create_cross_val_splits( diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 961c8b487..f4139290f 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -104,11 +104,46 @@ def get_lagged_subsequences( # (N, I, S, C) lagged_seq = torch.masked_select(sequence, mask_extend).reshape(batch_size, num_lags, subsequences_length, -1) + lagged_seq = torch.transpose(lagged_seq, 1, 2).reshape(batch_size, subsequences_length, -1) return lagged_seq, mask +def get_lagged_subsequences_inference( + sequence: torch.Tensor, + subsequences_length: int, + lags_seq: Optional[List[int]] = None, ): + """ + this function works exactly the same as get_lagged_subsequences. However, this implementation is faster when no + cached value is available, thus it more suitable during inference times. 
+ + designed for doing inference for DeepAR, the core idea is to use + """ + sequence_length = sequence.shape[1] + batch_size = sequence.shape[0] + lagged_values = [] + for lag_index in lags_seq: + begin_index = -lag_index - subsequences_length + end_index = -lag_index if lag_index > 0 else None + if end_index is not None and end_index < -sequence_length: + lagged_values.append(torch.zeros([batch_size, subsequences_length, *sequence.shape[2:]])) + continue + if begin_index < -sequence_length: + if end_index is not None: + pad_shape = [batch_size, subsequences_length - sequence_length - end_index, *sequence.shape[2:]] + lagged_values.append(torch.cat([torch.zeros(pad_shape), sequence[:, :end_index, ...]], dim=1)) + else: + pad_shape = [batch_size, subsequences_length - sequence_length, *sequence.shape[2:]] + lagged_values.append(torch.cat([torch.zeros(pad_shape), sequence], dim=1)) + continue + else: + lagged_values.append(sequence[:, begin_index:end_index, ...]) + + lagged_seq = torch.stack(lagged_values, -1).transpose(-1, -2).reshape(batch_size, subsequences_length, -1) + return lagged_seq + + class ForecastingNet(nn.Module): future_target_required = False @@ -383,9 +418,10 @@ def forward(self, if self.decoder_lagged_input: x_future = torch.cat([targets_past, predicted_target.cpu()], dim=1) if self.decoder_has_hidden_states: - x_future, _ = get_lagged_subsequences(x_future, 1, self.decoder.lagged_value) + x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder.lagged_value) else: - x_future, _ = get_lagged_subsequences(x_future, idx_pred + 1, self.decoder.lagged_value) + x_future = get_lagged_subsequences_inference(x_future, idx_pred + 1, + self.decoder.lagged_value) else: if self.decoder_has_hidden_states: x_future = predicted_target[:, [-1]] @@ -454,9 +490,10 @@ def forward(self, if self.decoder_lagged_input: x_future = torch.cat([repeated_past_target, repeated_predicted_target.cpu()], dim=1) if self.decoder_has_hidden_states: - x_future, _ = get_lagged_subsequences(x_future, 1, self.decoder.lagged_value) + x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder.lagged_value) else: - x_future, _ = get_lagged_subsequences(x_future, idx_pred + 1, self.decoder.lagged_value) + x_future = get_lagged_subsequences_inference(x_future, idx_pred + 1, + self.decoder.lagged_value) else: if self.decoder_has_hidden_states: x_future = repeated_predicted_target[:, [-1]] @@ -634,7 +671,7 @@ def forward(self, for k in range(1, self.n_prediction_steps): if self.encoder_lagged_input: x_next = torch.cat([repeated_past_target, *all_samples], dim=1) - x_next, _ = get_lagged_subsequences(x_next, 1, self.encoder.lagged_value) + x_next = get_lagged_subsequences_inference(x_next, 1, self.encoder.lagged_value) else: x_next = next_sample diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py index 1c9bef50e..dce0850e6 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py @@ -12,14 +12,12 @@ import torch from torch import nn -from gluonts.time_feature.lag import get_lags_for_frequency from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder import ( BaseForecastingEncoder, EncoderNetwork 
) from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter -from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP class _RNN(EncoderNetwork): @@ -100,18 +98,8 @@ def encoder_properties(self): return encoder_properties def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - freq = X['dataset_properties'].get('freq', None) if 'lagged_value' in X['dataset_properties']: self.lagged_value = X['dataset_properties']['lagged_value'] - if freq is not None: - try: - freq = FREQUENCY_MAP[freq] - lagged_values = get_lags_for_frequency(freq) - self.lagged_value = [0] + lagged_values - except Exception: - warnings.warn(f'cannot find the proper lagged value for {freq}, we use the default lagged value') - # If - pass return super().fit(X, y) def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py index 8c6de0668..12c2e6a52 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py @@ -12,14 +12,12 @@ import torch from torch import nn -from gluonts.time_feature.lag import get_lags_for_frequency from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder import ( BaseForecastingEncoder, EncoderNetwork ) from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter -from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.transformer_util import \ PositionalEncoding, build_transformer_layers @@ -105,18 +103,8 @@ def encoder_properties(self): return encoder_properties def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - freq = X['dataset_properties'].get('freq', None) if 'lagged_value' in X['dataset_properties']: self.lagged_value = X['dataset_properties']['lagged_value'] - if freq is not None: - try: - freq = FREQUENCY_MAP[freq] - lagged_values = get_lags_for_frequency(freq) - self.lagged_value = [0] + lagged_values - except Exception: - warnings.warn(f'cannot find the proper lagged value for {freq}, we use the default lagged value') - # If - pass return super().fit(X, y) def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index 59ec726d0..aec3874f3 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -10,7 +10,6 @@ from torch import nn import numpy as np -from gluonts.time_feature.lag import get_lags_for_frequency from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator @@ -18,7 +17,6 @@ BaseForecastingDecoder, RecurrentDecoderNetwork from autoPyTorch.utils.common import HyperparameterSearchSpace, 
get_hyperparameter, FitRequirement -from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP class RNN_Module(RecurrentDecoderNetwork): @@ -102,18 +100,8 @@ def decoder_properties(self): def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.rnn_kwargs = X['rnn_kwargs'] - - freq = X['dataset_properties'].get('freq', None) if 'lagged_value' in X['dataset_properties']: self.lagged_value = X['dataset_properties']['lagged_value'] - if freq is not None: - try: - freq = FREQUENCY_MAP[freq] - lagged_values = get_lags_for_frequency(freq) - self.lagged_value = [0] + lagged_values - except Exception: - warnings.warn(f'cannot find the proper lagged value for {freq}, we use the default lagged value') - pass return super().fit(X, y) @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index d6d0a501f..41b5fdbbc 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -4,7 +4,6 @@ import torch from torch import nn import numpy as np -from gluonts.time_feature.lag import get_lags_for_frequency import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace @@ -25,7 +24,6 @@ PositionalEncoding, build_transformer_layers from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter, FitRequirement -from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP class _TransformerDecoder(RecurrentDecoderNetwork): @@ -115,18 +113,8 @@ def decoder_properties(self): def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.transformer_encoder_kwargs = X['transformer_encoder_kwargs'] - - freq = X['dataset_properties'].get('freq', None) if 'lagged_value' in X['dataset_properties']: self.lagged_value = X['dataset_properties']['lagged_value'] - if freq is not None: - try: - freq = FREQUENCY_MAP[freq] - lagged_values = get_lags_for_frequency(freq) - self.lagged_value = [0] + lagged_values - except Exception: - warnings.warn(f'cannot find the proper lagged value for {freq}, we use the default lagged value') - pass return super().fit(X, y) @staticmethod diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index dc0ab7158..155b4ea80 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -38,8 +38,10 @@ def __getitem__(self, idx: int) -> np.ndarray: seq = self.dataset[idx] return seq.__getitem__(len(seq) - 1, self.train) + def pad_sequence_from_start(sequences: List[torch.Tensor], seq_minimal_length: int, + seq_max_length: int = np.inf, batch_first=True, padding_value=0.0) -> torch.Tensor: r""" @@ -52,7 +54,9 @@ def pad_sequence_from_start(sequences: List[torch.Tensor], # in sequences are same and fetching those from sequences[0] max_size = sequences[0].size() trailing_dims = max_size[1:] - max_len = max(max([s.size(0) for s in sequences]), seq_minimal_length) + max_len = min(max(max([s.size(0) for s in sequences]), 
seq_minimal_length), seq_max_length) + if seq_max_length > max_len: + seq_max_length = max_len if batch_first: out_dims = (len(sequences), max_len) + trailing_dims else: @@ -60,12 +64,12 @@ def pad_sequence_from_start(sequences: List[torch.Tensor], out_tensor = sequences[0].new_full(out_dims, padding_value) for i, tensor in enumerate(sequences): - length = tensor.size(0) + length = min(tensor.size(0), seq_max_length) # use index notation to prevent duplicate references to the tensor if batch_first: - out_tensor[i, -length:, ...] = tensor + out_tensor[i, -length:, ...] = tensor[-length:] else: - out_tensor[-length:, i, ...] = tensor + out_tensor[-length:, i, ...] = tensor[-length:] return out_tensor @@ -78,9 +82,10 @@ class PadSequenceCollector: """ - def __init__(self, window_size: int, target_padding_value: float = 0.0): + def __init__(self, window_size: int, target_padding_value: float = 0.0, seq_max_length: int = np.inf): self.window_size = window_size self.target_padding_value = target_padding_value + self.seq_max_length = seq_max_length def __call__(self, batch, padding_value=0.0): elem = batch[0] @@ -88,7 +93,8 @@ def __call__(self, batch, padding_value=0.0): if isinstance(elem, torch.Tensor): seq = pad_sequence_from_start(batch, seq_minimal_length=self.window_size, - batch_first=True, padding_value=padding_value) # type: torch.Tensor + seq_max_length=self.seq_max_length, + batch_first=True, padding_value=padding_value) # type: torch.Tensor return seq elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \ @@ -109,7 +115,7 @@ def __call__(self, batch, padding_value=0.0): return batch elif isinstance(elem, collections.abc.Mapping): return {key: self([d[key] for d in batch]) if key != "past_target" - else self([d[key] for d in batch], self.target_padding_value) for key in elem} + else self([d[key] for d in batch], self.target_padding_value) for key in elem} raise TypeError(f"Unsupported data type {elem_type}") @@ -226,6 +232,7 @@ def __iter__(self): def __len__(self): return self.num_instances + class ExpandTransformTimeSeries(object): """Expand Dimensionality so tabular transformations see a 2d Array, unlike the ExpandTransform defined under tabular dataset, the dimension is expanded @@ -237,6 +244,7 @@ def __call__(self, data: np.ndarray) -> np.ndarray: data = np.expand_dims(data, axis=-1) return data + class SequenceBuilder(object): """build a time sequence token from the given time sequence it requires two hyperparameters: sample_interval and window size @@ -275,6 +283,7 @@ def __call__(self, data: np.ndarray) -> np.ndarray: return data[sample_indices] + class TimeSeriesForecastingDataLoader(FeatureDataLoader): """This class is an interface to read time sequence data @@ -290,7 +299,7 @@ def __init__(self, window_size: int = 1, num_batches_per_epoch: Optional[int] = 50, n_prediction_steps: int = 1, - sample_strategy= 'seq_uniform', + sample_strategy='seq_uniform', random_state: Optional[np.random.RandomState] = None) -> None: """ initialize a dataloader @@ -339,7 +348,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: if self.backcast: self.window_size = self.backcast_period * self.n_prediction_steps - # this value corresponds to budget type resolution sample_interval = X.get('sample_interval', 1) padding_value = X.get('required_padding_value', 0.0) @@ -348,7 +356,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: # for lower resolution, window_size should be smaller self.window_size = 
(self.window_size - 1) // sample_interval + 1 - self.padding_collector = PadSequenceCollector(self.window_size, padding_value) + max_lagged_value = max(X['dataset_properties'].get('lagged_value', [np.inf])) + max_lagged_value += self.window_size + self.n_prediction_steps + + self.padding_collector = PadSequenceCollector(self.window_size, padding_value, max_lagged_value) # this value corresponds to budget type num_sequence fraction_seq = X.get('fraction_seq', 1.0) @@ -383,7 +394,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: num_instances_dataset = np.size(train_split) num_instances_train = self.num_batches_per_epoch * self.batch_size - # get the length of each sequence of training data (after split) # as we already know that the elements in 'train_split' increases consecutively with a certain number of # discontinuity where a new sequence is sampled: [0, 1, 2 ,3, 7 ,8 ]. @@ -421,7 +431,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: num_instances_per_seqs[seq_idx_inactivate] = 0 num_instances_per_seqs *= fraction_samples_per_seq - #num_instances_per_seqs = num_instances_per_seqs.astype(seq_train_length.dtype) + # num_instances_per_seqs = num_instances_per_seqs.astype(seq_train_length.dtype) # at least one element of each sequence should be selected # TODO consider the case where num_instances_train is greater than num_instances_dataset, @@ -446,7 +456,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.val_data_loader = torch.utils.data.DataLoader( val_dataset, - batch_size=min(self.batch_size, len(val_dataset)), + batch_size=min(1000, len(val_dataset)), shuffle=False, num_workers=X.get('num_workers', 0), pin_memory=X.get('pin_memory', True), @@ -539,7 +549,7 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd return torch.utils.data.DataLoader( dataset_test, - batch_size=min(batch_size, len(dataset), self.batch_size), + batch_size=min(batch_size, len(dataset)), shuffle=False, collate_fn=partial(custom_collate_fn, x_collector=self.padding_collector), ) From fa96b27366d086ba575926a4ba1042898a619eb4 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 17 Jan 2022 17:51:18 +0100 Subject: [PATCH 131/347] maint --- .../data_loader/time_series_forecasting_data_loader.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 155b4ea80..61c7a3dba 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -188,6 +188,7 @@ def __init__(self, num_expected_ins_decimal = np.stack(num_expected_ins_decimal) # seq_intervals_decimal_length = np.stack(seq_intervals_decimal_length) self.seq_lengths = seq_lengths + self.seq_lengths_sum = np.sum(seq_lengths) self.num_instances = int(np.round(np.sum(num_instances_per_seqs))) self.seq_intervals_decimal = torch.from_numpy(np.stack(seq_intervals_decimal)) @@ -204,7 +205,6 @@ def __iter__(self): for idx_seq, (interval, seq_length) in enumerate(zip(self.seq_intervals_int, self.seq_lengths)): if len(interval) == 1: continue - num_samples = len(interval) - 1 idx_samples_end = idx_samples_start + num_samples @@ -224,9 +224,12 @@ def __iter__(self): samples_shift = 
torch.rand(num_samples_remain, generator=self.generator) samples_shift *= (seq_interval[:, 1] - seq_interval[:, 0]) - samples_seq_remain = torch.floor(samples_shift + seq_interval[:, 0]).int() + samples_seq_remain = torch.floor(samples_shift).int() + seq_interval[:, 0] samples[-num_samples_remain:] = samples_seq_remain + # sometimes if self.seq_lengths_sum is too large, float might not be accurate enough + samples = torch.where(samples == self.seq_lengths_sum, samples - 1, samples) + yield from (samples[i] for i in torch.randperm(self.num_instances, generator=self.generator)) def __len__(self): From 995684a5e37edcbbb3d327666c6baabb2dc50764 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 17 Jan 2022 20:36:42 +0100 Subject: [PATCH 132/347] maint --- .../network_backbone/forecasting_backbone/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index 419f0e2d7..d8daf652d 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -231,12 +231,16 @@ def get_hyperparameter_search_space( encoder2decoder[encoder_name] = allowed_decoders for decoder_name in available_decoders.keys(): + if not decoder2encoder[decoder_name]: + continue updates = self._get_search_space_updates(prefix=decoder_name) config_space = available_decoders[decoder_name].get_hyperparameter_search_space(dataset_properties, # type: ignore **updates) compatible_encoders = decoder2encoder[decoder_name] encoders_with_multi_decoder = [] encoder_with_uni_decoder = [] + # this could happen if its parent encoder is not part of + inactive_decoder = [] for encoder in compatible_encoders: if len(encoder2decoder[encoder]) > 1: encoders_with_multi_decoder.append(encoder) @@ -269,7 +273,9 @@ def get_hyperparameter_search_space( or_cond.append(EqualsCondition(hp, hp_decoder_type, decoder_name)) - if len(or_cond) > 1: + if len(or_cond) == 0: + continue + elif len(or_cond) > 1: conditions_to_add.append(OrConjunction(*or_cond)) else: conditions_to_add.append(or_cond[0]) From d3a0e319b25105f28ae6c96ad8bac39b08d83e8a Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 18 Jan 2022 10:51:48 +0100 Subject: [PATCH 133/347] maint deepAR --- .../setup/network/forecasting_network.py | 81 ++++++++++++++----- 1 file changed, 63 insertions(+), 18 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index f4139290f..cf98da2ed 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -267,7 +267,7 @@ def forward(self, features_future: Optional[torch.Tensor] = None, features_static: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): - + # TODO We need to replace thus None with empty tensors to avoid checking if they are None every time! 
if self.encoder_lagged_input: targets_past[:, -self.window_size:], _, loc, scale = self.target_scaler(targets_past[:, -self.window_size:]) targets_past[:, :-self.window_size] = self.scale_value(targets_past[:, :-self.window_size], loc, scale) @@ -471,8 +471,10 @@ def forward(self, else: # Transformer's hidden states is of shape repeated_state = features_latent.repeat_interleave(repeats=self.num_samples, dim=0) - - max_lag_seq_length = max(self.decoder.lagged_value) + 1 + if self.decoder_lagged_input: + max_lag_seq_length = max(self.decoder.lagged_value) + 1 + else: + max_lag_seq_length = 1 + self.window_size repeated_past_target = targets_past[:, -max_lag_seq_length:].repeat_interleave(repeats=self.num_samples, dim=0).squeeze(1) repeated_predicted_target = repeated_past_target[:, [-1]] @@ -504,8 +506,9 @@ def forward(self, x_future = x_future if repeated_time_feat is None else torch.cat( [repeated_time_feat[:, [idx_pred], :], x_future], dim=-1) else: + # decoder uses the entire future targets x_future = x_future if repeated_time_feat is None else torch.cat( - [repeated_time_feat[:, idx_pred + 1, :], x_future], dim=-1) + [repeated_time_feat[:, :idx_pred + 1, :], x_future], dim=-1) x_future = x_future.to(self.device) if self.decoder_has_hidden_states: @@ -593,7 +596,7 @@ def forward(self, x_input = targets_all if features_past is not None: - features_all = torch.cat([features_past, features_future], dim=1) + features_all = torch.cat([features_past[:, 1:], features_future], dim=1) x_input = torch.cat([features_all, x_input], dim=-1) x_input = x_input.to(self.device) @@ -622,8 +625,18 @@ def forward(self, targets_past, _, loc, scale = self.target_scaler(targets_past) x_past = targets_past - - x_past = x_past if features_past is None else torch.cat([features_past, x_past], dim=-1) + if features_past is not None: + # features is one step ahead of target + if self.window_size > 1: + features_all = torch.cat([features_past[:, -self.window_size + 1:, ], + features_future], + dim=1) + else: + features_all = features_future + else: + features_all = None + x_past = x_past if features_all is None else torch.cat([features_all[:, :self.window_size], x_past], + dim=-1) x_past = x_past.to(self.device) # TODO consider static features @@ -645,8 +658,12 @@ def forward(self, else: # For other models, the full past targets are passed to the network. encoder_output = self.encoder(x_past) - max_lag_seq_length = max(max(self.encoder.lagged_value), self.window_size) + 1 - repeated_past_target = targets_past[:, -max_lag_seq_length - 1:, ].repeat_interleave( + if self.encoder_lagged_input: + max_lag_seq_length = max(max(self.encoder.lagged_value), self.window_size) + else: + max_lag_seq_length = self.window_size + # TODO considering padding targets here instead of inside get_lagged function + repeated_past_target = targets_past[:, -max_lag_seq_length:, ].repeat_interleave( repeats=self.num_samples, dim=0).squeeze(1) @@ -654,7 +671,17 @@ def forward(self, repeats=self.num_samples, dim=0 ).unsqueeze(dim=1) if features_static is not None else None - repeated_time_feat = features_future.repeat_interleave( + if not self.encoder_has_hidden_states: + if features_all is not None: + # both feature_past and feature_future must exist or not, otherwise deepAR is disabled due to + # data properties!!! 
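# Editor's note: an illustrative sketch (not part of the patch) of what the lagged-subsequence
# gathering used above computes for a single lag: for lag l and subsequence length S, the slice
# taken from the past target is sequence[:, -l - S : -l or None]. Values below are assumptions for
# illustration; the real helpers additionally zero-pad when the history is shorter than the largest lag.
import torch

sequence = torch.arange(1., 9.).reshape(1, 8, 1)   # [B=1, T=8, F=1], values 1..8
subseq_len, lags = 2, [0, 3]

lagged = []
for lag in lags:
    begin, end = -lag - subseq_len, -lag if lag > 0 else None
    lagged.append(sequence[:, begin:end, :])
stacked = torch.cat(lagged, dim=-1)                # [1, 2, 2], one column per lag
print(stacked.squeeze(0))                          # tensor([[7., 4.], [8., 5.]])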
+ time_feature = features_all + else: + time_feature = None + else: + time_feature = features_future[:, 1:] if self.n_prediction_steps > 1 else None + + repeated_time_feat = time_feature.repeat_interleave( repeats=self.num_samples, dim=0 ) if features_future is not None else None @@ -669,22 +696,40 @@ def forward(self, all_samples.append(next_sample) for k in range(1, self.n_prediction_steps): - if self.encoder_lagged_input: - x_next = torch.cat([repeated_past_target, *all_samples], dim=1) - x_next = get_lagged_subsequences_inference(x_next, 1, self.encoder.lagged_value) + if self.encoder_has_hidden_states: + if self.encoder_lagged_input: + x_next = torch.cat([repeated_past_target, *all_samples], dim=1) + x_next = get_lagged_subsequences_inference(x_next, 1, self.encoder.lagged_value) + else: + x_next = next_sample + x_next = x_next if repeated_time_feat is None else torch.cat([repeated_time_feat[:, k - 1:k], + x_next], dim=-1) + x_next = x_next.to(self.device) + encoder_output, repeated_state = self.encoder(x_next, hx=repeated_state) else: - x_next = next_sample + if self.encoder_lagged_input: + x_next = torch.cat([repeated_past_target, *all_samples], dim=1) + x_next = get_lagged_subsequences_inference(x_next, + self.window_size + k, + self.encoder.lagged_value) + else: + x_next = torch.cat([repeated_past_target[:, -self.window_size:], *all_samples], dim=1) + if repeated_time_feat is None: + x_next = x_next + else: + x_next = torch.cat([repeated_time_feat[:, :self.window_size + k], + x_next], dim=-1) + + encoder_output = self.encoder(x_next) - x_next = x_next if repeated_time_feat is None else torch.cat([repeated_time_feat[:, k:k + 1], - x_next], dim=-1) if self.encoder_has_hidden_states: x_next = x_next.to(self.device) encoder_output, repeated_state = self.encoder(x_next, hx=repeated_state) else: x_next = torch.cat([repeated_past_target, *all_samples], dim=1).to(self.device) encoder_output = self.encoder(x_next) - # for training, the encoder output a sequence. Thus for prediction, the network should have the same - # format output + # During training, the encoder output a sequence. Thus for prediction, the network should have the same + # output format encoder_output = torch.unsqueeze(encoder_output, 1) net_output = self.head(self.decoder(encoder_output)) From d8b389210c291439877e740fdbb343865df8c09a Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 18 Jan 2022 13:58:21 +0100 Subject: [PATCH 134/347] maint --- .../setup/network/forecasting_network.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index cf98da2ed..848a7849f 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -671,15 +671,15 @@ def forward(self, repeats=self.num_samples, dim=0 ).unsqueeze(dim=1) if features_static is not None else None - if not self.encoder_has_hidden_states: - if features_all is not None: - # both feature_past and feature_future must exist or not, otherwise deepAR is disabled due to - # data properties!!! - time_feature = features_all + if features_all is not None: + if not self.encoder_has_hidden_states: + # both feature_past and feature_future must exist or not, otherwise deepAR is disabled due to + # data properties!!! 
+ time_feature = features_all else: - time_feature = None + time_feature = features_future[:, 1:] if self.n_prediction_steps > 1 else None else: - time_feature = features_future[:, 1:] if self.n_prediction_steps > 1 else None + time_feature = None repeated_time_feat = time_feature.repeat_interleave( repeats=self.num_samples, dim=0 From 5e9fbae18f06bb6d13518148191d8cafa6b065e6 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 19 Jan 2022 17:25:35 +0100 Subject: [PATCH 135/347] maint --- .../components/setup/network/forecasting_network.py | 8 +------- .../network_backbone/forecasting_backbone/MLPEncoder.py | 4 ++-- .../network_backbone/forecasting_backbone/RNNEncoder.py | 4 ++-- .../network_backbone/forecasting_backbone/TCNEncoder.py | 4 ++-- .../forecasting_backbone/TransformerEncoder.py | 2 +- .../forecasting_decoder/MLPDecoder.py | 6 +++--- .../forecasting_decoder/NBEATSDecoder.py | 2 +- .../forecasting_decoder/TransformerDecoder.py | 4 ++-- 8 files changed, 14 insertions(+), 20 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 848a7849f..bf6bbe8c0 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -719,15 +719,9 @@ def forward(self, else: x_next = torch.cat([repeated_time_feat[:, :self.window_size + k], x_next], dim=-1) - - encoder_output = self.encoder(x_next) - - if self.encoder_has_hidden_states: x_next = x_next.to(self.device) - encoder_output, repeated_state = self.encoder(x_next, hx=repeated_state) - else: - x_next = torch.cat([repeated_past_target, *all_samples], dim=1).to(self.device) encoder_output = self.encoder(x_next) + # During training, the encoder output a sequence. 
Thus for prediction, the network should have the same # output format encoder_output = torch.unsqueeze(encoder_output, 1) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py index 95457bdc2..1483f2833 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py @@ -141,7 +141,7 @@ def get_hyperparameter_search_space( ), num_units: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_units", value_range=(16, 1024), - default_value=256, + default_value=64, log=True ), normalization: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='normalization', @@ -149,7 +149,7 @@ def get_hyperparameter_search_space( default_value='BN'), dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dropout", value_range=(0, 0.8), - default_value=0.5, + default_value=0.1, ), ) -> ConfigurationSpace: cs = MLPBackbone.get_hyperparameter_search_space(dataset_properties=dataset_properties, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py index dce0850e6..cfe84492f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py @@ -132,14 +132,14 @@ def get_hyperparameter_search_space( default_value=1), hidden_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='hidden_size', value_range=(32, 512), - default_value=256, + default_value=64, log=True), use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='use_dropout', value_range=(True, False), default_value=False), dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='dropout', value_range=(0., 0.5), - default_value=0.2), + default_value=0.1), bidirectional: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bidirectional', value_range=(True, False), default_value=True), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py index ba6c39aeb..a00bccfc6 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py @@ -164,11 +164,11 @@ def get_hyperparameter_search_space( default_value=3), num_filters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_filters", value_range=(4, 64), - default_value=32, + default_value=16, log=True), kernel_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="kernel_size", value_range=(2, 64), - default_value=32, + default_value=8, log=True), use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="use_dropout", value_range=(True, False), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py index 12c2e6a52..800c9fc80 100644 --- 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py @@ -167,7 +167,7 @@ def get_hyperparameter_search_space( dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dropout", value_range=(0, 0.8), - default_value=0.5, + default_value=0.1, ), decoder_type: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='decoder_type', diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index a29f807ff..1c3c6db70 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -58,10 +58,10 @@ def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_layers: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_layers", value_range=(0, 3), - default_value=2), + default_value=1), units_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="units_layer", - value_range=(64, 512), - default_value=128, + value_range=(16, 512), + default_value=64, log=True), activation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="activation", value_range=tuple(_activations.keys()), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 5cf944eec..503fba735 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -273,7 +273,7 @@ def get_hyperparameter_search_space( dropout: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="dropout", value_range=(0, 0.8), - default_value=0.5, + default_value=0.1, ), backcast_loss_ratio: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="backcast_loss_ratio", diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index 41b5fdbbc..b3d3a6ae9 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -172,8 +172,8 @@ def get_hyperparameter_search_space( ), dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dropout", - value_range=(0, 0.8), - default_value=0.5, + value_range=(0, 0.1), + default_value=0.1, ), ) -> ConfigurationSpace: """ From 3eb197e4f6a9c0019c0fc8bfa8c4e340e5abed00 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 19 Jan 2022 18:54:39 +0100 Subject: [PATCH 136/347] cross validation --- autoPyTorch/datasets/resampling_strategy.py | 16 +++++----------- 
autoPyTorch/datasets/time_series_dataset.py | 15 ++++++++------- .../time_series_forecasting_train_evaluator.py | 12 ++---------- 3 files changed, 15 insertions(+), 28 deletions(-) diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index e12b314a7..262fb8637 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -100,7 +100,7 @@ def is_stratified(self) -> bool: 'num_splits': 5, }, CrossValTypes.time_series_cross_validation: { - 'num_splits': 5, + 'num_splits': 3, }, } @@ -129,8 +129,6 @@ def stratified_holdout_validation(random_state: np.random.RandomState, random_state=random_state) return train, val - # TODO DO we move these under autoPyTorch/datasets/time_series_dataset.py? - # TODO rewrite this part, as we only need holdout sets @staticmethod def time_series_hold_out_validation(random_state: np.random.RandomState, val_share: float, indices: np.ndarray, **kwargs: Any) \ @@ -146,9 +144,8 @@ def time_series_hold_out_validation(random_state: np.random.RandomState, """ # TODO consider how we handle test size properly # Time Series prediction only requires on set of prediction for each - # This implement needs to be combined with time series forecasting dataloader, where each time an entire time series - # is used for prediction - test_size = kwargs['n_prediction_steps'] + # This implement needs to be combined with time series forecasting dataloader, where each time an entire + # time series is used for prediction cv = TimeSeriesSplit(n_splits=2, test_size=1, gap=kwargs['n_prediction_steps'] - 1) train, val = list(cv.split(indices))[-1] return train, val @@ -242,12 +239,9 @@ def time_series_cross_validation(random_state: np.random.RandomState, ([0, 1, 2], [3])] """ - # TODO: we use gap=n_prediction_step here, we need to consider if we want to implement n_prediction_step here or - # under DATALOADER!!! - # TODO do we need cross valriadtion for time series datasets? 
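A minimal sketch of what the reworked time_series_hold_out_validation split above yields on a toy sequence; the length of 10 and n_prediction_steps=3 are made-up values for illustration only:

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

n_prediction_steps = 3
indices = np.arange(10)  # positions within one training sequence

# same construction as in time_series_hold_out_validation above
cv = TimeSeriesSplit(n_splits=2, test_size=1, gap=n_prediction_steps - 1)
train, val = list(cv.split(indices))[-1]

print(train)  # [0 1 2 3 4 5 6] -> everything before the forecasting horizon
print(val)    # [9]             -> a single validation point at the end of the sequence

The gap of n_prediction_steps - 1 keeps the forecasting horizon out of the training indices, which is also why very short sequences make TimeSeriesSplit raise the ValueError that a later patch in this series catches.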
test_size = kwargs['n_prediction_steps'] - cv = TimeSeriesSplit(n_splits=num_splits, test_size=1, gap=kwargs['n_prediction_steps'] - 1) - splits = list(cv.split(indices)) + cv = TimeSeriesSplit(n_splits=num_splits, test_size=test_size, gap=0) + splits = [(indices[split[0]], indices[split[1][-1:]]) for split in cv.split(indices)] return splits @classmethod diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index f46a728a2..ee17c034c 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -646,25 +646,26 @@ def create_cross_val_splits( if not isinstance(cross_val_type, CrossValTypes): raise NotImplementedError(f'The selected `cross_val_type` "{cross_val_type}" is not implemented.') idx_start = 0 - splits = [[[] for _ in range(len(self.datasets))] for _ in range(num_splits)] kwargs = {"n_prediction_steps": self.n_prediction_steps} - splits = [[() for _ in range(self.num_sequences)] for _ in range(num_splits)] + splits = [[() for _ in range(len(self.datasets))] for _ in range(num_splits)] idx_all = self._get_indices() for idx_seq, dataset in enumerate(self.datasets): if self.shift_input_data: - split = self.cross_validators[cross_val_type.name](num_splits, - indices=np.arange(len(dataset)), **kwargs) + split = self.cross_validators[cross_val_type.name](self.random_state, + num_splits, + indices=idx_start + np.arange(len(dataset)), **kwargs) else: # If the data is not shifted, we need to discard the last n_prediction_steps such that we have enough # y values - split = self.cross_validators[cross_val_type.name](num_splits, - indices=np.arange( + split = self.cross_validators[cross_val_type.name](self.random_state, + num_splits, + indices=idx_start + np.arange( len(dataset) - self.n_prediction_steps), **kwargs) for idx_split in range(num_splits): - splits[idx_split][idx_seq] = idx_start + split[idx_split] + splits[idx_split][idx_seq] = split[idx_split] idx_start += self.sequence_lengths_train[idx_seq] # in this case, splits is stored as : # [ first split, second_split ...] diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index d4df9d999..2a5f55864 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -179,8 +179,6 @@ def fit_predict_and_loss(self) -> None: # Compute train loss of this fold and store it. train_loss could # either be a scalar or a dict of scalars with metrics as keys. - train_loss = 0. - train_losses[i] = train_loss # number of training data points for this fold. Used for weighting # the average. train_fold_weights[i] = len(train_split) @@ -203,17 +201,11 @@ def fit_predict_and_loss(self) -> None: # Compute weights of each fold based on the number of samples in each # fold. - train_fold_weights = [w / sum(train_fold_weights) - for w in train_fold_weights] + opt_fold_weights = [w / sum(opt_fold_weights) for w in opt_fold_weights] - # train_losses is a list of dicts. It is - # computed using the target metric (self.metric). 
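The opt_fold_weights normalization kept above feeds a sample-size-weighted average of per-fold losses, the same np.average pattern as the train_loss block being dropped here; a small worked example with made-up numbers:

import numpy as np

opt_fold_weights = [120, 80, 100]   # hypothetical number of validation points per fold
opt_fold_weights = [w / sum(opt_fold_weights) for w in opt_fold_weights]

opt_losses = [0.31, 0.27, 0.35]     # hypothetical per-fold optimization losses
weighted = np.average(opt_losses, weights=opt_fold_weights)
print(round(weighted, 4))  # 0.3127 -> larger folds contribute proportionally more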
- train_loss = np.average([train_losses[i][str(self.metric)] - for i in range(self.num_folds)], - weights=train_fold_weights, - ) + train_loss = None opt_loss = {} # self.logger.debug("OPT LOSSES: {}".format(opt_losses if opt_losses is not None else None)) From 6f4fdf10aa45cd07ee3add608b1150ebe905fae4 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 19 Jan 2022 19:47:20 +0100 Subject: [PATCH 137/347] allow holdout for smaller datasets --- .../data/time_series_forecasting_validator.py | 22 +++++++++---------- autoPyTorch/datasets/resampling_strategy.py | 8 +++++-- autoPyTorch/datasets/time_series_dataset.py | 22 +++++++++++++++++++ .../time_series_forecasting_data_loader.py | 1 + 4 files changed, 40 insertions(+), 13 deletions(-) diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 04469f92a..8943087db 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -117,18 +117,18 @@ def transform( y_transformed = np.expand_dims(y_transformed, -1) return X_transformed, sequence_lengths, y_transformed - num_train_data = np.sum(sequence_lengths) + num_train_data = np.sum(sequence_lengths) - # a matrix that is concatenated by all the time series sequences - X_flat = np.empty([num_train_data, num_features]) + # a matrix that is concatenated by all the time series sequences + X_flat = np.empty([num_train_data, num_features]) - start_idx = 0 - # TODO make it parallel with large number of sequences - for seq_idx, seq_length in enumerate(sequence_lengths): - end_idx = start_idx + seq_length - X_flat[start_idx: end_idx] = np.array(X[seq_idx]).reshape([-1, num_features]) - start_idx = end_idx + start_idx = 0 + # TODO make it parallel with large number of sequences + for seq_idx, seq_length in enumerate(sequence_lengths): + end_idx = start_idx + seq_length + X_flat[start_idx: end_idx] = np.array(X[seq_idx]).reshape([-1, num_features]) + start_idx = end_idx - X_transformed = self.feature_validator.transform(X_flat) + X_transformed = self.feature_validator.transform(X_flat) - return X_transformed, sequence_lengths + return X_transformed, sequence_lengths diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 262fb8637..2ab334b00 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -146,8 +146,12 @@ def time_series_hold_out_validation(random_state: np.random.RandomState, # Time Series prediction only requires on set of prediction for each # This implement needs to be combined with time series forecasting dataloader, where each time an entire # time series is used for prediction - cv = TimeSeriesSplit(n_splits=2, test_size=1, gap=kwargs['n_prediction_steps'] - 1) - train, val = list(cv.split(indices))[-1] + try: + cv = TimeSeriesSplit(n_splits=2, test_size=1, gap=kwargs['n_prediction_steps'] - 1) + train, val = list(cv.split(indices))[-1] + except ValueError: + train = np.array([], dtype=indices.dtype) + val = indices[-1:] return train, val @classmethod diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index ee17c034c..90ebbe644 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -3,6 +3,7 @@ import uuid import bisect import copy +import warnings import numpy as np @@ -286,6 +287,27 @@ def __init__(self, self.shuffle = shuffle self.random_state 
= np.random.RandomState(seed=seed) + minimal_seq_length = np.min(sequence_lengths) + if isinstance(resampling_strategy, CrossValTypes): + num_splits = DEFAULT_RESAMPLING_PARAMETERS[resampling_strategy].get( + 'num_splits', None) + if resampling_strategy_args is not None: + num_splits = resampling_strategy_args.get('num_split', num_splits) + while minimal_seq_length - n_prediction_steps * num_splits <= 0: + num_splits -= 1 + + if num_splits >= 2: + resampling_strategy = CrossValTypes.time_series_cross_validation + if resampling_strategy_args is None: + resampling_strategy_args = {'num_splits': num_splits} + else: + resampling_strategy_args.update({'num_splits': num_splits}) + else: + warnings.warn('The dataset is not suitable for cross validation, we will apply holdout instead') + + resampling_strategy = HoldoutValTypes.time_series_hold_out_validation + resampling_strategy_args = None + self.resampling_strategy = resampling_strategy self.resampling_strategy_args = resampling_strategy_args diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 61c7a3dba..5ff9d37b9 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -415,6 +415,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: # create masks for masking seq_idx_inactivate = np.where(self.random_state.rand(seq_train_length.size) > fraction_seq) # this budget will reduce the number of samples inside each sequence, e.g., the samples becomes more sparse + """ num_instances_per_seqs = np.ceil( np.ceil(num_instances_train / (num_instances_dataset - min_start) * seq_train_length) * From 8de8e505cb4fd65fc8e0599e7384309bf986b78a Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 19 Jan 2022 21:05:30 +0100 Subject: [PATCH 138/347] smac4ac to smac4hpo --- autoPyTorch/optimizer/smbo.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 5208db842..7e77bd0f9 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -9,10 +9,11 @@ import dask.distributed from smac.facade.smac_ac_facade import SMAC4AC +from smac.facade.smac_hpo_facade import SMAC4HPO from smac.intensification.hyperband import Hyperband from smac.intensification.intensification import Intensifier from smac.runhistory.runhistory import RunHistory -from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost +from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost, RunHistory2EPM4LogScaledCost from smac.scenario.scenario import Scenario from smac.tae.dask_runner import DaskParallelRunner from smac.tae.serial_runner import SerialRunner @@ -71,19 +72,22 @@ def get_smac_object( if initial_budget == max_budget: intensifier = Intensifier intensifier_kwargs = {'deterministic': True, } + rh2EPM = RunHistory2EPM4LogScaledCost + else: intensifier = Hyperband intensifier_kwargs = {'initial_budget': initial_budget, 'max_budget': max_budget, 'eta': 3, 'min_chall': 1, 'instance_order': 'shuffle_once'} + rh2EPM = RunHistory2EPM4LogCost - rh2EPM = RunHistory2EPM4LogCost - return SMAC4AC( + return SMAC4HPO( scenario=Scenario(scenario_dict), rng=seed, runhistory2epm=rh2EPM, tae_runner=ta, tae_runner_kwargs=ta_kwargs, 
initial_configurations=initial_configurations, + initial_design=None, run_id=seed, intensifier=intensifier, intensifier_kwargs=intensifier_kwargs, From 54fbe592cd8ab8e261ddcb2fa3171e0b6e687c65 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 21 Jan 2022 14:44:03 +0100 Subject: [PATCH 139/347] maint --- autoPyTorch/api/base_task.py | 22 ++++++------- autoPyTorch/api/time_series_forecasting.py | 32 +++---------------- .../trainer/forecasting_trainer/__init__.py | 2 +- .../pipeline/time_series_forecasting.py | 17 ++++++++-- requirements.txt | 2 +- 5 files changed, 31 insertions(+), 44 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 4e9e6158a..3a724f69f 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -69,7 +69,8 @@ def _pipeline_predict(pipeline: BasePipeline, X: Union[np.ndarray, pd.DataFrame], batch_size: int, logger: PicklableClientLogger, - task: int) -> np.ndarray: + task: int, + forecasting_task: bool=False) -> np.ndarray: @typing.no_type_check def send_warnings_to_log( message, category, filename, lineno, file=None, line=None): @@ -93,13 +94,13 @@ def send_warnings_to_log( prediction, np.sum(prediction, axis=1) )) - - if len(prediction.shape) < 1 or len(X_.shape) < 1 or \ - X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]: - logger.warning( - "Prediction shape for model %s is %s while X_.shape is %s", - pipeline, str(prediction.shape), str(X_.shape) - ) + if not forecasting_task: + if len(prediction.shape) < 1 or len(X_.shape) < 1 or \ + X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]: + logger.warning( + "Prediction shape for model %s is %s while X_.shape is %s", + pipeline, str(prediction.shape), str(X_.shape) + ) return prediction @@ -944,8 +945,6 @@ def _search( raise ValueError("Budget type must be one ('epochs', 'runtime')" f" yet {budget_type} was provided") self.pipeline_options['budget_type'] = budget_type - if time_series_forecasting and budget_type is not 'epochs': - self.pipeline_options['epochs'] = 100 # Here the budget is set to max because the SMAC intensifier can be: # Hyperband: in this case the budget is determined on the fly and overwritten @@ -1341,7 +1340,8 @@ def predict( all_predictions = joblib.Parallel(n_jobs=n_jobs)( joblib.delayed(_pipeline_predict)( - models[identifier], X_test, batch_size, self._logger, STRING_TO_TASK_TYPES[self.task_type] + models[identifier], X_test, batch_size, self._logger, STRING_TO_TASK_TYPES[self.task_type], + self.time_series_forecasting ) for identifier in self.ensemble_.get_selected_model_identifiers() ) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 65135f7f0..6ec01eccf 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -340,31 +340,7 @@ def predict( target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, (used for multi-variable prediction), indicates which value needs to be predicted """ - y_pred = np.ones([len(X_test), self.dataset.n_prediction_steps]) - for seq_idx, seq in enumerate(X_test): - if self.dataset.normalize_y: - if pd.DataFrame(seq).shape[-1] > 1: - if self.target_variables is None and y_train is None: - raise ValueError( - 'For multi-variant prediction task, either target_variables or y_train needs to ' - 'be provided!') - if y_train is None: - y_train = seq[self.target_variables] - else: - y_train = seq - if self.dataset.shift_input_data: - # if input data is shifted, we must compute the 
mean and standard deviation with the shifted data. - # This is helpful when the - mean_seq = np.mean(y_train[self.dataset.n_prediction_steps:]) - std_seq = np.std(y_train[self.dataset.n_prediction_steps:]) - else: - mean_seq = np.mean(y_train) - std_seq = np.std(y_train) - - seq_pred = super(TimeSeriesForecastingTask, self).predict(seq, batch_size, n_jobs).flatten() - - seq_pred = seq_pred * std_seq + mean_seq - else: - seq_pred = super(TimeSeriesForecastingTask, self).predict(seq, batch_size, n_jobs).flatten() - y_pred[seq_idx] = seq_pred - return y_pred + flattened_res = super(TimeSeriesForecastingTask, self).predict(X_test, batch_size, n_jobs) + if self.dataset.num_target == 1: + return flattened_res.reshape([len(X_test), self.dataset.n_prediction_steps]) + return flattened_res.reshape([len(X_test), self.dataset.n_prediction_steps, self.dataset.num_target]) diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index c9ebaf835..c40e3e2f8 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -53,7 +53,7 @@ def get_budget_tracker(self, X): if 'epochs' in X: max_epochs = X['epochs'] elif X['budget_type'] in FORECASTING_BUDGET_TYPE: - max_epochs = 100 + max_epochs = 50 else: max_epochs = None return BudgetTracker( diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index f8b703a76..a6a988e4d 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -426,6 +426,7 @@ def _get_estimator_hyperparameter_name(self) -> str: """ return "time_series_forecasting" + def predict(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: """Predict the output using the selected model. @@ -442,8 +443,18 @@ def predict(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray # Pre-process X if batch_size is None: warnings.warn("Batch size not provided. 
" - "Will predict on the whole data in a single iteration") - batch_size = X.shape[0] + "Will use 1000 instead") + batch_size = 1000 loader = self.named_steps['data_loader'].get_loader(X=X, batch_size=batch_size) - return self.named_steps['network'].predict(loader, self.target_scaler) + try: + return self.named_steps['network'].predict(loader, self.target_scaler).flatten() + except Exception as e: + # https://github.com/pytorch/fairseq/blob/50a671f78d0c8de0392f924180db72ac9b41b801/fairseq/trainer.py#L283 + if 'out of memory' in str(e): + if batch_size == 1: + raise e + warnings.warn('| WARNING: ran out of memory, retrying batch') + torch.cuda.empty_cache() + batch_size = batch_size // 2 + return self.predict(X, batch_size=batch_size // 2) diff --git a/requirements.txt b/requirements.txt index 94e368180..1f22e63dc 100755 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ imgaug>=0.4.0 ConfigSpace>=0.4.14,<0.5 pynisher>=0.6.3 pyrfr>=0.7,<0.9 -smac==0.14.0 +smac dask distributed>=2.2.0 catboost From 918776ffb2e440e075bf79bb1a088deed1bef210 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 21 Jan 2022 15:20:25 +0100 Subject: [PATCH 140/347] maint --- autoPyTorch/evaluation/abstract_evaluator.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 2c4b361e4..8c310b68f 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -36,7 +36,7 @@ FORECASTING_TASKS, ) from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType -from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset +from autoPyTorch.datasets.time_series_dataset import TimeSeriesSequence from autoPyTorch.evaluation.utils import ( VotingRegressorWrapper, @@ -327,15 +327,23 @@ def fit(self, X: Dict[str, Any], y: Any, self.n_prediction_steps = X['dataset_properties']['n_prediction_steps'] return super(DummyTimeSeriesForecastingPipeline, self).fit(X, y) + def _genreate_dummy_forecasting(self, X): + if isinstance(X[0], TimeSeriesSequence): + X_tail = [x.X[-1] for x in X] + else: + # test + X_tail = [x[-1] for x in X] + return X_tail + def predict_proba(self, X: Union[np.ndarray, pd.DataFrame], batch_size: int = 1000) -> np.array: - new_X = [x.X[-1] for x in X] - return np.tile(new_X, (1, self.n_prediction_steps)).astype(np.float32) + X_tail = self._genreate_dummy_forecasting(X) + return np.tile(X_tail, (1, self.n_prediction_steps)).astype(np.float32).squeeze() def predict(self, X: Union[np.ndarray, pd.DataFrame], batch_size: int = 1000) -> np.array: - new_X = [x.X[-1]for x in X] - return np.tile(new_X, (1, self.n_prediction_steps)).astype(np.float32) + X_tail = self._genreate_dummy_forecasting(X) + return np.tile(X_tail, (1, self.n_prediction_steps)).astype(np.float32).squeeze() @staticmethod def get_default_pipeline_options() -> Dict[str, Any]: From 85709513fe40d79dad5494d5f131fff88eefb1e9 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 21 Jan 2022 17:33:47 +0100 Subject: [PATCH 141/347] allow to change decoder search space --- autoPyTorch/pipeline/base_pipeline.py | 15 +++++++++++---- .../forecasting_backbone/MLPEncoder.py | 2 +- .../forecasting_backbone/__init__.py | 7 ++++++- .../forecasting_decoder/NBEATSDecoder.py | 6 +++--- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index 
90c0f6362..1150bbca6 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -440,10 +440,17 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], # needs to be updated is in components of the # choice module elif split_hyperparameter[0] not in components.keys(): - raise ValueError("Unknown hyperparameter for choice {}. " - "Expected update hyperparameter " - "to be in {} got {}".format(node.__class__.__name__, - components.keys(), split_hyperparameter[0])) + hp_in_component = False + if hasattr(node, 'additional_components') and node.additional_components: + for component_func in node.additional_components: + if split_hyperparameter[0] in component_func().keys(): + hp_in_component = True + break + if not hp_in_component: + raise ValueError("Unknown hyperparameter for choice {}. " + "Expected update hyperparameter " + "to be in {} got {}".format(node.__class__.__name__, + components.keys(), split_hyperparameter[0])) else: # check if hyperparameter is in the search space of the component component = components[split_hyperparameter[0]] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py index 1483f2833..900a8a8bc 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py @@ -127,7 +127,7 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_groups: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_groups", - value_range=(1, 5), + value_range=(1, 10), default_value=3, ), activation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="activation", diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index d8daf652d..13885acc4 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -1,6 +1,7 @@ import os from collections import OrderedDict from typing import Dict, Optional, List, Any +from sklearn.pipeline import Pipeline import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace, Configuration @@ -65,6 +66,11 @@ def get_decoder_components(self) -> Dict[str, autoPyTorchComponent]: components.update(decoder_addons.components) return components + @property + def additional_components(self): + # This function is deigned to add additional components rather than the components in __choice__ + return [self.get_decoder_components] + def get_available_components( self, dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, @@ -346,7 +352,6 @@ def set_hyperparameters(self, self.new_params = new_params self.choice = self.get_components()[choice](**new_params) self.decoder_choice = decoder_components[decoder_type](**decoder_params) - from sklearn.pipeline import Pipeline self.pipe = Pipeline([('encoder', self.choice), ('decoder', self.decoder_choice)]) return self diff --git 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 503fba735..4fa5ca965 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -190,7 +190,7 @@ def get_hyperparameter_search_space( ), num_stacks_g: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="num_stacks_G", - value_range=(4, 32), + value_range=(2, 32), default_value=30 ), num_blocks_g: HyperparameterSearchSpace = HyperparameterSearchSpace( @@ -205,7 +205,7 @@ def get_hyperparameter_search_space( ), width_g: HyperparameterSearchSpace = HyperparameterSearchSpace( 'width_G', - value_range=(32, 512), + value_range=(16, 512), default_value=256, log=True ), @@ -226,7 +226,7 @@ def get_hyperparameter_search_space( ), width_i: HyperparameterSearchSpace = HyperparameterSearchSpace( 'width_I', - value_range=(32, 2048), + value_range=(16, 2048), default_value=512, log=True ), From 90edcfbf583f4f6a6b16ee47da736238c2096e43 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 24 Jan 2022 17:32:18 +0100 Subject: [PATCH 142/347] more resampling strategy, more options for MLP --- autoPyTorch/api/time_series_forecasting.py | 4 +- .../configs/forecasting_init_cfgs.json | 14 ++-- autoPyTorch/constants_forecasting.py | 2 - autoPyTorch/datasets/resampling_strategy.py | 72 ++++++++++++++++--- autoPyTorch/datasets/time_series_dataset.py | 69 ++++++++++-------- .../forecasting_decoder/MLPDecoder.py | 38 ++++++---- .../base_forecasting_decoder.py | 1 + .../forecasting_network_head/distribution.py | 32 +++++---- .../forecasting_head.py | 33 ++++++--- 9 files changed, 179 insertions(+), 86 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 6ec01eccf..833f9d749 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -281,8 +281,8 @@ def search( normalize_y=normalize_y, ) - if self.dataset.freq_value is not None or not self.customized_window_size: - base_window_size = int(np.ceil(self.dataset.freq_value)) + if self.dataset.base_window_size is not None or not self.customized_window_size: + base_window_size = int(np.ceil(self.dataset.base_window_size)) # we don't want base window size to large, which might cause a too long computation time, in which case # we will use n_prediction_step instead (which is normally smaller than base_window_size) if base_window_size > MAX_WINDOW_SIZE_BASE: diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index 155bf6e14..a246b5213 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -32,8 +32,9 @@ "network_backbone:MLPEncoder:use_dropout": false, "network_backbone:MLPEncoder:normalization": "NoNorm", "network_backbone:MLPDecoder:num_layers": 0, - "network_backbone:MLPDecoder:units_final_layer": 40, + "network_backbone:MLPDecoder:units_local_layer": 40, "network_backbone:MLPDecoder:auto_regressive": false, + "network_backbone:MLPDecoder:has_local_layer": true, "network:forecast_strategy": "sample", "network:aggregation": "median", "network:num_samples": 100 @@ -69,7 +70,8 @@ 
"network_backbone:RNNEncoder:decoder_type": "MLPDecoder", "network_backbone:MLPDecoder:num_layers": 0, "network_backbone:MLPDecoder:auto_regressive": false, - "network_backbone:MLPDecoder:units_final_layer": 30, + "network_backbone:MLPDecoder:has_local_layer": true, + "network_backbone:MLPDecoder:units_local_layer": 30, "network:forecast_strategy": "sample", "network:aggregation": "median", "network:num_samples": 100 @@ -89,7 +91,7 @@ "network_backbone:TCNEncoder:kernel_size_3": 3, "network_backbone:MLPDecoder:num_layers": 0, "network_backbone:MLPDecoder:auto_regressive": false, - "network_backbone:MLPDecoder:units_final_layer": 30, + "network_backbone:MLPDecoder:has_local_layer": false, "network:forecast_strategy": "sample", "network:aggregation": "median", "network:num_samples": 100 @@ -171,7 +173,6 @@ "network_backbone:NBEATSDecoder:dropout_I_2": 0.1 }, "NBEATS-G": { - "target_scaler:__choice__": "TargetNoScaler", "loss:__choice__": "RegressionLoss", "loss:RegressionLoss:loss_name": "mape", "network:net_out_type": "regression", @@ -180,13 +181,14 @@ "network_backbone:NBEATSDecoder:normalization": "NoNorm", "network_backbone:NBEATSDecoder:activation": "relu", "network_backbone:NBEATSDecoder:n_beats_type": "G", - "network_backbone:NBEATSDecoder:use_dropout_G": false, + "network_backbone:NBEATSDecoder:use_dropout_G": true, "network_backbone:NBEATSDecoder:num_stacks_G": 30, "network_backbone:NBEATSDecoder:num_blocks_G": 1, "network_backbone:NBEATSDecoder:num_layers_G": 4, "network_backbone:NBEATSDecoder:width_G": 512, "network_backbone:NBEATSDecoder:weight_sharing_G": false, - "network_backbone:NBEATSDecoder:expansion_coefficient_length_G": 32 + "network_backbone:NBEATSDecoder:expansion_coefficient_length_G": 32, + "network_backbone:NBEATSDecoder:dropout_G": 0.1 } } } \ No newline at end of file diff --git a/autoPyTorch/constants_forecasting.py b/autoPyTorch/constants_forecasting.py index e1d27f70e..3b5a355ca 100644 --- a/autoPyTorch/constants_forecasting.py +++ b/autoPyTorch/constants_forecasting.py @@ -17,6 +17,4 @@ } MAX_WINDOW_SIZE_BASE = 500 -MIN_LONG_WINDOW_SIZE = 100 -MIN_LONG_FORECASTING_HORIZON = 100 diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 2ab334b00..bd1aad599 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -32,6 +32,19 @@ def __call__(self, random_state: np.random.RandomState, val_share: float, ... +def holdout_split_forecasting(holdout: TimeSeriesSplit, indices: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + """ + A function that do holdout split without raising an error: When the target sequence is too short to be split into + training and validation set, the training set will simply ignore that and we only consider the validation set. 
+ """ + try: + train, val = list(holdout.split(indices))[-1] + except ValueError: + train = np.array([], dtype=indices.dtype) + val = [-1] + return indices[train], indices[val] + + class CrossValTypes(IntEnum): """The type of cross validation @@ -58,6 +71,7 @@ class CrossValTypes(IntEnum): stratified_shuffle_split_cross_validation = 3 shuffle_split_cross_validation = 4 time_series_cross_validation = 5 + time_series_ts_cross_validation = 6 def is_stratified(self) -> bool: stratified = [self.stratified_k_fold_cross_validation, @@ -68,9 +82,9 @@ def is_stratified(self) -> bool: class HoldoutValTypes(IntEnum): """TODO: change to enum using functools.partial""" """The type of hold out validation (refer to CrossValTypes' doc-string)""" - holdout_validation = 6 - stratified_holdout_validation = 7 - time_series_hold_out_validation = 8 + holdout_validation = 11 + stratified_holdout_validation = 12 + time_series_hold_out_validation = 13 def is_stratified(self) -> bool: stratified = [self.stratified_holdout_validation] @@ -88,7 +102,7 @@ def is_stratified(self) -> bool: 'val_share': 0.33, }, HoldoutValTypes.time_series_hold_out_validation: { - 'val_share': 0.2 + 'val_share': 0.2 }, CrossValTypes.k_fold_cross_validation: { 'num_splits': 5, @@ -102,6 +116,9 @@ def is_stratified(self) -> bool: CrossValTypes.time_series_cross_validation: { 'num_splits': 3, }, + CrossValTypes.time_series_ts_cross_validation: { + 'num_splits': 2 + } } @@ -146,12 +163,8 @@ def time_series_hold_out_validation(random_state: np.random.RandomState, # Time Series prediction only requires on set of prediction for each # This implement needs to be combined with time series forecasting dataloader, where each time an entire # time series is used for prediction - try: - cv = TimeSeriesSplit(n_splits=2, test_size=1, gap=kwargs['n_prediction_steps'] - 1) - train, val = list(cv.split(indices))[-1] - except ValueError: - train = np.array([], dtype=indices.dtype) - val = indices[-1:] + cv = TimeSeriesSplit(n_splits=2, test_size=1, gap=kwargs['n_prediction_steps'] - 1) + train, val = holdout_split_forecasting(holdout=cv, indices=indices) return train, val @classmethod @@ -191,7 +204,6 @@ def stratified_k_fold_cross_validation(random_state: np.random.RandomState, indices: np.ndarray, **kwargs: Any ) -> List[Tuple[np.ndarray, np.ndarray]]: - shuffle = kwargs.get('shuffle', True) cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state if not shuffle else None) @@ -248,6 +260,44 @@ def time_series_cross_validation(random_state: np.random.RandomState, splits = [(indices[split[0]], indices[split[1][-1:]]) for split in cv.split(indices)] return splits + @staticmethod + def time_series_ts_cross_validation(random_state: np.random.RandomState, + num_splits: int, + indices: np.ndarray, + **kwargs: Any + ) -> List[Tuple[np.ndarray, np.ndarray]]: + """ + A special sort of Time series cross validator: it could be considered as a mixture of two sorts of holdout set: + The first holdout setting: trend setting, simply consider the tail of the sequence as validation sets and the + part before as training set + The second holdout setting: seasonality setting, ensures that the distance between validation sets and test sets + is the multiple of seasonality period. 
Thus we could ensure that validation and test sets are at the same + position of the period + + Args: + indices (np.ndarray): array of indices to be split + num_splits (int): number of cross validation splits + + Returns: + splits (List[Tuple[List, List]]): list of tuples of training and validation indices + """ + n_prediction_steps = kwargs['n_prediction_steps'] + seasonality_h_value = kwargs['seasonality_h_value'] + assert seasonality_h_value >= n_prediction_steps + cv = TimeSeriesSplit(n_splits=2, test_size=1, gap=n_prediction_steps - 1) + train_t, val_t = holdout_split_forecasting(cv, indices) + splits = [(train_t, val_t)] + + train_s, val_s = holdout_split_forecasting(cv, indices[:-seasonality_h_value + n_prediction_steps]) + splits.append((train_s, val_s)) + if num_splits > 2: + freq_value = int(kwargs['freq_value']) + for i_split in range(2, num_splits): + n_tail = (i_split - 1) * freq_value + seasonality_h_value - n_prediction_steps + train_s, val_s = holdout_split_forecasting(cv, indices[:-n_tail]) + splits.append((train_s, val_s)) + return splits + @classmethod def get_cross_validators(cls, *cross_val_types: CrossValTypes) -> Dict[str, CrossValFunc]: cross_validators = { diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 90ebbe644..4dab1ee2a 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -223,11 +223,19 @@ def __init__(self, "you could pass freq with a numerical value") freq_value = SEASONALITY_MAP.get(freq, None) if isinstance(freq, list): - tmp_freq = min([freq_value for freq_value in freq if freq_value >= n_prediction_steps]) + if np.max(freq) < n_prediction_steps: + tmp_freq = n_prediction_steps + else: + tmp_freq = min([freq_value for freq_value in freq if freq_value >= n_prediction_steps]) freq_value = tmp_freq if isinstance(freq_value, list): - tmp_freq = min([freq_value_item for freq_value_item in freq_value if freq_value_item >= n_prediction_steps]) + if np.max(freq_value) < n_prediction_steps: + tmp_freq = n_prediction_steps + else: + tmp_freq = min([freq_value_item for + freq_value_item in freq_value if freq_value_item >= n_prediction_steps]) freq_value = tmp_freq + self.base_window_size = max(n_prediction_steps, freq_value) seasonality = SEASONALITY_MAP.get(freq, 1) if isinstance(seasonality, list): @@ -293,20 +301,22 @@ def __init__(self, 'num_splits', None) if resampling_strategy_args is not None: num_splits = resampling_strategy_args.get('num_split', num_splits) - while minimal_seq_length - n_prediction_steps * num_splits <= 0: - num_splits -= 1 - if num_splits >= 2: - resampling_strategy = CrossValTypes.time_series_cross_validation - if resampling_strategy_args is None: - resampling_strategy_args = {'num_splits': num_splits} + if resampling_strategy != CrossValTypes.time_series_ts_cross_validation: + while minimal_seq_length - n_prediction_steps * num_splits <= 0: + num_splits -= 1 + + if num_splits >= 2: + resampling_strategy = CrossValTypes.time_series_cross_validation + if resampling_strategy_args is None: + resampling_strategy_args = {'num_splits': num_splits} + else: + resampling_strategy_args.update({'num_splits': num_splits}) else: - resampling_strategy_args.update({'num_splits': num_splits}) - else: - warnings.warn('The dataset is not suitable for cross validation, we will apply holdout instead') + warnings.warn('The dataset is not suitable for cross validation, we will apply holdout instead') - resampling_strategy = 
HoldoutValTypes.time_series_hold_out_validation - resampling_strategy_args = None + resampling_strategy = HoldoutValTypes.time_series_hold_out_validation + resampling_strategy_args = None self.resampling_strategy = resampling_strategy self.resampling_strategy_args = resampling_strategy_args @@ -370,8 +380,17 @@ def __init__(self, self.numerical_features: List[int] = list(range(self.num_features)) self.categorical_features: List[int] = [] - self.cross_validators = CrossValFuncs.get_cross_validators(CrossValTypes.time_series_cross_validation) - self.holdout_validators = HoldOutFuncs.get_holdout_validators(HoldoutValTypes.time_series_hold_out_validation) + + if isinstance(resampling_strategy, CrossValTypes): + self.cross_validators = CrossValFuncs.get_cross_validators(resampling_strategy) + else: + self.cross_validators = CrossValFuncs.get_cross_validators(CrossValTypes.time_series_cross_validation) + if isinstance(resampling_strategy, HoldoutValTypes): + self.holdout_validators = HoldOutFuncs.get_holdout_validators(resampling_strategy) + + else: + self.holdout_validators = HoldOutFuncs.get_holdout_validators( + HoldoutValTypes.time_series_hold_out_validation) self.splits = self.get_splits_from_resampling_strategy() @@ -592,11 +611,6 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] splits.append(self.create_holdout_val_split(holdout_val_type=self.resampling_strategy, val_share=val_share)) - if self.val_tensors is not None: - upper_window_size = np.min(self.sequence_lengths_train) - self.n_prediction_steps - else: - upper_window_size = int(np.min(self.sequence_lengths_train) * 1 - val_share) - elif isinstance(self.resampling_strategy, CrossValTypes): num_splits = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( 'num_splits', None) @@ -607,13 +621,10 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] cross_val_type=self.resampling_strategy, num_splits=cast(int, num_splits), )) - upper_window_size = (np.min(self.sequence_lengths_train)) - self.n_prediction_steps * (num_splits - 1) elif self.resampling_strategy is None: splits.append(self.create_refit_split()) - upper_window_size = np.inf else: raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") - self.upper_window_size = upper_window_size return splits def get_required_dataset_info(self) -> Dict[str, Any]: @@ -635,7 +646,6 @@ def get_required_dataset_info(self) -> Dict[str, Any]: def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> Dict[str, Any]: dataset_properties = super().get_dataset_properties(dataset_requirements=dataset_requirements) dataset_properties.update({'n_prediction_steps': self.n_prediction_steps, - 'upper_window_size': self.upper_window_size, 'sp': self.seasonality, # For metric computation, 'freq': self.freq, 'sequence_lengths_train': self.sequence_lengths_train, @@ -670,8 +680,11 @@ def create_cross_val_splits( idx_start = 0 kwargs = {"n_prediction_steps": self.n_prediction_steps} + if cross_val_type == CrossValTypes.time_series_ts_cross_validation: + seasonality_h_value = int(np.round((self.n_prediction_steps // int(self.freq_value) + 1) * self.freq_value)) + kwargs.update({'seasonality_h_value': seasonality_h_value, + 'freq_value': self.freq_value}) splits = [[() for _ in range(len(self.datasets))] for _ in range(num_splits)] - idx_all = self._get_indices() for idx_seq, dataset in enumerate(self.datasets): if self.shift_input_data: @@ -733,17 +746,17 @@ def create_holdout_val_split( if 
self.shift_input_data: split = self.holdout_validators[holdout_val_type.name](self.random_state, val_share, - indices=np.arange(len(dataset)), + indices=np.arange(len(dataset)) + idx_start, **kwargs) else: split = self.holdout_validators[holdout_val_type.name](self.random_state, val_share, - indices=np.arange( + indices=idx_start + np.arange( len(dataset) - self.n_prediction_steps), **kwargs) for idx_split in range(2): - splits[idx_split][idx_seq] = idx_start + split[idx_split] + splits[idx_split][idx_seq] = split[idx_split] idx_start += self.sequence_lengths_train[idx_seq] train_indices = np.hstack([sp for sp in splits[0]]) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index 1c3c6db70..bdfe9e0a5 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional, Tuple, Union, Any from torch import nn @@ -29,12 +29,12 @@ def _build_decoder(self, layers.append(_activations[self.config["activation"]]()) in_features = self.config[f"units_layer_{i}"] num_decoder_output_features = in_features - if 'units_final_layer' in self.config: + if 'units_local_layer' in self.config: layers.append(nn.Linear(in_features=in_features, - out_features=self.config['units_final_layer'] * n_prediction_heads)) + out_features=self.config['units_local_layer'] * n_prediction_heads)) if 'activation' in self.config: layers.append(_activations[self.config["activation"]]()) - num_decoder_output_features = self.config['units_final_layer'] + num_decoder_output_features = self.config['units_local_layer'] return nn.Sequential(*layers), num_decoder_output_features @@ -49,6 +49,10 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT 'handles_time_series': True, } + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({'mlp_has_local_layer': self.config.get('has_local_layer', True)}) + return super().transform(X) + @property def fitted_encoder(self): return ['RNNEncoder', 'TCNEncoder', 'MLEncoder', 'NBEATSEncoder'] @@ -70,7 +74,10 @@ def get_hyperparameter_search_space( auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="auto_regressive", value_range=(True, False), default_value=False), - units_final_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="units_final_layer", + has_local_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='has_local_layer', + value_range=(True, False), + default_value=True), + units_local_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="units_local_layer", value_range=(16, 128), default_value=32, log=True), @@ -95,12 +102,11 @@ def get_hyperparameter_search_space( could start from 0) units_layer (HyperparameterSearchSpace): number of units of each layer (except for the last layer) activation (HyperparameterSearchSpace): activation function - units_final_layer (HyperparameterSearchSpace): number of units of final layer. 
The size of this layer is - smaller as it needs to be expanded to adapt to the number of predictions auto_regressive (HyperparameterSearchSpace): if the model acts as a DeepAR model - deepar_n_samples (HyperparameterSearchSpace) activate when auto_regressive is True, how many points to - sample when doing deepAR prediction (we note that this hyperparameters are only applied to generate new - future distribution in the future, but it does not influence the way that network makes prediction) + has_local_layer (HyperparameterSearchSpace): if local MLP layer is applied, if not, the output of the + network will be directly attached with different heads + units_local_layer (HyperparameterSearchSpace): number of units of local layer. The size of this layer is + smaller as it needs to be expanded to adapt to the number of predictions Returns: cs (ConfigurationSpace): ConfigurationSpace """ @@ -137,16 +143,18 @@ def get_hyperparameter_search_space( # So no condition is needed. If it is not a constant but a hyperparameter, # then a condition has to be made so that it accounts for the value of the # hyperparameter. - cs.add_condition(GreaterThanCondition(num_units_hp, num_layers_hp, i)) + cs.add_condition(GreaterThanCondition(num_units_hp, num_layers_hp, i - 1)) # add_hyperparameter(cs, units_final_layer, UniformIntegerHyperparameter) # TODO let dataset_properties decide if auto_regressive models is applicable auto_regressive = get_hyperparameter(auto_regressive, CategoricalHyperparameter) - units_final_layer = get_hyperparameter(units_final_layer, UniformIntegerHyperparameter) + has_local_layer = get_hyperparameter(has_local_layer, CategoricalHyperparameter) + units_local_layer = get_hyperparameter(units_local_layer, UniformIntegerHyperparameter) - cond_units_final_layer = EqualsCondition(units_final_layer, auto_regressive, False) - cs.add_hyperparameters([auto_regressive, units_final_layer]) - cs.add_condition(cond_units_final_layer) + cond_use_local_layer = EqualsCondition(has_local_layer, auto_regressive, False) + cond_units_local_layer = EqualsCondition(units_local_layer, has_local_layer, True) + cs.add_hyperparameters([auto_regressive, has_local_layer, units_local_layer]) + cs.add_conditions([cond_use_local_layer, cond_units_local_layer]) return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 1e1a6ecaa..0c9094109 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -58,6 +58,7 @@ def fitted_encoder(self): def decoder_properties(self): decoder_properties = {'has_hidden_states': False, + 'has_local_layer': True, 'recurrent': False, 'lagged_input': False, 'multi_blocks': False, diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py index f303ea7d0..48e53ec49 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py @@ -42,6 +42,7 @@ class ProjectionLayer(nn.Module): """ 
value_in_support = 0.0 + # https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/torch/modules/distribution_output.py def __init__(self, @@ -49,6 +50,7 @@ def __init__(self, output_shape: Tuple[int, ...], n_prediction_heads: int, auto_regressive: bool, + decoder_has_local_layer: bool, **kwargs, ): super().__init__(**kwargs) @@ -66,15 +68,20 @@ def build_single_proj_layer(arg_dim): Returns: """ - if auto_regressive: - unflatten_layer = [] + if decoder_has_local_layer: + if auto_regressive: + unflatten_layer = [] + else: + # we need to unflatten the input from 2D to 3D such that local MLP can be applied to each prediction + # separately + unflatten_layer = [nn.Unflatten(-1, (n_prediction_heads, num_in_features))] + return nn.Sequential(*unflatten_layer, + nn.Linear(num_in_features, np.prod(output_shape).item() * arg_dim), + nn.Unflatten(-1, (*output_shape, arg_dim))) else: - # we need to unflatten the input from 2D to 3D such that local MLP can be applied to each prediction - # separately - unflatten_layer = [nn.Unflatten(-1, (n_prediction_heads, num_in_features))] - return nn.Sequential(*unflatten_layer, - nn.Linear(num_in_features, np.prod(output_shape).item() * arg_dim), - nn.Unflatten(-1, (*output_shape, arg_dim))) + return nn.Sequential( + nn.Linear(num_in_features, n_prediction_heads * np.prod(output_shape).item() * arg_dim), + nn.Unflatten(-1, (n_prediction_heads, *output_shape, arg_dim))) self.proj = nn.ModuleList( [build_single_proj_layer(dim) for dim in self.arg_dims.values()] @@ -162,6 +169,7 @@ def dist_cls(self) -> type(Distribution): class GammaOutput(ProjectionLayer): value_in_support = 0.5 + @property def arg_dims(self) -> Dict[str, int]: return {"concentration": 1, "rate": 1} @@ -195,10 +203,10 @@ def dist_cls(self) -> type(Distribution): ALL_DISTRIBUTIONS = {'studentT': StudentTOutput, 'normal': NormalOutput, - #'beta': BetaOutput, - #'gamma': GammaOutput, - #'poisson': PoissonOutput - } # type: Dict[str, ProjectionLayer] + # 'beta': BetaOutput, + # 'gamma': GammaOutput, + # 'poisson': PoissonOutput + } # type: Dict[str, ProjectionLayer] # TODO find components that are compatible with beta, gamma and poisson distrubtion! 
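A quick shape check of the two projection variants built by build_single_proj_layer above; all sizes (num_in_features=8, n_prediction_heads=6, output_shape=(1,), arg_dim=2, batch of 4) are arbitrary toy values, not taken from the patch:

import torch
from torch import nn

num_in_features, n_prediction_heads, output_shape, arg_dim = 8, 6, (1,), 2

# decoder_has_local_layer=True and not auto_regressive: unflatten, then a linear layer
# applied to each prediction head separately
with_local = nn.Sequential(
    nn.Unflatten(-1, (n_prediction_heads, num_in_features)),
    nn.Linear(num_in_features, output_shape[0] * arg_dim),
    nn.Unflatten(-1, (*output_shape, arg_dim)),
)
# decoder_has_local_layer=False: one global linear layer whose output is reshaped into per-head arguments
without_local = nn.Sequential(
    nn.Linear(num_in_features, n_prediction_heads * output_shape[0] * arg_dim),
    nn.Unflatten(-1, (n_prediction_heads, *output_shape, arg_dim)),
)

print(with_local(torch.randn(4, n_prediction_heads * num_in_features)).shape)  # torch.Size([4, 6, 1, 2])
print(without_local(torch.randn(4, num_in_features)).shape)                    # torch.Size([4, 6, 1, 2])

Both variants end in the same (batch, n_prediction_heads, *output_shape, arg_dim) layout consumed by the distribution classes.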
diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index ca0938257..508618833 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -37,7 +37,7 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), FitRequirement('auto_regressive', (bool,), user_defined=False, dataset_property=False), FitRequirement('decoder_properties', (Dict,), user_defined=False, dataset_property=False), - FitRequirement('n_decoder_output_features', (int, ), user_defined=False, dataset_property=False), + FitRequirement('n_decoder_output_features', (int,), user_defined=False, dataset_property=False), FitRequirement('n_prediction_heads', (int,), user_defined=False, dataset_property=False), FitRequirement('output_shape', (Iterable, int), user_defined=True, dataset_property=True), ] @@ -53,7 +53,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: Self """ - self.check_requirements(X, y) output_shape = X['dataset_properties']['output_shape'] @@ -70,7 +69,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: if self.required_net_out_put_type == 'distribution': if 'dist_cls' not in X: - raise ValueError('Distribution output type must contain dist_cls!!') + raise ValueError('Distribution output type must contain dist_cls!') dist_cls = X.get('dist_cls', None) @@ -79,11 +78,14 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: head_input_shape = X["n_decoder_output_features"] n_prediction_heads = X["n_prediction_heads"] + decoder_has_local_layer = X.get('mlp_has_local_layer', True) + self.head = self.build_head( input_shape=head_input_shape, output_shape=output_shape, auto_regressive=auto_regressive, dist_cls=dist_cls, + decoder_has_local_layer=decoder_has_local_layer, n_prediction_heads=n_prediction_heads, ) return self @@ -106,11 +108,11 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: 'network_decoder': decoder}) return X - def build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], auto_regressive: bool = False, + decoder_has_local_layer: bool =True, dist_cls: Optional[str] = None, n_prediction_heads: int = 1) -> nn.Module: """ @@ -120,6 +122,7 @@ def build_head(self, input_shape (Tuple[int, ...]): shape of the input to the head (usually the shape of the backbone output) output_shape (Tuple[int, ...]): shape of the output of the head auto_regressive (bool): if the network is auto-regressive + decoder_has_local_layer (bool): if the decoder has local layer dist_cls (Optional[str]): output distribution, only works if required_net_out_put_type is 'distribution' n_prediction_heads (Dict): additional paramter for initializing architectures. 
How many heads to predict @@ -130,6 +133,7 @@ def build_head(self, input_shape=input_shape, output_shape=output_shape, auto_regressive=auto_regressive, + decoder_has_local_layer=decoder_has_local_layer, net_out_put_type=self.required_net_out_put_type, dist_cls=dist_cls, n_prediction_heads=n_prediction_heads @@ -141,6 +145,7 @@ def build_proj_layer(input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], n_prediction_heads: int, auto_regressive: bool, + decoder_has_local_layer: bool, net_out_put_type: str, dist_cls: Optional[str] = None) -> torch.nn.Module: """ @@ -151,6 +156,7 @@ def build_proj_layer(input_shape: Tuple[int, ...], output_shape (Tuple[int, ..]): deserved output shape n_prediction_heads: int, how many steps the head want to predict auto_regressive (bool): if the network is auto-regressive + decoder_has_local_layer (bool): if the decoder has local layer net_out_put_type (str), type of the loss, it determines the output of the network dist_cls (str), distribution class, only activate if output is a distribution @@ -163,20 +169,27 @@ def build_proj_layer(input_shape: Tuple[int, ...], """ if net_out_put_type == 'distribution': if dist_cls not in ALL_DISTRIBUTIONS.keys(): - raise ValueError(f'Unsupported distribution class type: {dist_cls}') + raise NotImplementedError(f'Unsupported distribution class type: {dist_cls}') proj_layer = ALL_DISTRIBUTIONS[dist_cls](num_in_features=input_shape, output_shape=output_shape[1:], n_prediction_heads=n_prediction_heads, - auto_regressive=auto_regressive) + auto_regressive=auto_regressive, + decoder_has_local_layer=decoder_has_local_layer + ) return proj_layer elif net_out_put_type == 'regression': if auto_regressive: proj_layer = nn.Sequential(nn.Linear(input_shape, np.product(output_shape[1:]))) else: - proj_layer = nn.Sequential(nn.Unflatten(-1, (n_prediction_heads, input_shape)), - nn.Linear(input_shape, np.product(output_shape[1:])), - # nn.Unflatten(-1, tuple(output_shape)), - ) + if decoder_has_local_layer: + proj_layer = nn.Sequential(nn.Unflatten(-1, (n_prediction_heads, input_shape)), + nn.Linear(input_shape, np.product(output_shape[1:])), + ) + else: + proj_layer = nn.Sequential( + nn.Linear(input_shape, n_prediction_heads * np.product(output_shape[1:])), + nn.Unflatten(-1, (n_prediction_heads, output_shape[1:])), + ) return proj_layer else: raise ValueError(f"Unsupported network type " From df7f568fe4f1966ca4974cbc2b760a6296c0a191 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 24 Jan 2022 18:55:49 +0100 Subject: [PATCH 143/347] reduced NBEATS --- .../forecasting_decoder/NBEATSDecoder.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 4fa5ca965..67076e138 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -191,16 +191,17 @@ def get_hyperparameter_search_space( num_stacks_g: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="num_stacks_G", value_range=(2, 32), - default_value=30 + default_value=30, + log=True, ), num_blocks_g: HyperparameterSearchSpace = HyperparameterSearchSpace( 'num_blocks_G', - value_range=(1, 3), + value_range=(1, 2), default_value=1 ), 
num_layers_g: HyperparameterSearchSpace = HyperparameterSearchSpace( 'num_layers_G', - value_range=(1, 5), + value_range=(1, 4), default_value=4 ), width_g: HyperparameterSearchSpace = HyperparameterSearchSpace( From 6a22e4d1b0c44d7e236a51c524c15a44b4d1eb31 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 25 Jan 2022 10:47:15 +0100 Subject: [PATCH 144/347] subsampler for val loader --- .../time_series_forecasting_data_loader.py | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 5ff9d37b9..d82c7674a 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -1,4 +1,5 @@ -from typing import Any, Dict, Optional, Tuple, Union, Sequence, List +from typing import Any, Dict, Optional, Union, Sequence, List, Iterator, Sized + from functools import partial from ConfigSpace.configuration_space import ConfigurationSpace @@ -9,7 +10,7 @@ import torch import collections -from torch.utils.data.sampler import SubsetRandomSampler +from torch.utils.data.sampler import SubsetRandomSampler, SequentialSampler from torch._six import string_classes from torch.utils.data._utils.collate import np_str_obj_array_pattern, default_collate_err_msg_format, default_collate @@ -236,6 +237,27 @@ def __len__(self): return self.num_instances +class SequentialSubSetSampler(SequentialSampler): + data_source: Sized + + def __init__(self, data_source: Sized, num_samples: int, generator: Optional[torch.Generator] = None) -> None: + super(SequentialSubSetSampler, self).__init__(data_source) + if num_samples > len(data_source): + self.num_samples = len(data_source) + else: + self.num_samples = num_samples + self.generator = generator + + def __iter__(self) -> Iterator[int]: + if self.num_samples == len(self.data_source): + return super(SequentialSubSetSampler, self).__iter__() + else: + yield from torch.randperm(len(self.data_source), generator=self.generator)[:self.num_samples] + + def __len__(self) -> int: + return self.num_samples + + class ExpandTransformTimeSeries(object): """Expand Dimensionality so tabular transformations see a 2d Array, unlike the ExpandTransform defined under tabular dataset, the dimension is expanded @@ -466,6 +488,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: pin_memory=X.get('pin_memory', True), drop_last=X.get('drop_last', False), collate_fn=partial(custom_collate_fn, x_collector=self.padding_collector), + sampler=SequentialSubSetSampler(val_dataset, int(np.sum(num_instances_per_seqs)) // 5) ) return self From 01ee1f1f58b3f1c62a6c2f0abcb5ebaa29da7d09 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 25 Jan 2022 10:56:48 +0100 Subject: [PATCH 145/347] rng for dataloader sampler --- .../time_series_forecasting_data_loader.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index d82c7674a..cec7ebaee 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ 
b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -465,9 +465,14 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: sampler_indices_train = np.arange(num_instances_dataset) + seed_train = self.random_state.randint(0, 2 ** 20) + generator_train = torch.Generator() + generator_train.manual_seed(seed_train) + self.sampler_train = TimeSeriesSampler(indices=sampler_indices_train, seq_lengths=seq_train_length, num_instances_per_seqs=num_instances_per_seqs, - min_start=min_start) + min_start=min_start, + generator=generator_train) self.train_data_loader = torch.utils.data.DataLoader( train_dataset, @@ -480,6 +485,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: sampler=self.sampler_train, ) + seed_val = self.random_state.randint(0, 2 ** 20) + generator_val = torch.Generator() + generator_val.manual_seed(seed_val) + self.val_data_loader = torch.utils.data.DataLoader( val_dataset, batch_size=min(1000, len(val_dataset)), @@ -488,7 +497,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: pin_memory=X.get('pin_memory', True), drop_last=X.get('drop_last', False), collate_fn=partial(custom_collate_fn, x_collector=self.padding_collector), - sampler=SequentialSubSetSampler(val_dataset, int(np.sum(num_instances_per_seqs)) // 5) + sampler=SequentialSubSetSampler(data_source=val_dataset, + num_samples=int(np.sum(num_instances_per_seqs)) // 5, + generator=generator_val) ) return self From 5dcdc4e5c46d830ce5272691a2e684a87247d8fe Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 26 Jan 2022 12:28:33 +0100 Subject: [PATCH 146/347] maint --- autoPyTorch/datasets/resampling_strategy.py | 10 ++++- autoPyTorch/datasets/time_series_dataset.py | 24 ++++++++++- .../forecasting_network_head/distribution.py | 4 +- .../forecasting_head.py | 2 +- .../time_series_forecasting_data_loader.py | 40 +++++++++++++++---- .../pipeline/time_series_forecasting.py | 2 +- 6 files changed, 67 insertions(+), 15 deletions(-) diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index bd1aad599..fbc885706 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -287,8 +287,14 @@ def time_series_ts_cross_validation(random_state: np.random.RandomState, cv = TimeSeriesSplit(n_splits=2, test_size=1, gap=n_prediction_steps - 1) train_t, val_t = holdout_split_forecasting(cv, indices) splits = [(train_t, val_t)] - - train_s, val_s = holdout_split_forecasting(cv, indices[:-seasonality_h_value + n_prediction_steps]) + if len(indices) < seasonality_h_value - n_prediction_steps: + if len(indices) == 1: + train_s = train_t + val_s = val_t + else: + train_s, val_s = holdout_split_forecasting(cv, indices[:-1]) + else: + train_s, val_s = holdout_split_forecasting(cv, indices[:-seasonality_h_value + n_prediction_steps]) splits.append((train_s, val_s)) if num_splits > 2: freq_value = int(kwargs['freq_value']) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 4dab1ee2a..0fae8019c 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -235,7 +235,6 @@ def __init__(self, tmp_freq = min([freq_value_item for freq_value_item in freq_value if freq_value_item >= n_prediction_steps]) freq_value = tmp_freq - self.base_window_size = max(n_prediction_steps, freq_value) seasonality = 
SEASONALITY_MAP.get(freq, 1) if isinstance(seasonality, list): @@ -317,6 +316,18 @@ def __init__(self, resampling_strategy = HoldoutValTypes.time_series_hold_out_validation resampling_strategy_args = None + else: + seasonality_h_value = int( + np.round((self.n_prediction_steps // int(self.freq_value) + 1) * self.freq_value)) + + while minimal_seq_length < (num_splits - 1) * freq_value + seasonality_h_value - n_prediction_steps: + if num_splits <= 2: + break + num_splits -= 1 + if resampling_strategy_args is None: + resampling_strategy_args = {'num_splits': num_splits} + else: + resampling_strategy_args.update({'num_splits': num_splits}) self.resampling_strategy = resampling_strategy self.resampling_strategy_args = resampling_strategy_args @@ -347,7 +358,15 @@ def __init__(self, ConcatDataset.__init__(self, datasets=sequence_datasets) - self.seq_length_min = np.min(self.sequence_lengths_train) + self.seq_length_min = int(np.min(self.sequence_lengths_train)) + self.seq_length_median = int(np.median(self.sequence_lengths_train)) + self.seq_length_max = int(np.max(self.sequence_lengths_train)) + + if max(n_prediction_steps, freq_value) > self.seq_length_median: + self.base_window_size = min(n_prediction_steps, freq_value, self.seq_length_median) + else: + self.base_window_size = max(n_prediction_steps, freq_value) + self.train_tensors = train_tensors @@ -649,6 +668,7 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> 'sp': self.seasonality, # For metric computation, 'freq': self.freq, 'sequence_lengths_train': self.sequence_lengths_train, + 'seq_length_max': self.seq_length_max, 'lagged_value': self.lagged_value}) return dataset_properties diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py index 48e53ec49..a8087f274 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py @@ -122,7 +122,7 @@ def arg_dims(self) -> Dict[str, int]: return {"loc": 1, "scale": 1} def domain_map(self, loc: torch.Tensor, scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - scale = F.softplus(scale) + scale = F.softplus(scale) + 1e-10 return loc.squeeze(-1), scale.squeeze(-1) @property @@ -137,7 +137,7 @@ def arg_dims(self) -> Dict[str, int]: def domain_map(self, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor) \ -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - scale = F.softplus(scale) + scale = F.softplus(scale) + 1e-10 df = 2.0 + F.softplus(df) return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1) diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 508618833..151c9ad3a 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -188,7 +188,7 @@ def build_proj_layer(input_shape: Tuple[int, ...], else: proj_layer = nn.Sequential( nn.Linear(input_shape, n_prediction_heads * np.product(output_shape[1:])), - nn.Unflatten(-1, (n_prediction_heads, output_shape[1:])), + nn.Unflatten(-1, (n_prediction_heads, *output_shape[1:])), ) return proj_layer else: diff --git 
a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index cec7ebaee..a52815ddc 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -1,9 +1,9 @@ from typing import Any, Dict, Optional, Union, Sequence, List, Iterator, Sized - +import warnings from functools import partial from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, CategoricalHyperparameter +from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, CategoricalHyperparameter, Constant from ConfigSpace.conditions import EqualsCondition import numpy as np @@ -243,13 +243,15 @@ class SequentialSubSetSampler(SequentialSampler): def __init__(self, data_source: Sized, num_samples: int, generator: Optional[torch.Generator] = None) -> None: super(SequentialSubSetSampler, self).__init__(data_source) if num_samples > len(data_source): + self.eval_all_sequences = True self.num_samples = len(data_source) else: + self.eval_all_sequences = False self.num_samples = num_samples self.generator = generator def __iter__(self) -> Iterator[int]: - if self.num_samples == len(self.data_source): + if self.eval_all_sequences: return super(SequentialSubSetSampler, self).__iter__() else: yield from torch.randperm(len(self.data_source), generator=self.generator)[:self.num_samples] @@ -489,6 +491,14 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: generator_val = torch.Generator() generator_val.manual_seed(seed_val) + num_samples_val = int(np.sum(num_instances_per_seqs)) // 5 + if num_samples_val > len(val_dataset): + sampler_val = None + else: + sampler_val = SequentialSubSetSampler(data_source=val_dataset, + num_samples=num_samples_val, + generator=generator_val) + self.val_data_loader = torch.utils.data.DataLoader( val_dataset, batch_size=min(1000, len(val_dataset)), @@ -497,9 +507,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: pin_memory=X.get('pin_memory', True), drop_last=X.get('drop_last', False), collate_fn=partial(custom_collate_fn, x_collector=self.padding_collector), - sampler=SequentialSubSetSampler(data_source=val_dataset, - num_samples=int(np.sum(num_instances_per_seqs)) // 5, - generator=generator_val) + sampler=sampler_val ) return self @@ -677,7 +685,25 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, add_hyperparameter(cs, num_batch_per_epoch, UniformIntegerHyperparameter) add_hyperparameter(cs, sample_strategy, CategoricalHyperparameter) - window_size = get_hyperparameter(window_size, UniformIntegerHyperparameter) + seq_length_max = dataset_properties.get('seq_length_max', np.inf) + + if seq_length_max <= window_size.value_range[1]: + if seq_length_max <= window_size.value_range[0]: + warnings.warn('The base window_size is larger than the maximal sequence length in the dataset,' + 'we simply set it as a constant value with maximal sequence length') + window_size = HyperparameterSearchSpace(hyperparameter=window_size.hyperparameter, + value_range=(seq_length_max, ), + default_value=seq_length_max) + window_size = get_hyperparameter(window_size, Constant) + else: + window_size_value_range = window_size.value_range + window_size = 
HyperparameterSearchSpace(hyperparameter='window_size', + value_range=(window_size_value_range[0], seq_length_max), + default_value=window_size_value_range[0]) + window_size = get_hyperparameter(window_size, UniformIntegerHyperparameter) + else: + window_size = get_hyperparameter(window_size, UniformIntegerHyperparameter) + backcast = get_hyperparameter(backcast, CategoricalHyperparameter) backcast_period = get_hyperparameter(backcast_period, UniformIntegerHyperparameter) diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index a6a988e4d..125fb474c 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -452,7 +452,7 @@ def predict(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray except Exception as e: # https://github.com/pytorch/fairseq/blob/50a671f78d0c8de0392f924180db72ac9b41b801/fairseq/trainer.py#L283 if 'out of memory' in str(e): - if batch_size == 1: + if batch_size <= 1: raise e warnings.warn('| WARNING: ran out of memory, retrying batch') torch.cuda.empty_cache() From f32a12bdd2032d2934ff0aa6ed63a5000db3b787 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 26 Jan 2022 12:43:59 +0100 Subject: [PATCH 147/347] remove generator as it cannot be pickled --- .../data_loader/time_series_forecasting_data_loader.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index a52815ddc..5dd65db5a 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -473,8 +473,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.sampler_train = TimeSeriesSampler(indices=sampler_indices_train, seq_lengths=seq_train_length, num_instances_per_seqs=num_instances_per_seqs, - min_start=min_start, - generator=generator_train) + min_start=min_start) self.train_data_loader = torch.utils.data.DataLoader( train_dataset, @@ -487,17 +486,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: sampler=self.sampler_train, ) - seed_val = self.random_state.randint(0, 2 ** 20) - generator_val = torch.Generator() - generator_val.manual_seed(seed_val) num_samples_val = int(np.sum(num_instances_per_seqs)) // 5 if num_samples_val > len(val_dataset): sampler_val = None else: sampler_val = SequentialSubSetSampler(data_source=val_dataset, - num_samples=num_samples_val, - generator=generator_val) + num_samples=num_samples_val) self.val_data_loader = torch.utils.data.DataLoader( val_dataset, From 344e7dff49f9db0ac1a19ae5362e14f36dd8b73d Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 28 Jan 2022 22:56:47 +0100 Subject: [PATCH 148/347] allow lower fidelity to evaluate less test instances --- autoPyTorch/api/time_series_forecasting.py | 17 ++-- autoPyTorch/datasets/time_series_dataset.py | 57 ++++++++----- autoPyTorch/evaluation/abstract_evaluator.py | 1 - ...time_series_forecasting_train_evaluator.py | 84 ++++++++++++++----- autoPyTorch/evaluation/train_evaluator.py | 4 +- autoPyTorch/optimizer/smbo.py | 11 ++- .../time_series_forecasting_data_loader.py | 5 -- 7 files changed, 118 insertions(+), 61 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py 
b/autoPyTorch/api/time_series_forecasting.py index 833f9d749..631b7f2d0 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -139,6 +139,7 @@ def search( normalize_y: bool = True, suggested_init_models: Optional[List[str]] = None, custom_init_setting_path: Optional[str] = None, + min_num_test_instances: Optional[int] = None, ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -239,13 +240,14 @@ def search( if the input data needs to be shifted normalize_y: bool if the input y values need to be normalized - train_with_log_prob: bool - if the network is trained with log_prob losses, this will create a network header that is different - from the current version. suggested_init_models: Optional[List[str]] suggested initial models with their default configurations setting custom_init_setting_path: Optional[str] path to a json file that contains the initial configuration suggested by the users + min_num_test_instances: Optional[int] + if it is set None, then full validation sets will be evaluated in each fidelity. Otherwise, the number + of instances in the test sets should be a value that is at least as great as this value, otherwise, the + number of test instance is proportional to its fidelity Returns: self @@ -255,7 +257,7 @@ def search( # we have to create a logger for at this point for the validator self._logger = self._get_logger(dataset_name) - #TODO we will only consider target variables as int here + # TODO we will only consider target variables as int here self.target_variables = target_variables # Create a validator object to make sure that the data provided by # the user matches the autopytorch requirements @@ -307,6 +309,10 @@ def search( self._metrics_kwargs = {'sp': self.dataset.seasonality, 'n_prediction_steps': n_prediction_steps} + forecasting_kwargs = dict(suggested_init_models=suggested_init_models, + custom_init_setting_path=custom_init_setting_path, + min_num_test_instances=min_num_test_instances) + return self._search( dataset=self.dataset, optimize_metric=optimize_metric, @@ -325,8 +331,7 @@ def search( load_models=load_models, portfolio_selection=portfolio_selection, time_series_forecasting=self.time_series_forecasting, - suggested_init_models=suggested_init_models, - custom_init_setting_path=custom_init_setting_path, + **forecasting_kwargs, ) def predict( diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 0fae8019c..e700bff89 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -33,7 +33,6 @@ from gluonts.time_feature.lag import get_lags_for_frequency from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP - from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ TimeSeriesTransformer @@ -41,7 +40,6 @@ from autoPyTorch.constants_forecasting import SEASONALITY_MAP from autoPyTorch.pipeline.components.training.metrics.metrics import compute_mase_coefficient - TIME_SERIES_FORECASTING_INPUT = Tuple[np.ndarray, np.ndarray] # currently only numpy arrays are supported TIME_SERIES_REGRESSION_INPUT = Tuple[np.ndarray, np.ndarray] TIME_SERIES_CLASSIFICATION_INPUT = Tuple[np.ndarray, np.ndarray] @@ -166,7 +164,11 @@ def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": 
n_prediction_steps=self.n_prediction_steps, sp=self.sp) - + def get_test_target(self, test_idx: int): + if test_idx < 0: + test_idx = self.__len__() + test_idx + Y_future = self.Y[test_idx + 1: test_idx + self.n_prediction_steps + 1] + return Y_future class TimeSeriesForecastingDataset(BaseDataset, ConcatDataset): @@ -274,9 +276,9 @@ def __init__(self, if target_variables is None: if self.num_target != 1: raise ValueError("target_variables must be specified if more the input has more than one feature value") - self.target_columns = (0, ) # to keep the output dimension unchanged + self.target_columns = (0,) # to keep the output dimension unchanged elif isinstance(target_variables, int): - self.target_columns = (target_variables, ) + self.target_columns = (target_variables,) else: self.target_columns = target_variables @@ -367,7 +369,6 @@ def __init__(self, else: self.base_window_size = max(n_prediction_steps, freq_value) - self.train_tensors = train_tensors self.test_tensors = test_tensors @@ -420,38 +421,47 @@ def __init__(self, self.lagged_value = lagged_value - def __getitem__(self, idx, train=True): + def _get_dataset_indices(self, idx: int, only_dataset_idx: bool = False) -> Union[int, Tuple[int, int]]: if idx < 0: if -idx > len(self): raise ValueError("absolute value of index should not exceed dataset length") idx = len(self) + idx dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if only_dataset_idx: + return dataset_idx if dataset_idx == 0: sample_idx = idx else: sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return dataset_idx, sample_idx + + def __getitem__(self, idx, train=True): + dataset_idx, sample_idx = self._get_dataset_indices(idx) return self.datasets[dataset_idx].__getitem__(sample_idx, train) def get_validation_set(self, idx): - if idx < 0: - if -idx > len(self): - raise ValueError("absolute value of index should not exceed dataset length") - idx = len(self) + idx - dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) - if dataset_idx == 0: - sample_idx = idx - else: - sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + dataset_idx, sample_idx = self._get_dataset_indices(idx) return self.datasets[dataset_idx].get_val_seq_set(sample_idx) def get_time_series_seq(self, idx) -> TimeSeriesSequence: - if idx < 0: - if -idx > len(self): - raise ValueError("absolute value of index should not exceed dataset length") - idx = len(self) + idx - dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + dataset_idx = self._get_dataset_indices(idx, True) return self.datasets[dataset_idx] + def get_test_target(self, test_indices: np.ndarray) -> np.ndarray: + test_indices = np.where(test_indices < 0, test_indices + len(self), test_indices) + y_test = np.ones([len(test_indices), self.n_prediction_steps, self.num_target]) + y_test_argsort = np.argsort(test_indices) + dataset_idx = self._get_dataset_indices(test_indices[y_test_argsort[0]], only_dataset_idx=True) + + for y_i in y_test_argsort: + test_idx = test_indices[y_i] + while test_idx > self.cumulative_sizes[dataset_idx]: + dataset_idx += 1 + if dataset_idx != 0: + test_idx = test_idx - self.cumulative_sizes[dataset_idx - 1] + y_test[y_i] = self.datasets[dataset_idx].get_test_target(test_idx) + return y_test.reshape([-1, self.num_target]) + def make_sequences_datasets(self, X: np.ndarray, Y: np.ndarray, @@ -710,7 +720,8 @@ def create_cross_val_splits( if self.shift_input_data: split = self.cross_validators[cross_val_type.name](self.random_state, num_splits, - indices=idx_start + 
np.arange(len(dataset)), **kwargs) + indices=idx_start + np.arange(len(dataset)), + **kwargs) else: # If the data is not shifted, we need to discard the last n_prediction_steps such that we have enough # y values @@ -805,7 +816,7 @@ def create_refit_split( if self.shift_input_data: split = [np.arange(len(dataset)), np.array([len(dataset) - 1])] else: - last_idx = len(dataset) - self.n_prediction_steps -1 + last_idx = len(dataset) - self.n_prediction_steps - 1 split = [np.arange(len(dataset) - self.n_prediction_steps), np.array([last_idx])] for idx_split in range(2): diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 8c310b68f..f704573a2 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -657,7 +657,6 @@ def _init_fit_dictionary( raise ValueError(f"budget type must be `epochs` or `runtime` or {FORECASTING_BUDGET_TYPE} " f"(Only used in forecasting taskss), but got {self.budget_type}") - def _get_pipeline(self) -> BaseEstimator: """ Implements a pipeline object based on the self.configuration attribute. diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 2a5f55864..106f1107b 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -19,6 +19,7 @@ from autoPyTorch.pipeline.components.training.metrics.metrics import MASE_LOSSES from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.utils.common import subsampler +from autoPyTorch.evaluation.abstract_evaluator import DummyTimeSeriesForecastingPipeline from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline @@ -42,7 +43,19 @@ def __init__(self, backend: Backend, queue: Queue, logger_port: Optional[int] = None, keep_models: Optional[bool] = None, all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None) -> None: + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + max_budget: float = 1.0, + min_num_test_instances: Optional[int] = None) -> None: + """ + Attributes: + max_budget (Optional[float]): + maximal budget the optimizer could allocate + min_num_test_instances: Optional[int] + minimal number of validation instances to be evaluated, if the size of the validation set is greater + than this value, then less instances from validation sets will be evaluated. 
The other predictions + will be filled with dummy predictor + + """ super(TimeSeriesForecastingTrainEvaluator, self).__init__( backend=backend, queue=queue, @@ -73,6 +86,10 @@ def __init__(self, backend: Backend, queue: Queue, seasonality = min(seasonality) # Use to calculate MASE self.seasonality = int(seasonality) + self.max_budget = max_budget + self.min_num_test_instances = min_num_test_instances + + def fit_predict_and_loss(self) -> None: """Fit, predict and compute the loss for cross-validation and holdout""" @@ -88,18 +105,10 @@ def fit_predict_and_loss(self) -> None: train_split, test_split = self.splits[split_id] - # TODO move these lines to TimeSeriesForecastingDatasets (Create a new object there that inherents from - # np.array while return multiple values by __get_item__)! - # the +1 in the end indicates that X and y are not aligned (y and x with the same index corresponds to - # the same time step). - test_split_base = test_split + np.arange(len(test_split)) * self.n_prediction_steps + 1 + self.Y_optimization = self.datamanager.get_test_target(test_split) - y_test_split = np.repeat(test_split_base, self.n_prediction_steps) + \ - np.tile(np.arange(self.n_prediction_steps), len(test_split_base)) - self.Y_optimization = self.y_train[y_test_split] - - #self.Y_actual_train = self.y_train[train_split] + # self.Y_actual_train = self.y_train[train_split] y_train_pred, y_opt_pred, y_valid_pred, y_test_pred = self._fit_and_predict(pipeline, split_id, train_indices=train_split, test_indices=test_split, @@ -113,7 +122,7 @@ def fit_predict_and_loss(self) -> None: } train_loss = None - loss = self._loss(self.y_train[y_test_split], y_opt_pred, **forecasting_kwargs) + loss = self._loss(self.Y_optimization, y_opt_pred, **forecasting_kwargs) additional_run_info = pipeline.get_additional_run_info() if hasattr( pipeline, 'get_additional_run_info') else {} @@ -158,24 +167,18 @@ def fit_predict_and_loss(self) -> None: for i, (train_split, test_split) in enumerate(self.splits): pipeline = self.pipelines[i] - test_split_base = test_split + np.arange(len(test_split)) * self.n_prediction_steps + 1 train_pred, opt_pred, valid_pred, test_pred = self._fit_and_predict(pipeline, i, train_indices=train_split, test_indices=test_split, add_pipeline_to_self=False) - #Y_train_pred[i] = train_pred + # Y_train_pred[i] = train_pred Y_optimization_pred[i] = opt_pred Y_valid_pred[i] = valid_pred Y_test_pred[i] = test_pred train_splits[i] = train_split - #self.Y_train_targets[train_split] = self.y_train[train_split] - - y_test_split = np.repeat(test_split_base, self.n_prediction_steps) + \ - np.tile(np.arange(self.n_prediction_steps), len(test_split_base)) - - self.Y_targets[i] = self.y_train[y_test_split] + self.Y_targets[i] = self.datamanager.get_test_target(test_split) # Compute train loss of this fold and store it. train_loss could # either be a scalar or a dict of scalars with metrics as keys. 
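Note (editorial illustration, not part of the patch): with min_num_test_instances set, the evaluator runs the real pipeline only on a subset of validation sequences whose size grows with the assigned budget; the remaining sequences are filled in by the dummy forecaster. A small sketch of the sizing rule (the helper name and the numbers are made up for illustration):

    import numpy as np

    def n_evaluated_instances(n_test: int, budget: float, max_budget: float, min_n: int) -> int:
        # evaluate everything for small validation sets or at full budget,
        # otherwise scale the evaluated subset linearly with the fidelity
        if n_test < min_n or budget >= max_budget:
            return n_test
        return min(n_test, max(min_n, int(n_test * budget / max_budget)))

    print(n_evaluated_instances(n_test=1000, budget=10, max_budget=50, min_n=100))  # 200
    # the evaluated instances are picked at evenly spaced positions
    print(np.linspace(0, 1000, 200, endpoint=False, dtype=int)[:5])  # [ 0  5 10 15 20]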
@@ -288,17 +291,53 @@ def generate_mase_coefficient_for_validation(self, test_split: Sequence) -> np.n mase_coefficient = np.repeat(mase_coefficient, self.n_prediction_steps, axis=0) return mase_coefficient + def create_validation_sub_set(self, test_indices: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]: + num_test_instances = len(test_indices) + + if num_test_instances < self.min_num_test_instances or self.budget >= self.max_budget: + # if the length of test indices is smaller than the required minimum or the full budget is assigned, evaluate the full validation set + return test_indices, None + num_val_instance = min(num_test_instances, + max(self.min_num_test_instances, + int(num_test_instances * self.budget / self.max_budget) + )) + test_subset_indices = np.linspace(0, num_test_instances, num_val_instance, endpoint=False, dtype=np.int) + return test_indices[test_subset_indices], test_subset_indices + def _predict(self, pipeline: BaseEstimator, train_indices: Union[np.ndarray, List], test_indices: Union[np.ndarray, List], ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: + + if self.min_num_test_instances is not None: + test_indices_subset, test_split_subset_idx = self.create_validation_sub_set(test_indices) + else: + test_indices_subset = test_indices + test_split_subset_idx = None + val_sets = [] - for test_idx in test_indices: + + for test_idx in test_indices_subset: val_sets.append(self.datamanager.get_validation_set(test_idx)) opt_pred = self.predict_function(val_sets, pipeline) opt_pred = opt_pred.reshape(-1, self.num_targets) - #TODO we consider X_valid and X_test as a multiple sequences??? + if test_split_subset_idx is not None: + dummy_pipeline = DummyTimeSeriesForecastingPipeline(0, n_prediction_steps=self.n_prediction_steps) + remaining_indices = np.setdiff1d(np.arange(len(test_indices)), test_split_subset_idx) + val_set_remain = [] + for remaining_idx in remaining_indices: + val_set_remain.append(self.datamanager.get_validation_set(test_indices[remaining_idx])) + y_opt_full = np.zeros([len(test_indices), self.n_prediction_steps, self.num_targets]) + y_opt_full[remaining_indices] = dummy_pipeline.predict(val_set_remain).reshape([-1, + self.n_prediction_steps, + self.num_targets]) + y_opt_full[test_split_subset_idx] = opt_pred.reshape([-1, self.n_prediction_steps, self.num_targets]) + opt_pred = y_opt_full + + opt_pred = opt_pred.reshape(-1, self.num_targets) + + # TODO we consider X_valid and X_test as a multiple sequences???
if self.X_valid is not None: valid_sets = [] for val_seq in enumerate(self.datamanager.datasets): @@ -320,4 +359,3 @@ def _predict(self, pipeline: BaseEstimator, test_pred = None return np.empty(1), opt_pred, valid_pred, test_pred - diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index 99a0bc448..e2351142d 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -404,6 +404,7 @@ def eval_function( search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, instance: str = None, evaluator_class: Optional[AbstractEvaluator] = None, + **evaluator_kwargs, ) -> None: """ This closure allows the communication between the ExecuteTaFuncWithQueue and the @@ -488,6 +489,7 @@ def eval_function( logger_port=logger_port, all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, - search_space_updates=search_space_updates + search_space_updates=search_space_updates, + **evaluator_kwargs ) evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 7e77bd0f9..88f618e2b 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -278,6 +278,9 @@ def __init__(self, suggested_init_models=suggested_init_models, custom_init_setting_path=custom_init_setting_path) + if self.time_series_forecasting: + self.min_num_test_instances = kwargs.get('min_num_test_instances', None) + def reset_data_manager(self) -> None: if self.datamanager is not None: del self.datamanager @@ -332,8 +335,6 @@ def run_smbo(self, func: Optional[Callable] = None pynisher_context=self.pynisher_context, ) - if self.time_series_forecasting: - ta_kwargs["evaluator_class"] = TimeSeriesForecastingTrainEvaluator ta = ExecuteTaFuncWithQueue self.logger.info("Finish creating Target Algorithm (TA) function") @@ -386,6 +387,12 @@ def run_smbo(self, func: Optional[Callable] = None self.min_budget = self.min_budget / self.max_budget self.max_budget = 1.0 + if self.time_series_forecasting: + ta_kwargs["evaluator_class"] = TimeSeriesForecastingTrainEvaluator + ta_kwargs['max_budget'] = self.max_budget + ta_kwargs['min_num_test_instances'] = self.min_num_test_instances + + if self.get_smac_object_callback is not None: smac = self.get_smac_object_callback(scenario_dict=scenario_dict, seed=seed, diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 5dd65db5a..c21c739a4 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -467,10 +467,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: sampler_indices_train = np.arange(num_instances_dataset) - seed_train = self.random_state.randint(0, 2 ** 20) - generator_train = torch.Generator() - generator_train.manual_seed(seed_train) - self.sampler_train = TimeSeriesSampler(indices=sampler_indices_train, seq_lengths=seq_train_length, num_instances_per_seqs=num_instances_per_seqs, min_start=min_start) @@ -486,7 +482,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: sampler=self.sampler_train, ) - num_samples_val = int(np.sum(num_instances_per_seqs)) // 5 if num_samples_val > len(val_dataset): sampler_val = None From 
94891f20b416e177beb857cb71ae6b80567c3b4c Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 30 Jan 2022 16:35:46 +0100 Subject: [PATCH 149/347] fix dummy forecaster issues --- autoPyTorch/evaluation/abstract_evaluator.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index f704573a2..fbd1dabb4 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -329,9 +329,8 @@ def fit(self, X: Dict[str, Any], y: Any, def _genreate_dummy_forecasting(self, X): if isinstance(X[0], TimeSeriesSequence): - X_tail = [x.X[-1] for x in X] + X_tail = [x.Y[-1 - self.n_prediction_steps] for x in X] else: - # test X_tail = [x[-1] for x in X] return X_tail @@ -342,7 +341,9 @@ def predict_proba(self, X: Union[np.ndarray, pd.DataFrame], def predict(self, X: Union[np.ndarray, pd.DataFrame], batch_size: int = 1000) -> np.array: - X_tail = self._genreate_dummy_forecasting(X) + X_tail = np.asarray(self._genreate_dummy_forecasting(X)) + if X_tail.ndim == 1: + X_tail = np.expand_dims(X_tail, -1) return np.tile(X_tail, (1, self.n_prediction_steps)).astype(np.float32).squeeze() @staticmethod From 8fc51b16f1aba071de9655ea54d0e2ff8a085edd Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 30 Jan 2022 17:43:44 +0100 Subject: [PATCH 150/347] maint --- autoPyTorch/datasets/time_series_dataset.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index e700bff89..affa6d546 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -230,6 +230,9 @@ def __init__(self, else: tmp_freq = min([freq_value for freq_value in freq if freq_value >= n_prediction_steps]) freq_value = tmp_freq + else: + freq_value = min(1, n_prediction_steps) + if isinstance(freq_value, list): if np.max(freq_value) < n_prediction_steps: tmp_freq = n_prediction_steps @@ -416,9 +419,11 @@ def __init__(self, # TODO doing experiments to give the most proper way of defining these two values if lagged_value is None: - freq = FREQUENCY_MAP[self.freq] - lagged_value = [0] + get_lags_for_frequency(freq) - + if self.freq in FREQUENCY_MAP: + freq = FREQUENCY_MAP[self.freq] + lagged_value = [0] + get_lags_for_frequency(freq) + else: + lagged_value = list(range(8)) self.lagged_value = lagged_value From 0b3468380bea81908c5d4ed61f6f27890d2aa901 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 30 Jan 2022 18:22:01 +0100 Subject: [PATCH 151/347] add gluonts as requirement --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 1f22e63dc..9d33bb7fd 100755 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,7 @@ ConfigSpace>=0.4.14,<0.5 pynisher>=0.6.3 pyrfr>=0.7,<0.9 smac gluonts dask distributed>=2.2.0 catboost From f23840bfece649e6d27513848b3888f45f048340 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 1 Feb 2022 11:54:24 +0100 Subject: [PATCH 152/347] more data for val set for larger dataset --- autoPyTorch/datasets/resampling_strategy.py | 42 ++++++++++---- autoPyTorch/datasets/time_series_dataset.py | 63 ++++++++++++++++++++- 2 files changed, 92 insertions(+), 13 deletions(-) diff --git a/autoPyTorch/datasets/resampling_strategy.py 
b/autoPyTorch/datasets/resampling_strategy.py index fbc885706..dab414ead 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -32,13 +32,15 @@ def __call__(self, random_state: np.random.RandomState, val_share: float, ... -def holdout_split_forecasting(holdout: TimeSeriesSplit, indices: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: +def holdout_split_forecasting(holdout: TimeSeriesSplit, indices: np.ndarray, n_prediction_steps: int, + n_repeat: int = 1) -> Tuple[np.ndarray, np.ndarray]: """ A function that do holdout split without raising an error: When the target sequence is too short to be split into training and validation set, the training set will simply ignore that and we only consider the validation set. """ try: train, val = list(holdout.split(indices))[-1] + val = [val[-1 - i * n_prediction_steps] for i in reversed(range(n_repeat))] except ValueError: train = np.array([], dtype=indices.dtype) val = [-1] @@ -159,17 +161,22 @@ def time_series_hold_out_validation(random_state: np.random.RandomState, Returns: """ + n_prediction_steps = kwargs['n_prediction_steps'] + n_repeat = kwargs['n_repeat'] # TODO consider how we handle test size properly # Time Series prediction only requires on set of prediction for each # This implement needs to be combined with time series forecasting dataloader, where each time an entire # time series is used for prediction - cv = TimeSeriesSplit(n_splits=2, test_size=1, gap=kwargs['n_prediction_steps'] - 1) - train, val = holdout_split_forecasting(holdout=cv, indices=indices) + cv = TimeSeriesSplit(n_splits=2, test_size=1 + n_prediction_steps * (n_repeat - 1), gap=n_prediction_steps - 1) + + train, val = holdout_split_forecasting(holdout=cv, + indices=indices, + n_prediction_steps=n_prediction_steps, + n_repeat=n_repeat) return train, val @classmethod def get_holdout_validators(cls, *holdout_val_types: HoldoutValTypes) -> Dict[str, HoldOutFunc]: - holdout_validators = { holdout_val_type.name: getattr(cls, holdout_val_type.name) for holdout_val_type in holdout_val_types @@ -256,8 +263,11 @@ def time_series_cross_validation(random_state: np.random.RandomState, """ test_size = kwargs['n_prediction_steps'] - cv = TimeSeriesSplit(n_splits=num_splits, test_size=test_size, gap=0) - splits = [(indices[split[0]], indices[split[1][-1:]]) for split in cv.split(indices)] + n_repeat = kwargs['n_repeat'] + cv = TimeSeriesSplit(n_splits=num_splits, test_size=test_size * n_repeat, gap=0) + splits = [( + indices[split[0]], + indices[split[1][[-1 - n * test_size for n in reversed(range(n_repeat))]]]) for split in cv.split(indices)] return splits @staticmethod @@ -283,24 +293,36 @@ def time_series_ts_cross_validation(random_state: np.random.RandomState, """ n_prediction_steps = kwargs['n_prediction_steps'] seasonality_h_value = kwargs['seasonality_h_value'] + n_repeat = kwargs["n_repeat"] assert seasonality_h_value >= n_prediction_steps cv = TimeSeriesSplit(n_splits=2, test_size=1, gap=n_prediction_steps - 1) - train_t, val_t = holdout_split_forecasting(cv, indices) + + train_t, val_t = holdout_split_forecasting(holdout=cv, + indices=indices, + n_prediction_steps=n_prediction_steps, + n_repeat=n_repeat) + splits = [(train_t, val_t)] if len(indices) < seasonality_h_value - n_prediction_steps: if len(indices) == 1: train_s = train_t val_s = val_t else: - train_s, val_s = holdout_split_forecasting(cv, indices[:-1]) + train_s, val_s = holdout_split_forecasting(cv, indices[:-1], + n_prediction_steps=n_prediction_steps, + 
n_repeat=n_repeat) else: - train_s, val_s = holdout_split_forecasting(cv, indices[:-seasonality_h_value + n_prediction_steps]) + train_s, val_s = holdout_split_forecasting(cv, indices[:-seasonality_h_value + n_prediction_steps], + n_prediction_steps=n_prediction_steps, + n_repeat=n_repeat) splits.append((train_s, val_s)) if num_splits > 2: freq_value = int(kwargs['freq_value']) for i_split in range(2, num_splits): n_tail = (i_split - 1) * freq_value + seasonality_h_value - n_prediction_steps - train_s, val_s = holdout_split_forecasting(cv, indices[:-n_tail]) + train_s, val_s = holdout_split_forecasting(cv, indices[:-n_tail], + n_prediction_steps=n_prediction_steps, + n_repeat=n_repeat) splits.append((train_s, val_s)) return splits diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index affa6d546..1dbd531b8 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -334,6 +334,48 @@ def __init__(self, else: resampling_strategy_args.update({'num_splits': num_splits}) + num_seqs = len(sequence_lengths) + + if resampling_strategy_args is not None and "n_repeat" in resampling_strategy_args: + n_repeat = resampling_strategy_args["n_repeat"] + else: + n_repeat = None + if (num_seqs < 100 and minimal_seq_length > 10 * n_prediction_steps) or \ + minimal_seq_length > 50 * n_prediction_steps: + if n_repeat is None: + if num_seqs < 100: + n_repeat = int(np.ceil(100.0 / num_seqs)) + else: + n_repeat = int(np.round(minimal_seq_length / (50 * n_prediction_steps))) + + if resampling_strategy == CrossValTypes.time_series_cross_validation: + n_repeat = min(n_repeat, minimal_seq_length // (5 * n_prediction_steps * num_splits)) + elif resampling_strategy == CrossValTypes.time_series_ts_cross_validation: + seasonality_h_value = int(np.round( + (self.n_prediction_steps * n_repeat // int(self.freq_value) + 1) * self.freq_value) + ) + + while minimal_seq_length // 5 < (num_splits - 1) * n_repeat * freq_value \ + + seasonality_h_value - n_repeat * n_prediction_steps: + n_repeat -= 1 + seasonality_h_value = int(np.round( + (self.n_prediction_steps * n_repeat // int(self.freq_value) + 1) * self.freq_value) + ) + elif resampling_strategy == HoldoutValTypes.time_series_hold_out_validation: + n_repeat = min(n_repeat, minimal_seq_length // (5 * n_prediction_steps ) - 1) + + else: + raise NotImplementedError("Unsupported resampling_strategy") + + if n_repeat is None: + n_repeat = 1 + n_repeat = max(n_repeat, 1) + + if resampling_strategy_args is None: + resampling_strategy_args = {'n_repeat': n_repeat} + else: + resampling_strategy_args.update({'n_repeat': n_repeat}) + self.resampling_strategy = resampling_strategy self.resampling_strategy_args = resampling_strategy_args @@ -642,18 +684,26 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] 'val_share', None) if self.resampling_strategy_args is not None: val_share = self.resampling_strategy_args.get('val_share', val_share) +
n_repeat = self.resampling_strategy_args.get("n_repeat", 1) + else: + n_repeat = 1 # Create the split if it was not created before splits.extend(self.create_cross_val_splits( cross_val_type=self.resampling_strategy, num_splits=cast(int, num_splits), + n_repeat=n_repeat )) elif self.resampling_strategy is None: splits.append(self.create_refit_split()) @@ -690,7 +740,8 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> def create_cross_val_splits( self, cross_val_type: CrossValTypes, - num_splits: int + num_splits: int, + n_repeat=1, ) -> List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]: """ This function creates the cross validation split for the given task. @@ -699,6 +750,7 @@ def create_cross_val_splits( Args: cross_val_type (CrossValTypes): num_splits (int): number of splits to be created + n_repeat (int): how many n_prediction_steps to repeat in the validation set Returns: (List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]): @@ -719,6 +771,8 @@ def create_cross_val_splits( seasonality_h_value = int(np.round((self.n_prediction_steps // int(self.freq_value) + 1) * self.freq_value)) kwargs.update({'seasonality_h_value': seasonality_h_value, 'freq_value': self.freq_value}) + kwargs["n_repeat"] = n_repeat + splits = [[() for _ in range(len(self.datasets))] for _ in range(num_splits)] for idx_seq, dataset in enumerate(self.datasets): @@ -753,6 +807,7 @@ def create_holdout_val_split( self, holdout_val_type: HoldoutValTypes, val_share: float, + n_repeat: int = 1, ) -> Tuple[np.ndarray, np.ndarray]: """ This function creates the holdout split for the given task. @@ -761,6 +816,7 @@ def create_holdout_val_split( Args: holdout_val_type (HoldoutValTypes): val_share (float): share of the validation data + n_repeat (int): how many n_prediction_steps to repeat in the validation set Returns: (Tuple[np.ndarray, np.ndarray]): Tuple containing (train_indices, val_indices) @@ -774,7 +830,8 @@ def create_holdout_val_split( raise ValueError(f"`val_share` must be between 0 and 1, got {val_share}.") if not isinstance(holdout_val_type, HoldoutValTypes): raise NotImplementedError(f'The specified `holdout_val_type` "{holdout_val_type}" is not supported.') - kwargs = {"n_prediction_steps": self.n_prediction_steps} + kwargs = {"n_prediction_steps": self.n_prediction_steps, + "n_repeat": n_repeat} splits = [[() for _ in range(len(self.datasets))] for _ in range(2)] idx_start = 0 From dc84904801869a33c39ed68def65af96c8e791b4 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 2 Feb 2022 11:41:01 +0100 Subject: [PATCH 153/347] maint --- .../data_loader/time_series_forecasting_data_loader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index c21c739a4..fb7d72b84 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -437,7 +437,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: else: _, seq_train_length = np.unique(train_split - np.arange(len(train_split)), return_counts=True) # create masks for masking - seq_idx_inactivate = np.where(self.random_state.rand(seq_train_length.size) > fraction_seq) + seq_idx_inactivate = 
np.where(self.random_state.rand(seq_train_length.size) > fraction_seq)[0] + if len(seq_idx_inactivate) == seq_train_length.size: + seq_idx_inactivate = self.random_state.choice(seq_idx_inactivate, len(seq_idx_inactivate)-1, replace=False ) # this budget will reduce the number of samples inside each sequence, e.g., the samples becomes more sparse """ From 6981553f877694e77c7c88ccc61bdb37125aa1e8 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 3 Feb 2022 09:42:18 +0100 Subject: [PATCH 154/347] maint --- autoPyTorch/datasets/time_series_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 1dbd531b8..d1b9565be 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -365,7 +365,7 @@ def __init__(self, n_repeat = min(n_repeat, minimal_seq_length // (5 * n_prediction_steps ) - 1) else: - raise NotImplementedError("Unsupported resampling_strategy") + n_repeat = 1 n_repeat = max(n_repeat, 1) if n_repeat is None: From 3fda94aed87053ac3139e5135161e06f6029a1d0 Mon Sep 17 00:00:00 2001 From: NHML23117 Date: Mon, 14 Feb 2022 19:23:37 +0100 Subject: [PATCH 155/347] fix nbeats decoder --- .../forecasting_decoder/NBEATSDecoder.py | 94 +++++++++---------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 67076e138..ae769c1b5 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -104,57 +104,57 @@ def _build_decoder(self, input_shape: Tuple[int, ...], n_prediction_heads: int, in_features = input_shape[-1] n_beats_type = self.config['n_beats_type'] if n_beats_type == 'G': - stacks = [[] for _ in range(self.config['num_stacks_G'])] - for stack_idx in range(1, self.config['num_stacks_G'] + 1): - for block_idx in range(self.config['num_blocks_G']): - if self.config['weight_sharing_G'] and block_idx > 0: + stacks = [[] for _ in range(self.config['num_stacks_g'])] + for stack_idx in range(1, self.config['num_stacks_g'] + 1): + for block_idx in range(self.config['num_blocks_g']): + if self.config['weight_sharing_g'] and block_idx > 0: # for weight sharing, we only create one instance break - ecl = self.config['expansion_coefficient_length_G'] + ecl = self.config['expansion_coefficient_length_g'] stacks[stack_idx - 1].append(NBEATSBLock(in_features, stack_idx=stack_idx, stack_type='generic', - num_blocks=self.config['num_blocks_G'], - num_layers=self.config['num_layers_G'], - width=self.config['width_G'], + num_blocks=self.config['num_blocks_g'], + num_layers=self.config['num_layers_g'], + width=self.config['width_g'], normalization=self.config['normalization'], activation=self.config['activation'], - weight_sharing=self.config['weight_sharing_G'], + weight_sharing=self.config['weight_sharing_g'], expansion_coefficient_length=ecl, - use_dropout=self.config['use_dropout_G'], - dropout_rate=self.config.get('dropout_G', None), + use_dropout=self.config['use_dropout_g'], + dropout_rate=self.config.get('dropout_g', None), )) elif n_beats_type == 'I': - stacks = [[] for _ in range(self.config['num_stacks_I'])] - 
for stack_idx in range(1, self.config['num_stacks_I'] + 1): - for block_idx in range(self.config['num_blocks_I_%d' % stack_idx]): - if self.config['weight_sharing_I_%d' % stack_idx] and block_idx > 0: + stacks = [[] for _ in range(self.config['num_stacks_i'])] + for stack_idx in range(1, self.config['num_stacks_i'] + 1): + for block_idx in range(self.config['num_blocks_i_%d' % stack_idx]): + if self.config['weight_sharing_i_%d' % stack_idx] and block_idx > 0: # for weight sharing, we only create one instance break - stack_type = self.config['stack_type_I_%d' % stack_idx] + stack_type = self.config['stack_type_i_%d' % stack_idx] if stack_type == 'generic': - ecl = self.config['expansion_coefficient_length_I_generic_%d' % stack_idx] + ecl = self.config['expansion_coefficient_length_i_generic_%d' % stack_idx] elif stack_type == 'trend': - ecl = self.config['expansion_coefficient_length_I_trend_%d' % stack_idx] + ecl = self.config['expansion_coefficient_length_i_trend_%d' % stack_idx] elif stack_type == 'seasonality': - ecl = self.config['expansion_coefficient_length_I_seasonality_%d' % stack_idx] + ecl = self.config['expansion_coefficient_length_i_seasonality_%d' % stack_idx] else: raise ValueError(f"Unsupported stack_type {stack_type}") stacks[stack_idx - 1].append(NBEATSBLock(in_features, stack_idx=stack_idx, stack_type=stack_type, - num_blocks=self.config['num_blocks_I_%d' % stack_idx], - num_layers=self.config['num_layers_I_%d' % stack_idx], - width=self.config['width_I_%d' % stack_idx], + num_blocks=self.config['num_blocks_i_%d' % stack_idx], + num_layers=self.config['num_layers_i_%d' % stack_idx], + width=self.config['width_i_%d' % stack_idx], normalization=self.config['normalization'], activation=self.config['activation'], - weight_sharing=self.config[f'weight_sharing_I_%d' % + weight_sharing=self.config[f'weight_sharing_i_%d' % stack_idx], expansion_coefficient_length=ecl, - use_dropout=self.config['use_dropout_I'], - dropout_rate=self.config.get('dropout_I_%d' % + use_dropout=self.config['use_dropout_i'], + dropout_rate=self.config.get('dropout_i_%d' % stack_idx, None), )) else: @@ -189,44 +189,44 @@ def get_hyperparameter_search_space( default_value='I' ), num_stacks_g: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter="num_stacks_G", + hyperparameter="num_stacks_g", value_range=(2, 32), default_value=30, log=True, ), num_blocks_g: HyperparameterSearchSpace = HyperparameterSearchSpace( - 'num_blocks_G', + 'num_blocks_g', value_range=(1, 2), default_value=1 ), num_layers_g: HyperparameterSearchSpace = HyperparameterSearchSpace( - 'num_layers_G', + 'num_layers_g', value_range=(1, 4), default_value=4 ), width_g: HyperparameterSearchSpace = HyperparameterSearchSpace( - 'width_G', + 'width_g', value_range=(16, 512), default_value=256, log=True ), num_stacks_i: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter="num_stacks_I", + hyperparameter="num_stacks_i", value_range=(1, 4), default_value=2 ), num_blocks_i: HyperparameterSearchSpace = HyperparameterSearchSpace( - 'num_blocks_I', + 'num_blocks_i', value_range=(1, 5), default_value=3 ), num_layers_i: HyperparameterSearchSpace = HyperparameterSearchSpace( - 'num_layers_I', + 'num_layers_i', value_range=(1, 5), default_value=3 ), width_i: HyperparameterSearchSpace = HyperparameterSearchSpace( - 'width_I', + 'width_i', value_range=(16, 2048), default_value=512, log=True @@ -335,20 +335,20 @@ def get_hyperparameter_search_space( cs.add_hyperparameter(n_beats_type) # N-BEATS-G - weight_sharing_g = 
HyperparameterSearchSpace(hyperparameter='weight_sharing_G', + weight_sharing_g = HyperparameterSearchSpace(hyperparameter='weight_sharing_g', value_range=weight_sharing.value_range, default_value=weight_sharing.default_value, log=weight_sharing.log) - use_dropout_g = HyperparameterSearchSpace(hyperparameter='use_dropout_G', + use_dropout_g = HyperparameterSearchSpace(hyperparameter='use_dropout_g', value_range=use_dropout.value_range, default_value=use_dropout.default_value, log=use_dropout.log) - dropout_g = HyperparameterSearchSpace(hyperparameter='dropout_G', + dropout_g = HyperparameterSearchSpace(hyperparameter='dropout_g', value_range=dropout.value_range, default_value=dropout.default_value, log=dropout.log) ecl_g_search_space = HyperparameterSearchSpace( - hyperparameter='expansion_coefficient_length_G', + hyperparameter='expansion_coefficient_length_g', value_range=expansion_coefficient_length_generic.value_range, default_value=expansion_coefficient_length_generic.default_value, log=expansion_coefficient_length_generic.log @@ -374,7 +374,7 @@ def get_hyperparameter_search_space( min_num_stacks_i, max_num_stacks_i = num_stacks_i.value_range - use_dropout_i = HyperparameterSearchSpace(hyperparameter='use_dropout_I', + use_dropout_i = HyperparameterSearchSpace(hyperparameter='use_dropout_i', value_range=use_dropout.value_range, default_value=use_dropout.default_value, log=use_dropout.log) @@ -388,40 +388,40 @@ def get_hyperparameter_search_space( ]) for stack_idx in range(1, int(max_num_stacks_i) + 1): - num_blocks_i_search_space = HyperparameterSearchSpace(hyperparameter='num_blocks_I_%d' % stack_idx, + num_blocks_i_search_space = HyperparameterSearchSpace(hyperparameter='num_blocks_i_%d' % stack_idx, value_range=num_blocks_i.value_range, default_value=num_blocks_i.default_value, log=num_blocks_i.log) - num_layers_i_search_space = HyperparameterSearchSpace(hyperparameter='num_layers_I_%d' % stack_idx, + num_layers_i_search_space = HyperparameterSearchSpace(hyperparameter='num_layers_i_%d' % stack_idx, value_range=num_layers_i.value_range, default_value=num_layers_i.default_value, log=num_layers_i.log) - width_i_search_space = HyperparameterSearchSpace(hyperparameter='width_I_%d' % stack_idx, + width_i_search_space = HyperparameterSearchSpace(hyperparameter='width_i_%d' % stack_idx, value_range=width_i.value_range, default_value=width_i.default_value, log=width_i.log) - weight_sharing_i_search_space = HyperparameterSearchSpace(hyperparameter='weight_sharing_I_%d' % stack_idx, + weight_sharing_i_search_space = HyperparameterSearchSpace(hyperparameter='weight_sharing_i_%d' % stack_idx, value_range=weight_sharing.value_range, default_value=weight_sharing.default_value, log=weight_sharing.log) - stack_type_i_search_space = HyperparameterSearchSpace(hyperparameter='stack_type_I_%d' % stack_idx, + stack_type_i_search_space = HyperparameterSearchSpace(hyperparameter='stack_type_i_%d' % stack_idx, value_range=stack_type.value_range, default_value=stack_type.default_value, log=stack_type.log) expansion_coefficient_length_generic_search_space = HyperparameterSearchSpace( - hyperparameter='expansion_coefficient_length_I_generic_%d' % stack_idx, + hyperparameter='expansion_coefficient_length_i_generic_%d' % stack_idx, value_range=expansion_coefficient_length_generic.value_range, default_value=expansion_coefficient_length_generic.default_value, log=expansion_coefficient_length_generic.log ) expansion_coefficient_length_seasonality_search_space = HyperparameterSearchSpace( - 
hyperparameter='expansion_coefficient_length_I_seasonality_%d' % stack_idx, + hyperparameter='expansion_coefficient_length_i_seasonality_%d' % stack_idx, value_range=expansion_coefficient_length_seasonality.value_range, default_value=expansion_coefficient_length_seasonality.default_value, log=expansion_coefficient_length_seasonality.log ) expansion_coefficient_length_trend_search_space = HyperparameterSearchSpace( - hyperparameter='expansion_coefficient_length_I_trend_%d' % stack_idx, + hyperparameter='expansion_coefficient_length_i_trend_%d' % stack_idx, value_range=expansion_coefficient_length_trend.value_range, default_value=expansion_coefficient_length_trend.default_value, log=expansion_coefficient_length_trend.log @@ -480,7 +480,7 @@ def get_hyperparameter_search_space( ] ) - dropout_search_space = HyperparameterSearchSpace(hyperparameter='dropout_I_%d' % stack_idx, + dropout_search_space = HyperparameterSearchSpace(hyperparameter='dropout_i_%d' % stack_idx, value_range=dropout.value_range, default_value=dropout.default_value, log=dropout.log) From d95e230d0ef3cfe626b3b5bf9fad5aaf115a6726 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 16 Feb 2022 10:07:09 +0100 Subject: [PATCH 156/347] new dataset interface --- autoPyTorch/api/time_series_forecasting.py | 14 +- .../data/time_series_forecasting_validator.py | 141 ++++++++------- autoPyTorch/datasets/time_series_dataset.py | 170 ++++++++++-------- .../forecasting_target_scaling/__init__.py | 20 +-- .../base_target_scaler.py | 4 - .../base_forecasting_encoder.py | 34 ++-- .../time_series_forecasting_data_loader.py | 102 +++++++---- .../pipeline/time_series_forecasting.py | 25 +-- 8 files changed, 289 insertions(+), 221 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 631b7f2d0..efc6bb349 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -336,16 +336,22 @@ def search( def predict( self, - X_test: List[np.ndarray], + X_test: Union[Optional[Union[List[np.ndarray]], pd.DataFrame], Dict]=None, batch_size: Optional[int] = None, n_jobs: int = 1, - y_train: Optional[List[np.ndarray]] = None, + past_targets: Optional[List[np.ndarray]] = None, ) -> np.ndarray: """ target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, (used for multi-variable prediction), indicates which value needs to be predicted """ + if past_targets is None: + if not isinstance(X_test, Dict) or "past_targets" not in X_test: + raise ValueError("Past Targets must be given") + else: + X_test = {"features": X_test, + "past_targets": past_targets} flattened_res = super(TimeSeriesForecastingTask, self).predict(X_test, batch_size, n_jobs) if self.dataset.num_target == 1: - return flattened_res.reshape([len(X_test), self.dataset.n_prediction_steps]) - return flattened_res.reshape([len(X_test), self.dataset.n_prediction_steps, self.dataset.num_target]) + return flattened_res.reshape([-1, self.dataset.n_prediction_steps]) + return flattened_res.reshape([-1, self.dataset.n_prediction_steps, self.dataset.num_target]) diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 8943087db..9bf89c9a8 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -2,7 +2,7 @@ # -*- encoding: utf-8 -*- import logging -import typing +from typing import Optional, Tuple, List, Union import numpy as np from 
sklearn.base import BaseEstimator @@ -11,9 +11,19 @@ from autoPyTorch.data.base_feature_validator import SUPPORTED_FEAT_TYPES from autoPyTorch.data.base_target_validator import SUPPORTED_TARGET_TYPES from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator class TimeSeriesForecastingInputValidator(TabularInputValidator): + def __init__(self, + is_classification: bool = False, + logger_port: Optional[int] = None, + ) -> None: + super(TimeSeriesForecastingInputValidator, self).__init__(is_classification, logger_port) + self._is_uni_variant = False + self.known_future_features = None + self.n_prediction_steps = 1 + """ A validator designed for a time series forecasting dataset. As a time series forecasting dataset might contain several time sequnces with @@ -21,29 +31,47 @@ class TimeSeriesForecastingInputValidator(TabularInputValidator): def fit( self, - X_train: SUPPORTED_FEAT_TYPES, + X_train: Optional[SUPPORTED_FEAT_TYPES], y_train: SUPPORTED_TARGET_TYPES, - X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None, - y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + X_test: Optional[SUPPORTED_FEAT_TYPES] = None, + y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + n_prediction_steps: int = 1, + known_future_features: Optional[List[Union[int, str]]] = None, ) -> BaseEstimator: - # Check that the data is valid - if len(X_train) != len(y_train): - raise ValueError("Inconsistent number of sequences for features and targets," - " {} for features and {} for targets".format(len(X_train), len(y_train), )) - - if X_test is not None: - if len(X_test) != len(y_test): - raise ValueError("Inconsistent number of test datapoints for features and targets," - " {} for features and {} for targets".format(len(X_test), len(y_test), )) - super().fit(X_train[0], y_train[0], X_test[0], y_test[0]) + self.n_prediction_steps = n_prediction_steps + if X_train is None: + self._is_uni_variant = True + if self._is_uni_variant: + self.feature_validator.num_features = 0 + self.feature_validator.numerical_columns = [] + self.feature_validator.categorical_columns = [] + + if y_test is not None: + self.target_validator.fit(y_train[0], y_test[0]) + else: + self.target_validator.fit(y_train[0]) + self._is_fitted = True else: - super().fit(X_train[0], y_train[0]) - - self.check_input_shapes(X_train, y_train, is_training=True) - - if X_test is not None: - self.check_input_shapes(X_test, y_test, is_training=False) - return self + self.known_future_features = known_future_features + # Check that the data is valid + if len(X_train) != len(y_train): + raise ValueError("Inconsistent number of sequences for features and targets," + " {} for features and {} for targets".format(len(X_train), len(y_train), )) + + if X_test is not None: + if len(X_test) != len(y_test): + raise ValueError("Inconsistent number of test datapoints for features and targets," + " {} for features and {} for targets".format(len(X_test), len(y_test), )) + # TODO write a feature input validator to check X_test for known_future_features + super().fit(X_train[0], y_train[0], X_test[0], y_test[0]) + else: + super().fit(X_train[0], y_train[0]) + + self.check_input_shapes(X_train, y_train, is_training=True) + + if X_test is not None: + self.check_input_shapes(X_test, y_test, is_training=False) + return self @staticmethod def get_num_features(X): @@ -69,66 +97,59 @@ def check_input_shapes(X, y, is_training: bool = True): def transform( self, - X: SUPPORTED_FEAT_TYPES, - y: 
typing.Optional[SUPPORTED_TARGET_TYPES] = None, - shift_input_data: bool = True, - n_prediction_steps: int = 1 - ) -> typing.Tuple[np.ndarray, typing.List[int], typing.Optional[np.ndarray]]: + X: Optional[SUPPORTED_FEAT_TYPES], + y: Optional[SUPPORTED_TARGET_TYPES] = None, + ) -> Tuple[Union[np.ndarray], List[int], Optional[np.ndarray]]: if not self._is_fitted: raise NotFittedError("Cannot call transform on a validator that is not fitted") - num_sequences = len(X) - sequence_lengths = [0] * num_sequences - num_features = self.feature_validator.num_features + if y is None: + raise ValueError('Targets must be given!') - if shift_input_data: - for seq_idx in range(num_sequences): - X[seq_idx] = X[seq_idx][:-n_prediction_steps] - # y[seq_idx] = y[seq_idx][n_prediction_steps:] - sequence_lengths[seq_idx] = len(X[seq_idx]) + num_sequences = len(y) + sequence_lengths = [0] * num_sequences + if self._is_uni_variant: + num_features = 0 else: - for seq_idx in range(num_sequences): - sequence_lengths[seq_idx] = len(X[seq_idx]) + if X is None: + raise ValueError('Multi Variant dataset requires X as input!') + num_features = self.feature_validator.num_features - if y is not None: - num_targets = self.target_validator.out_dimensionality + for seq_idx in range(num_sequences): + sequence_lengths[seq_idx] = len(y[seq_idx]) + sequence_lengths = np.asarray(sequence_lengths) - num_train_data = np.sum(sequence_lengths) + num_targets = self.target_validator.out_dimensionality - # a matrix that is concatenated by all the time series sequences - X_flat = np.empty([num_train_data, num_features]) - y_flat = np.empty([num_train_data + n_prediction_steps * num_sequences, num_targets]) + num_data = np.sum(sequence_lengths) - start_idx = 0 + start_idx = 0 + + if self._is_uni_variant: + y_flat = np.empty([num_data, num_targets]) for seq_idx, seq_length in enumerate(sequence_lengths): end_idx = start_idx + seq_length - X_flat[start_idx: end_idx] = np.array(X[seq_idx]).reshape([-1, num_features]) - if shift_input_data: - y_flat[ - start_idx + n_prediction_steps * seq_idx: end_idx + n_prediction_steps * (seq_idx + 1)] = np.array( - y[seq_idx]).reshape([-1, num_targets]) - else: - y_flat[start_idx: end_idx] = np.array(y[seq_idx]).reshape([-1, num_targets]) + y_flat[start_idx: end_idx] = np.array(y[seq_idx]).reshape([-1, num_targets]) start_idx = end_idx - - X_transformed = self.feature_validator.transform(X_flat) - y_transformed = self.target_validator.transform(y_flat) + y_transformed = self.target_validator.transform(y_flat) # type:np.ndarray if y_transformed.ndim == 1: y_transformed = np.expand_dims(y_transformed, -1) - return X_transformed, sequence_lengths, y_transformed - - num_train_data = np.sum(sequence_lengths) + return np.asarray([]), y_transformed, sequence_lengths # a matrix that is concatenated by all the time series sequences - X_flat = np.empty([num_train_data, num_features]) + X_flat = np.empty([num_data, num_features]) + y_flat = np.empty([num_data, num_targets]) start_idx = 0 - # TODO make it parallel with large number of sequences for seq_idx, seq_length in enumerate(sequence_lengths): end_idx = start_idx + seq_length X_flat[start_idx: end_idx] = np.array(X[seq_idx]).reshape([-1, num_features]) + y_flat[start_idx: end_idx] = np.array(y[seq_idx]).reshape([-1, num_targets]) start_idx = end_idx - X_transformed = self.feature_validator.transform(X_flat) + X_transformed = self.feature_validator.transform(X_flat) # type:np.ndarray + y_transformed = self.target_validator.transform(y_flat) # type:np.ndarray + if 
y_transformed.ndim == 1: + y_transformed = np.expand_dims(y_transformed, -1) + return X_transformed, y_transformed, sequence_lengths - return X_transformed, sequence_lengths diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 1dbd531b8..443e45cfc 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -47,14 +47,17 @@ class TimeSeriesSequence(Dataset): def __init__(self, - X: Union[np.ndarray, pd.DataFrame], + X: Optional[Union[np.ndarray, pd.DataFrame]], Y: Union[np.ndarray, pd.Series], X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, + statistic_features: Optional[np.ndarray] = None, n_prediction_steps: int = 1, sp: int = 1, + known_future_features: Optional[List[Union[str, int]]] = None, + only_has_past_targets: bool = False, ): """ A dataset representing a time series sequence. @@ -75,13 +78,20 @@ def __init__(self, self.X_test = X_test self.Y_tet = Y_test + self.statistic_features = statistic_features + # We also need to be able to transform the data, be it for pre-processing # or for augmentation self.train_transform = train_transforms self.val_transform = val_transforms self.sp = sp - - self.mase_coefficient = compute_mase_coefficient(self.X, sp=self.sp, n_prediction_steps=n_prediction_steps) + if only_has_past_targets: + self.mase_coefficient = compute_mase_coefficient(self.Y, sp=self.sp, n_prediction_steps=n_prediction_steps) + else: + self.mase_coefficient = compute_mase_coefficient(self.Y[:-n_prediction_steps], sp=self.sp, + n_prediction_steps=n_prediction_steps) + self.only_has_past_targets = only_has_past_targets + self.known_future_features = known_future_features def __getitem__(self, index: int, train: bool = True) \ -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, torch.Tensor]]]: @@ -98,34 +108,44 @@ def __getitem__(self, index: int, train: bool = True) \ """ if index < 0: index = self.__len__() + index + if self.X is not None: + if hasattr(self.X, 'loc'): + past_features = self.X.iloc[:index + 1] + else: + past_features = self.X[:index + 1] - if hasattr(self.X, 'loc'): - X = self.X.iloc[:index + 1] - else: - X = self.X[:index + 1] + if self.train_transform is not None and train: + past_features = self.train_transform(past_features) + elif self.val_transform is not None and not train: + past_features = self.val_transform(past_features) - if self.train_transform is not None and train: - X = self.train_transform(X) - elif self.val_transform is not None and not train: - X = self.val_transform(X) + if self.known_future_features is not None: + future_features = self.X[index + 1: index + self.n_prediction_steps + 1, self.known_future_features] + else: + future_features = None + else: + past_features = None + future_features = None # In case of prediction, the targets are not provided - Y = self.Y - if Y is not None: - # Y = Y[:index + self.n_prediction_steps] - # Y = Y[index + 1: index + self.n_prediction_steps + 1] - Y_future = Y[index + 1: index + self.n_prediction_steps + 1] - - Y_future = torch.from_numpy(Y_future) - # Y_Past does not need to be fed to the network, we keep it as np array + targets = self.Y + if self.only_has_past_targets: + targets_future = None else: - Y_future = None + targets_future = targets[index + 1: index + self.n_prediction_steps + 1] + 
targets_future = torch.from_numpy(targets_future) + + past_target = targets[:index + 1] + past_target = torch.from_numpy(past_target) - return {"past_target": torch.from_numpy(X), - "mase_coefficient": self.mase_coefficient}, Y_future + return {"past_target": past_target, + "past_features": past_features, + "future_features": future_features, + "statistic_features": self.statistic_features, + "mase_coefficient": self.mase_coefficient}, targets_future def __len__(self) -> int: - return self.X.shape[0] + return self.Y.shape[0] if self.only_has_past_targets else self.Y.shape[0] - self.n_prediction_steps def update_transform(self, transform: Optional[torchvision.transforms.Compose], train: bool = True, @@ -152,19 +172,29 @@ def update_transform(self, transform: Optional[torchvision.transforms.Compose], return self def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": + if self.only_has_past_targets: + raise ValueError("get_val_seq_set is not supported for the sequence that only has past targets!") if index < 0: index = self.__len__() + index if index == self.__len__() - 1: return copy.copy(self) else: - return TimeSeriesSequence(self.X[:index + 1], + if self.X is not None: + X = self.X[:index + 1 + self.n_prediction_steps] + else: + X = None + return TimeSeriesSequence(X, self.Y[:index + 1 + self.n_prediction_steps], train_transforms=self.train_transform, val_transforms=self.val_transform, n_prediction_steps=self.n_prediction_steps, + statistic_features=self.statistic_features, + known_future_features=self.known_future_features, sp=self.sp) def get_test_target(self, test_idx: int): + if self.only_has_past_targets: + raise ValueError("get_test_target is not supported for the sequence that only has past targets!") if test_idx < 0: test_idx = self.__len__() + test_idx Y_future = self.Y[test_idx + 1: test_idx + self.n_prediction_steps + 1] @@ -176,11 +206,11 @@ class TimeSeriesForecastingDataset(BaseDataset, ConcatDataset): cumulative_sizes: List[int] def __init__(self, - X: Union[np.ndarray, List[List]], + X: Optional[Union[np.ndarray, List[List]]], Y: Union[np.ndarray, pd.Series], X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, - target_variables: Optional[Union[Tuple[int], int]] = None, + known_future_features: Optional[Union[Tuple[int], int]] = None, freq: Optional[Union[str, int, List[int]]] = None, resampling_strategy: Optional[Union[ CrossValTypes, HoldoutValTypes]] = HoldoutValTypes.time_series_hold_out_validation, @@ -195,6 +225,7 @@ def __init__(self, dataset_name: Optional[str] = None, shift_input_data: bool = True, normalize_y: bool = True, + statistic_features: Optional[np.ndarray] = None, ): """ :param target_variables: Optional[Union[Tuple[int], int]] used for multi-variant forecasting @@ -211,6 +242,7 @@ def __init__(self, if y values needs to be normalized with mean 0 and variance 1 if the dataset is trained with log_prob losses, this needs to be specified in the very beginning such that the header's configspace can be built beforehand. + :param statistic_features: statistic features, invariant across different """ assert X is not Y, "Training and Test data needs to belong two different object!!!" 
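For clarity, a minimal sketch of the flattening that the validator's transform (shown earlier in this commit) performs in the uni-variant case; the example sequences and their lengths below are hypothetical and only illustrate the shape handling, they are not part of the patch:

import numpy as np

# three univariate target sequences of different lengths (hypothetical data)
y_sequences = [np.arange(10.0), np.arange(7.0), np.arange(12.0)]
sequence_lengths = np.asarray([len(seq) for seq in y_sequences])  # [10, 7, 12]
num_targets = 1

# concatenate all sequences into one flat target matrix, as transform() does
y_flat = np.empty([sequence_lengths.sum(), num_targets])
start_idx = 0
for seq in y_sequences:
    end_idx = start_idx + len(seq)
    y_flat[start_idx:end_idx] = np.asarray(seq).reshape(-1, num_targets)
    start_idx = end_idx

# each sequence can later be recovered from the cumulative lengths
splits = np.split(y_flat, np.cumsum(sequence_lengths)[:-1])
assert all(len(s) == n for s, n in zip(splits, sequence_lengths))
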
@@ -264,7 +296,10 @@ def __init__(self, f"but receive {type(validator)}") if not self.validator._is_fitted: - self.validator.fit(X_train=X, y_train=Y, X_test=X_test, y_test=Y_test, ) + self.validator.fit(X_train=X, y_train=Y, X_test=X_test, y_test=Y_test, + n_prediction_steps=n_prediction_steps) + + self._is_uni_variant = self.validator._is_uni_variant self.numerical_columns = self.validator.feature_validator.numerical_columns self.categorical_columns = self.validator.feature_validator.categorical_columns @@ -275,31 +310,17 @@ def __init__(self, self.categories = self.validator.feature_validator.categories self.shift_input_data = shift_input_data - self.target_variables = target_variables - if target_variables is None: - if self.num_target != 1: - raise ValueError("target_variables must be specified if more the input has more than one feature value") - self.target_columns = (0,) # to keep the output dimension unchanged - elif isinstance(target_variables, int): - self.target_columns = (target_variables,) - else: - self.target_columns = target_variables - X, sequence_lengths, Y = self.validator.transform(X, Y, - shift_input_data=shift_input_data, - n_prediction_steps=n_prediction_steps) + X, Y, sequence_lengths = self.validator.transform(X, Y) if X_test is not None: - X_test, self.sequence_lengths_tests, Y_test = self.validator.transform(X_test, Y_test, - shift_input_data=shift_input_data, - n_prediction_steps=n_prediction_steps - ) + X_test, Y_test, self.sequence_lengths_tests = self.validator.transform(X_test, Y_test) else: self.sequence_lengths_tests = None self.shuffle = shuffle self.random_state = np.random.RandomState(seed=seed) - minimal_seq_length = np.min(sequence_lengths) + minimal_seq_length = np.min(sequence_lengths) - n_prediction_steps if isinstance(resampling_strategy, CrossValTypes): num_splits = DEFAULT_RESAMPLING_PARAMETERS[resampling_strategy].get( 'num_splits', None) @@ -362,10 +383,10 @@ def __init__(self, (self.n_prediction_steps * n_repeat // int(self.freq_value) + 1) * self.freq_value) ) elif resampling_strategy == HoldoutValTypes.time_series_hold_out_validation: - n_repeat = min(n_repeat, minimal_seq_length // (5 * n_prediction_steps ) - 1) + n_repeat = min(n_repeat, minimal_seq_length // (5 * n_prediction_steps) - 1) else: - raise NotImplementedError("Unsupported resampling_strategy") + n_repeat = 1 n_repeat = max(n_repeat, 1) if n_repeat is None: @@ -384,14 +405,16 @@ def __init__(self, self.train_transform = train_transforms self.val_transform = val_transforms - self.num_sequences = len(X) - self.sequence_lengths_train = sequence_lengths + self.num_sequences = len(Y) + self.sequence_lengths_train = np.asarray(sequence_lengths) - n_prediction_steps # initialize datasets sequences_kwargs = {"train_transforms": self.train_transform, "val_transforms": self.val_transform, "n_prediction_steps": n_prediction_steps, - "sp": self.seasonality} + "sp": self.seasonality, + "known_future_features": known_future_features, + "statistic_features": statistic_features} self.y_train_mean = [0] * len(self.sequence_lengths_train) self.y_train_std = [1] * len(self.sequence_lengths_train) @@ -404,6 +427,8 @@ def __init__(self, self.normalize_y = normalize_y ConcatDataset.__init__(self, datasets=sequence_datasets) + self.known_future_features = known_future_features + self.statistic_features = statistic_features self.seq_length_min = int(np.min(self.sequence_lengths_train)) self.seq_length_median = int(np.median(self.sequence_lengths_train)) @@ -435,7 +460,7 @@ def __init__(self, # 
self.output_shape = len(np.unique(Y)) else: # self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1 - num_target = X.shape[-1] if X.ndim > 1 else 1 + num_target = Y.shape[-1] if Y.ndim > 1 else 1 self.output_shape = [self.n_prediction_steps, num_target] # TODO: Look for a criteria to define small enough to preprocess @@ -443,8 +468,8 @@ def __init__(self, self.task_type = TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING] - self.numerical_features: List[int] = list(range(self.num_features)) - self.categorical_features: List[int] = [] + self.numerical_features: List[int] = self.numerical_columns + self.categorical_features: List[int] = self.categorical_columns if isinstance(resampling_strategy, CrossValTypes): self.cross_validators = CrossValFuncs.get_cross_validators(resampling_strategy) @@ -551,53 +576,44 @@ def make_sequences_datasets(self, idx_start_train = 0 idx_start_test = 0 - for seq_idx, seq_length_train in enumerate(self.sequence_lengths_train): + seq_length_train_flat = self.sequence_lengths_train + self.n_prediction_steps + + for seq_idx, seq_length_train in enumerate(seq_length_train_flat): idx_end_train = idx_start_train + seq_length_train X_seq = X[idx_start_train: idx_end_train] - if self.shift_input_data: - Y_seq = Y[idx_start_train + seq_idx * self.n_prediction_steps: - idx_end_train + (1 + seq_idx) * self.n_prediction_steps] - else: - Y_seq = Y[idx_start_train: idx_end_train] + Y_seq = Y[idx_start_train: idx_end_train] if normalize_y: Y_seq_mean = np.mean(Y_seq) Y_seq_std = np.std(Y_seq) Y_seq = (Y_seq - Y_seq_mean) / Y_seq_std - if self.shift_input_data: - Y[idx_start_train + seq_idx * self.n_prediction_steps: - idx_end_train + (1 + seq_idx) * self.n_prediction_steps] = Y_seq - else: - Y[idx_start_train: idx_end_train] = Y_seq + Y[idx_start_train: idx_end_train] = Y_seq if X_test is not None and Y_test is not None: seq_length_test = self.sequence_lengths_tests[seq_idx] idx_end_test = idx_start_test + seq_length_test X_test_seq = X_test[idx_start_test: idx_end_test] - if self.shift_input_data: - Y_test_seq = Y[idx_start_test + seq_idx * self.n_prediction_steps: - idx_end_test + (1 + seq_idx) * self.n_prediction_steps] - else: - Y_test_seq = Y_test[idx_start_test: idx_end_test] + Y_test_seq = Y_test[idx_start_test: idx_end_test] if normalize_y: Y_test_seq_mean = np.mean(Y_test_seq) Y_test_seq_std = np.std(Y_test_seq) Y_seq = (Y_seq - Y_test_seq_mean) / Y_test_seq_std - if self.shift_input_data: - Y_test[idx_start_test + seq_idx * self.n_prediction_steps: - idx_end_test + (1 + seq_idx) * self.n_prediction_steps] = Y_seq - else: - Y_test[idx_start_test: idx_end_test] = Y_seq + + Y_test[idx_start_test: idx_end_test] = Y_seq else: X_test_seq = None Y_test_seq = None + if not X_seq: + X_seq = None + X_test_seq = None + sequence = TimeSeriesSequence(X=X_seq, Y=Y_seq, X_test=X_test_seq, @@ -615,7 +631,7 @@ def make_sequences_datasets(self, # Y_test_seq_all.append(Y_test_seq) # train_tensors = (X_seq_all, Y_seq_all) train_tensors = (X, Y) - if X_test is None or Y_test is None: + if Y_test is None: test_tensors = None else: # test_tensors = (X_test_seq_all, Y_test_seq_all) @@ -625,7 +641,7 @@ def make_sequences_datasets(self, def replace_data(self, X_train: BaseDatasetInputType, X_test: Optional[BaseDatasetInputType]) -> 'BaseDataset': super(TimeSeriesForecastingDataset, self).replace_data(X_train=X_train, X_test=X_test) - self.update_tensros_seqs(X_train, self.sequence_lengths_train, is_train=True) + self.update_tensros_seqs(X_train, 
self.sequence_lengths_train + self.n_prediction_steps, is_train=True) if X_test is not None: self.update_tensros_seqs(X_test, self.sequence_lengths_tests, is_train=False) return self @@ -723,7 +739,6 @@ def get_required_dataset_info(self) -> Dict[str, Any]: 'numerical_columns': self.numerical_columns, 'categorical_columns': self.categorical_columns, 'categories': self.categories, - 'target_columns': self.target_columns }) return info @@ -734,7 +749,8 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> 'freq': self.freq, 'sequence_lengths_train': self.sequence_lengths_train, 'seq_length_max': self.seq_length_max, - 'lagged_value': self.lagged_value}) + 'lagged_value': self.lagged_value, + 'uni_variant': self._is_uni_variant}) return dataset_properties def create_cross_val_splits( diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/__init__.py index 83e0c9e7a..1dde467aa 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/__init__.py @@ -67,25 +67,17 @@ def get_hyperparameter_search_space(self, raise ValueError("no scalers found, please add a scaler") if default is None: - defaults = ['StandardScaler', 'MinMaxScaler', 'MaxAbsScaler', 'NoScaler'] + defaults = ['TargetStandardScaler', 'TargetMinMaxScaler', 'TargetMaxAbsScaler', 'TargetNoScaler'] for default_ in defaults: if default_ in available_scalers: default = default_ break # add only no scaler to choice hyperparameters in case the dataset is only categorical - if len(dataset_properties['numerical_features']) == 0: - default = 'NoScaler' - if include is not None and default not in include: - raise ValueError("Provided {} in include, however, " - "the dataset is incompatible with it".format(include)) - preprocessor = CSH.CategoricalHyperparameter('__choice__', - ['NoScaler'], - default_value=default) - else: - preprocessor = CSH.CategoricalHyperparameter('__choice__', - list(available_scalers.keys()), - default_value=default) + + preprocessor = CSH.CategoricalHyperparameter('__choice__', + list(available_scalers.keys()), + default_value=default) cs.add_hyperparameter(preprocessor) # add only child hyperparameters of early_preprocessor choices @@ -111,5 +103,3 @@ def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None: """ super()._check_dataset_properties(dataset_properties) - assert "target_columns" in dataset_properties, \ - "Dataset properties must contain information about the target_columns" diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py index a50e6f5a0..0e37c488e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py @@ -21,9 +21,6 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N super().__init__() self.random_state = random_state self.preprocessor: Optional[Pipeline] = None - 
self.add_fit_requirements([ - FitRequirement('target_columns', (Tuple,), user_defined=True, dataset_property=True), - ]) def fit(self, X: Dict[str, Any], y: Any = None) -> "BaseBatchScaler": """ @@ -36,7 +33,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "BaseBatchScaler": "TabularColumnTransformer": an instance of self """ self.check_requirements(X, y) - self.target_columns = X['dataset_properties']['target_columns'] self.scaler = TargetScaler(mode=self.scaler_mode) return self diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/base_forecasting_encoder.py index 755a68941..af38b320d 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/base_forecasting_encoder.py @@ -59,27 +59,39 @@ def __init__(self, @property def _required_fit_arguments(self) -> List[FitRequirement]: return [ - FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), - FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, + FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), + FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, dataset_property=False), - FitRequirement('time_series_transformer', (BaseEstimator,), user_defined=False, dataset_property=False), - FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), - ] + FitRequirement('y_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, + dataset_property=False), + FitRequirement('uni_variant', (bool, ), user_defined=False, dataset_property=True), + FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), + FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), + ] def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.check_requirements(X, y) X_train = X['X_train'] + y_train = X['y_train'] input_shape = X["dataset_properties"]['input_shape'] - - if not X["dataset_properties"]["is_small_preprocess"]: - # get input shape by transforming first two elements of the training set - transforms = torchvision.transforms.Compose(X['preprocess_transforms']) - X_train = X_train[:1, np.newaxis, ...] - input_shape = transforms(X_train).shape[1:] + output_shape = X["dataset_properties"]['output_shape'] + + if X["dataset_properties"]["uni_variant"]: + if not X["dataset_properties"]["is_small_preprocess"]: + # get input shape by transforming first two elements of the training set + transforms = torchvision.transforms.Compose(X['preprocess_transforms']) + X_train = X_train[:1, np.newaxis, ...] + y_train = y_train[:1, np.newaxis, ...] + X_train = transforms(X_train) + input_shape = np.concatenate(X_train, y_train).shape[1:] + else: + y_train = y_train[:1, np.newaxis, ...] 
+ input_shape = y_train.shape[1:] if 'network_embedding' in X.keys(): input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) + input_shape = (*input_shape[:-1], input_shape[-1] + output_shape[-1]) self.input_shape = input_shape self.encoder = self.build_encoder( diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index c21c739a4..7b53495af 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -115,8 +115,10 @@ def __call__(self, batch, padding_value=0.0): elif isinstance(elem, string_classes): return batch elif isinstance(elem, collections.abc.Mapping): - return {key: self([d[key] for d in batch]) if key != "past_target" + return {key: self([d[key] for d in batch]) if "past" not in key else self([d[key] for d in batch], self.target_padding_value) for key in elem} + elif elem is None: + return None raise TypeError(f"Unsupported data type {elem_type}") @@ -355,6 +357,9 @@ def __init__(self, self.num_batches_per_epoch = num_batches_per_epoch if num_batches_per_epoch is not None else np.inf self.padding_collector = None + self.statistic_features = None + self.known_future_features = None + def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: """ Fits a component by using an input dictionary with pre-requisites @@ -375,6 +380,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: if self.backcast: self.window_size = self.backcast_period * self.n_prediction_steps + self.statistic_features = datamanager.statistic_features + self.known_future_features = datamanager.known_future_features + # this value corresponds to budget type resolution sample_interval = X.get('sample_interval', 1) padding_value = X.get('required_padding_value', 0.0) @@ -421,13 +429,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: num_instances_dataset = np.size(train_split) num_instances_train = self.num_batches_per_epoch * self.batch_size - # get the length of each sequence of training data (after split) - # as we already know that the elements in 'train_split' increases consecutively with a certain number of - # discontinuity where a new sequence is sampled: [0, 1, 2 ,3, 7 ,8 ]. + # get the length of each sequence of training data (after split), as we know that validation sets are always + # place on the tail of the series, the discontinuity only happens if a new series is concated. + # for instance, if we have a train indices is experssed as [0, 1, 2 ,3, 7 ,8 ]. # A new sequence must start from the index 7. We could then split each unique values to represent the length # of each split - # TODO consider min_starrt as a hp (multiple of self.n_prediction_steps?) + # TODO consider min_start as a hp (multiple of self.n_prediction_steps?) 
min_start = self.n_prediction_steps dataset_seq_length_train_all = X['dataset_properties']['sequence_lengths_train'] @@ -529,7 +537,8 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform candidate_transformations.append(ExpandTransformTimeSeries()) if "test" in mode or not X['dataset_properties']['is_small_preprocess']: - candidate_transformations.extend(X['preprocess_transforms']) + if "preprocess_transforms" in X: + candidate_transformations.extend(X['preprocess_transforms']) # We transform to tensor under dataset return torchvision.transforms.Compose(candidate_transformations) @@ -542,44 +551,59 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd applying the transformations meant to validation objects This is a lazy loaded test set, each time only one piece of series """ - # TODO more supported inputs - if isinstance(X, (np.ndarray, torch.Tensor)): - if isinstance(X, torch.Tensor): - X = X.numpy() - if X.ndim == 1: - X = [X] if isinstance(X, TimeSeriesSequence): X.update_transform(self.test_transform, train=False) dataset = [X] - elif isinstance(X, Sequence): - dataset = [] - if isinstance(X[0], TimeSeriesSequence): - for X_seq in X: - X_seq.update_transform(self.test_transform, train=False) - dataset.append(X_seq) - else: - if y is None: + else: + if y is None: + # TODO consider other circumstances! + y = X['past_targets'] + X = X['features'] + + + + # TODO more supported inputs + if isinstance(X, (np.ndarray, torch.Tensor)): + if isinstance(X, torch.Tensor): + X = X.numpy() + if X.ndim == 1: + X = [X] + + if isinstance(X, Sequence): + dataset = [] + if isinstance(X[0], TimeSeriesSequence): for X_seq in X: - seq = TimeSeriesSequence( - X=X_seq, Y=y, - # This dataset is used for loading test data in a batched format - train_transforms=self.test_transform, - val_transforms=self.test_transform, - n_prediction_steps=0, - ) - dataset.append(seq) + X_seq.update_transform(self.test_transform, train=False) + dataset.append(X_seq) else: - for X_seq, y_seq in zip(X, y): - seq = TimeSeriesSequence( - X=X_seq, Y=y_seq, - # This dataset is used for loading test data in a batched format - train_transforms=self.test_transform, - val_transforms=self.test_transform, - n_prediction_steps=0, - ) - dataset.append(seq) - else: - raise NotImplementedError(f"Unsupported type of input X: {type(X)}") + if y is None: + for X_seq in X: + seq = TimeSeriesSequence( + X=X_seq, Y=y, + # This dataset is used for loading test data in a batched format + train_transforms=self.test_transform, + val_transforms=self.test_transform, + n_prediction_steps=0, + statistic_features=self.statistic_features, + known_future_features=self.known_future_features, + only_has_past_targets=True, + ) + dataset.append(seq) + else: + for X_seq, y_seq in zip(X, y): + seq = TimeSeriesSequence( + X=X_seq, Y=y_seq, + # This dataset is used for loading test data in a batched format + train_transforms=self.test_transform, + val_transforms=self.test_transform, + n_prediction_steps=0, + statistic_features=self.statistic_features, + known_future_features=self.known_future_features, + only_has_past_targets=True, + ) + dataset.append(seq) + else: + raise NotImplementedError(f"Unsupported type of input X: {type(X)}") dataset_test = TestSequenceDataset(dataset, train=False) diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 125fb474c..576b445c0 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ 
b/autoPyTorch/pipeline/time_series_forecasting.py @@ -6,6 +6,7 @@ from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause, ForbiddenInClause import numpy as np +import pandas as pd from sklearn.base import RegressorMixin from sklearn.pipeline import Pipeline @@ -21,6 +22,7 @@ TimeSeriesTransformer ) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent @@ -76,9 +78,9 @@ def __init__(self, init_params: Optional[Dict[str, Any]] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, ): - super().__init__( - config, steps, dataset_properties, include, exclude, - random_state, init_params, search_space_updates) + BasePipeline.__init__(self, + config, steps, dataset_properties, include, exclude, + random_state, init_params, search_space_updates) self.target_scaler = None @@ -280,7 +282,6 @@ def _get_hyperparameter_search_space(self, forbidden_backcast = ForbiddenEqualsClause(data_loader_backcast, True) forbidden_backcast_false = ForbiddenEqualsClause(data_loader_backcast, False) - # Ensure that NBEATS encoder only works with NBEATS decoder if 'NBEATSEncoder' in network_encoder_hp.choices: forbidden_NBEATS.append(ForbiddenAndConjunction( @@ -337,7 +338,6 @@ def _get_hyperparameter_search_space(self, cs.get_hyperparameter_names() """ - self.configuration_space = cs self.dataset_properties = dataset_properties return cs @@ -358,13 +358,17 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L default_dataset_properties = {'target_type': 'time_series_prediction'} if dataset_properties is not None: default_dataset_properties.update(dataset_properties) + + if not default_dataset_properties.get("uni_variant", False): + steps.extend([("preprocessing", EarlyPreprocessing(random_state=self.random_state)), + ("imputer", SimpleImputer(random_state=self.random_state)), + ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), + ("time_series_transformer", TimeSeriesTransformer(random_state=self.random_state)), + ]) + # TODO consider the correct way of doing imputer for time series forecasting tasks. steps.extend([ ('loss', ForecastingLossChoices(default_dataset_properties, random_state=self.random_state)), - ("imputer", SimpleImputer(random_state=self.random_state)), - # ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), - ("time_series_transformer", TimeSeriesTransformer(random_state=self.random_state)), - ("preprocessing", EarlyPreprocessing(random_state=self.random_state)), ("target_scaler", TargetScalerChoice(default_dataset_properties, random_state=self.random_state)), ("data_loader", TimeSeriesForecastingDataLoader(random_state=self.random_state)), @@ -426,8 +430,7 @@ def _get_estimator_hyperparameter_name(self) -> str: """ return "time_series_forecasting" - - def predict(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: + def predict(self, X: Union[Dict[str, np.ndarray], pd.DataFrame], batch_size: Optional[int] = None) -> np.ndarray: """Predict the output using the selected model. 
Args: From d5459fa3a71f42af7a81b273689a317bd8aae260 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 16 Feb 2022 12:18:51 +0100 Subject: [PATCH 157/347] resolve conflict --- .../setup/network/forecasting_network.py | 245 +++++++------ .../time_series_forecasting_data_loader.py | 344 ++---------------- .../training/data_loader/time_series_util.py | 267 ++++++++++++++ .../pipeline/time_series_forecasting.py | 2 + 4 files changed, 430 insertions(+), 428 deletions(-) create mode 100644 autoPyTorch/pipeline/components/training/data_loader/time_series_util.py diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index bf6bbe8c0..5aeec5326 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -261,28 +261,28 @@ def scale_value(self, return outputs def forward(self, - targets_past: torch.Tensor, - targets_future: Optional[torch.Tensor] = None, - features_past: Optional[torch.Tensor] = None, - features_future: Optional[torch.Tensor] = None, - features_static: Optional[torch.Tensor] = None, + past_targets: torch.Tensor, + future_targets: Optional[torch.Tensor] = None, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): # TODO We need to replace thus None with empty tensors to avoid checking if they are None every time! if self.encoder_lagged_input: - targets_past[:, -self.window_size:], _, loc, scale = self.target_scaler(targets_past[:, -self.window_size:]) - targets_past[:, :-self.window_size] = self.scale_value(targets_past[:, :-self.window_size], loc, scale) - x_past, self.cached_lag_mask_encoder = get_lagged_subsequences(targets_past, + past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler(past_targets[:, -self.window_size:]) + past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) + x_past, self.cached_lag_mask_encoder = get_lagged_subsequences(past_targets, self.window_size, self.encoder.lagged_value, self.cached_lag_mask_encoder) else: - if self.window_size < targets_past.shape[1]: - targets_past = targets_past[:, -self.window_size:] - targets_past, _, loc, scale = self.target_scaler(targets_past) - x_past = targets_past + if self.window_size < past_targets.shape[1]: + past_targets = past_targets[:, -self.window_size:] + past_targets, _, loc, scale = self.target_scaler(past_targets) + x_past = past_targets - if features_past is not None: - x_past = torch.cat([features_past, x_past], dim=1) + if past_features is not None: + x_past = torch.cat([past_features, x_past], dim=1) x_past = x_past.to(device=self.device) x_past = self.embedding(x_past) @@ -321,12 +321,12 @@ def pred_from_net_output(self, net_output): raise ValueError(f'Unknown output_type: {self.output_type}') def predict(self, - targets_past: torch.Tensor, - features_past: Optional[torch.Tensor] = None, - features_future: Optional[torch.Tensor] = None, - features_static: Optional[torch.Tensor] = None + past_targets: torch.Tensor, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None ): - net_output = self(targets_past, features_past) + net_output = self(past_targets, past_features) return self.pred_from_net_output(net_output) 
@@ -337,7 +337,7 @@ class ForecastingSeq2SeqNet(ForecastingNet): This structure is activate when the decoder is recurrent (RNN or transformer). We train the network with teacher forcing, thus - targets_future is required for the network. To train the network, past targets and past features are fed to the + future_targets is required for the network. To train the network, past targets and past features are fed to the encoder to obtain the hidden states whereas future targets and future features. When the output type is distribution and forecast_strategy is sampling, this model is equivalent to a deepAR model during inference. @@ -350,26 +350,26 @@ def __init__(self, **kwargs): self.tgt_mask = nn.Transformer.generate_square_subsequent_mask(self.n_prediction_steps) def forward(self, - targets_past: torch.Tensor, - targets_future: Optional[torch.Tensor] = None, - features_past: Optional[torch.Tensor] = None, - features_future: Optional[torch.Tensor] = None, - features_static: Optional[torch.Tensor] = None, + past_targets: torch.Tensor, + future_targets: Optional[torch.Tensor] = None, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): if self.encoder_lagged_input: - targets_past[:, -self.window_size:], _, loc, scale = self.target_scaler(targets_past[:, -self.window_size:]) - targets_past[:, :-self.window_size] = self.scale_value(targets_past[:, :-self.window_size], loc, scale) - x_past, self.cached_lag_mask_encoder = get_lagged_subsequences(targets_past, + past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler(past_targets[:, -self.window_size:]) + past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) + x_past, self.cached_lag_mask_encoder = get_lagged_subsequences(past_targets, self.window_size, self.encoder.lagged_value, self.cached_lag_mask_encoder) else: - if self.window_size < targets_past.shape[1]: - targets_past = targets_past[:, -self.window_size:] - targets_past, _, loc, scale = self.target_scaler(targets_past) - x_past = targets_past + if self.window_size < past_targets.shape[1]: + past_targets = past_targets[:, -self.window_size:] + past_targets, _, loc, scale = self.target_scaler(past_targets) + x_past = past_targets - x_past = x_past if features_past is None else torch.cat([features_past, x_past], dim=-1) + x_past = x_past if past_features is None else torch.cat([past_features, x_past], dim=-1) x_past = x_past.to(self.device) x_past = self.embedding(x_past) @@ -377,15 +377,15 @@ def forward(self, if self.training: # we do one step ahead forecasting if self.decoder_lagged_input: - targets_future = torch.cat([targets_past, targets_future[:, :-1, :]], dim=1) - targets_future, self.cached_lag_mask_decoder = get_lagged_subsequences(targets_future, + future_targets = torch.cat([past_targets, future_targets[:, :-1, :]], dim=1) + future_targets, self.cached_lag_mask_decoder = get_lagged_subsequences(future_targets, self.n_prediction_steps, self.decoder.lagged_value, self.cached_lag_mask_decoder) else: - targets_future = torch.cat([targets_past[:, [-1], :], targets_future[:, :-1, :]], dim=1) + future_targets = torch.cat([past_targets[:, [-1], :], future_targets[:, :-1, :]], dim=1) - x_future = targets_future if features_future is None else torch.cat([features_future, targets_future], + x_future = future_targets if future_features is None else torch.cat([future_features, 
future_targets], dim=-1) x_future = x_future.to(self.device) @@ -407,16 +407,16 @@ def forward(self, else: features_latent = self.encoder(x_past, output_seq=True) - if features_future is not None: - features_future = features_future + if future_features is not None: + future_features = future_features if self.forecast_strategy != 'sample': all_predictions = [] - predicted_target = targets_past[:, [-1]] - targets_past = targets_past[:, :-1] + predicted_target = past_targets[:, [-1]] + past_targets = past_targets[:, :-1] for idx_pred in range(self.n_prediction_steps): if self.decoder_lagged_input: - x_future = torch.cat([targets_past, predicted_target.cpu()], dim=1) + x_future = torch.cat([past_targets, predicted_target.cpu()], dim=1) if self.decoder_has_hidden_states: x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder.lagged_value) else: @@ -429,11 +429,11 @@ def forward(self, x_future = predicted_target if self.decoder_has_hidden_states: - x_future = x_future if features_future is None else torch.cat( - [features_future[:, [idx_pred], :], x_future], dim=-1) + x_future = x_future if future_features is None else torch.cat( + [future_features[:, [idx_pred], :], x_future], dim=-1) else: - x_future = x_future if features_future is None else torch.cat( - [features_future[:, idx_pred + 1, :], x_future], dim=-1) + x_future = x_future if future_features is None else torch.cat( + [future_features[:, idx_pred + 1, :], x_future], dim=-1) x_future = x_future.to(self.device) if self.decoder_has_hidden_states: @@ -457,7 +457,7 @@ def forward(self, else: # we follow the DeepAR implementation: all_samples = [] - batch_size = targets_past.shape[0] + batch_size = past_targets.shape[0] if self.encoder_has_hidden_states: @@ -475,18 +475,18 @@ def forward(self, max_lag_seq_length = max(self.decoder.lagged_value) + 1 else: max_lag_seq_length = 1 + self.window_size - repeated_past_target = targets_past[:, -max_lag_seq_length:].repeat_interleave(repeats=self.num_samples, + repeated_past_target = past_targets[:, -max_lag_seq_length:].repeat_interleave(repeats=self.num_samples, dim=0).squeeze(1) repeated_predicted_target = repeated_past_target[:, [-1]] repeated_past_target = repeated_past_target[:, :-1, ] - repeated_static_feat = features_static.repeat_interleave( + repeated_static_feat = static_features.repeat_interleave( repeats=self.num_samples, dim=0 - ).unsqueeze(dim=1) if features_static is not None else None + ).unsqueeze(dim=1) if static_features is not None else None - repeated_time_feat = features_future.repeat_interleave( + repeated_time_feat = future_features.repeat_interleave( repeats=self.num_samples, dim=0 - ) if features_future is not None else None + ) if future_features is not None else None for idx_pred in range(self.n_prediction_steps): if self.decoder_lagged_input: @@ -532,12 +532,12 @@ def forward(self, raise ValueError(f'Unknown aggregation: {self.aggregation}') def predict(self, - targets_past: torch.Tensor, - features_past: Optional[torch.Tensor] = None, - features_future: Optional[torch.Tensor] = None, - features_static: Optional[torch.Tensor] = None + past_targets: torch.Tensor, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None ): - net_output = self(targets_past, features_past, features_future) + net_output = self(past_targets, past_features, future_features) if self.output_type != 'distribution': return self.pred_from_net_output(net_output) else: @@ -568,35 +568,35 @@ def 
train(self, mode: bool = True) -> nn.Module: return super().train(mode=mode) def forward(self, - targets_past: torch.Tensor, - targets_future: Optional[torch.Tensor] = None, - features_past: Optional[torch.Tensor] = None, - features_future: Optional[torch.Tensor] = None, - features_static: Optional[torch.Tensor] = None, + past_targets: torch.Tensor, + future_targets: Optional[torch.Tensor] = None, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): if self.training: if self.encoder_lagged_input: - targets_past[:, -self.window_size:], _, loc, scale = self.target_scaler( - targets_past[:, -self.window_size:]) - targets_past[:, :-self.window_size] = self.scale_value(targets_past[:, :-self.window_size], loc, scale) - targets_future = self.scale_value(targets_future, loc, scale) + past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( + past_targets[:, -self.window_size:]) + past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) + future_targets = self.scale_value(future_targets, loc, scale) - targets_all = torch.cat([targets_past, targets_future[:, :-1]], dim=1) + targets_all = torch.cat([past_targets, future_targets[:, :-1]], dim=1) seq_length = self.window_size + self.n_prediction_steps targets_all, self.cached_lag_mask_encoder = get_lagged_subsequences(targets_all, seq_length - 1, self.encoder.lagged_value, self.cached_lag_mask_encoder) else: - if self.window_size < targets_past.shape[1]: - targets_past = targets_past[:, -self.window_size:] - targets_past, _, loc, scale = self.target_scaler(targets_past) - targets_future = self.scale_value(targets_future, loc, scale) - targets_all = torch.cat([targets_past, targets_future[:, :-1]], dim=1) + if self.window_size < past_targets.shape[1]: + past_targets = past_targets[:, -self.window_size:] + past_targets, _, loc, scale = self.target_scaler(past_targets) + future_targets = self.scale_value(future_targets, loc, scale) + targets_all = torch.cat([past_targets, future_targets[:, :-1]], dim=1) x_input = targets_all - if features_past is not None: - features_all = torch.cat([features_past[:, 1:], features_future], dim=1) + if past_features is not None: + features_all = torch.cat([past_features[:, 1:], future_features], dim=1) x_input = torch.cat([features_all, x_input], dim=-1) x_input = x_input.to(self.device) @@ -612,27 +612,27 @@ def forward(self, return self.rescale_output(net_output, loc, scale, self.device) else: if self.encoder_lagged_input: - targets_past[:, -self.window_size:], _, loc, scale = self.target_scaler( - targets_past[:, -self.window_size:]) - targets_past[:, :-self.window_size] = self.scale_value(targets_past[:, :-self.window_size], loc, scale) - x_past, self.cached_lag_mask_encoder_test = get_lagged_subsequences(targets_past, + past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( + past_targets[:, -self.window_size:]) + past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) + x_past, self.cached_lag_mask_encoder_test = get_lagged_subsequences(past_targets, self.window_size, self.encoder.lagged_value, self.cached_lag_mask_encoder_test) else: - if self.window_size < targets_past.shape[1]: - targets_past = targets_past[:, -self.window_size:] + if self.window_size < past_targets.shape[1]: + past_targets = past_targets[:, -self.window_size:] - 
targets_past, _, loc, scale = self.target_scaler(targets_past) - x_past = targets_past - if features_past is not None: + past_targets, _, loc, scale = self.target_scaler(past_targets) + x_past = past_targets + if past_features is not None: # features is one step ahead of target if self.window_size > 1: - features_all = torch.cat([features_past[:, -self.window_size + 1:, ], - features_future], + features_all = torch.cat([past_features[:, -self.window_size + 1:, ], + future_features], dim=1) else: - features_all = features_future + features_all = future_features else: features_all = None x_past = x_past if features_all is None else torch.cat([features_all[:, :self.window_size], x_past], @@ -643,7 +643,7 @@ def forward(self, x_past = self.embedding(x_past) all_samples = [] - batch_size = targets_past.shape[0] + batch_size = past_targets.shape[0] if self.encoder_has_hidden_states: # For RNN, we only feed the hidden state and generated future input to the netwrok @@ -663,27 +663,27 @@ def forward(self, else: max_lag_seq_length = self.window_size # TODO considering padding targets here instead of inside get_lagged function - repeated_past_target = targets_past[:, -max_lag_seq_length:, ].repeat_interleave( + repeated_past_target = past_targets[:, -max_lag_seq_length:, ].repeat_interleave( repeats=self.num_samples, dim=0).squeeze(1) - repeated_static_feat = features_static.repeat_interleave( + repeated_static_feat = static_features.repeat_interleave( repeats=self.num_samples, dim=0 - ).unsqueeze(dim=1) if features_static is not None else None + ).unsqueeze(dim=1) if static_features is not None else None if features_all is not None: if not self.encoder_has_hidden_states: - # both feature_past and feature_future must exist or not, otherwise deepAR is disabled due to - # data properties!!! - time_feature = features_all + # both feature_past and feature_future must exist or not, otherwise deepAR is disabled due to + # data properties!!! 
+ time_feature = features_all else: - time_feature = features_future[:, 1:] if self.n_prediction_steps > 1 else None + time_feature = future_features[:, 1:] if self.n_prediction_steps > 1 else None else: time_feature = None repeated_time_feat = time_feature.repeat_interleave( repeats=self.num_samples, dim=0 - ) if features_future is not None else None + ) if future_features is not None else None net_output = self.head(self.decoder(encoder_output)) @@ -745,12 +745,12 @@ def forward(self, raise ValueError(f'Unknown aggregation: {self.aggregation}') def predict(self, - targets_past: torch.Tensor, - features_past: Optional[torch.Tensor] = None, - features_future: Optional[torch.Tensor] = None, - features_static: Optional[torch.Tensor] = None + past_targets: torch.Tensor, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None ): - net_output = self(targets_past, features_past, features_future) + net_output = self(past_targets, past_features, future_features) return net_output @@ -758,29 +758,29 @@ class NBEATSNet(ForecastingNet): future_target_required = False def forward(self, - targets_past: torch.Tensor, - targets_future: Optional[torch.Tensor] = None, - features_past: Optional[torch.Tensor] = None, - features_future: Optional[torch.Tensor] = None, - features_static: Optional[torch.Tensor] = None, + past_targets: torch.Tensor, + future_targets: Optional[torch.Tensor] = None, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): - if self.window_size < targets_past.shape[1]: - targets_past = targets_past[:, -self.window_size:] - targets_past, _, loc, scale = self.target_scaler(targets_past) - targets_past = targets_past.to(self.device) + if self.window_size < past_targets.shape[1]: + past_targets = past_targets[:, -self.window_size:] + past_targets, _, loc, scale = self.target_scaler(past_targets) + past_targets = past_targets.to(self.device) - batch_size = targets_past.shape[0] - output_shape = targets_past.shape[2:] + batch_size = past_targets.shape[0] + output_shape = past_targets.shape[2:] forcast_shape = [batch_size, self.n_prediction_steps, *output_shape] forecast = torch.zeros(forcast_shape).to(self.device).flatten(1) - backcast = self.encoder(targets_past) + backcast = self.encoder(past_targets) for block in self.decoder: backcast_block, forecast_block = block(backcast) backcast = backcast - backcast_block forecast = forecast + forecast_block - backcast = backcast.reshape(targets_past.shape) + backcast = backcast.reshape(past_targets.shape) forecast = forecast.reshape(forcast_shape) forecast = self.rescale_output(forecast, loc, scale, self.device) @@ -887,12 +887,25 @@ def predict(self, loader: torch.utils.data.DataLoader, for i, (X_batch, Y_batch) in enumerate(loader): # Predict on batch - X = X_batch['past_target'] + past_target = X_batch['past_target'] + past_features = X_batch['past_features'] + future_features = X_batch["future_features"] + statistic_features = X_batch["statistic_features"] - X = X.float() + if past_target.ndim == 2: + past_target = past_target.unsqueeze(-1) + + pred_kwargs = {"past_target": past_target, + "past_features": past_features, + "future_features": future_features, + "statistic_features": statistic_features} + + for key in pred_kwargs.keys(): + if pred_kwargs[key] is not None: + pred_kwargs[key] = 
pred_kwargs[key].float() with torch.no_grad(): - Y_batch_pred = self.network.predict(X) + Y_batch_pred = self.network.predict(**pred_kwargs) Y_batch_preds.append(Y_batch_pred.cpu()) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index ee7c4ca04..19a04e913 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -9,15 +9,11 @@ import numpy as np import torch -import collections -from torch.utils.data.sampler import SubsetRandomSampler, SequentialSampler -from torch._six import string_classes -from torch.utils.data._utils.collate import np_str_obj_array_pattern, default_collate_err_msg_format, default_collate +from torch.utils.data.sampler import SubsetRandomSampler import torchvision -from autoPyTorch.datasets.base_dataset import TransformSubset -from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence +from autoPyTorch.datasets.time_series_dataset import TimeSeriesSequence from autoPyTorch.utils.common import ( HyperparameterSearchSpace, custom_collate_fn, @@ -26,291 +22,13 @@ ) from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader - - -class TestSequenceDataset(TransformSubset): - def __init__(self, dataset: List[TimeSeriesSequence], train: bool = False) -> None: - self.dataset = dataset - self.indices = torch.arange(len(dataset)) - self.train = train - - def __getitem__(self, idx: int) -> np.ndarray: - # we only consider the entire sequence - seq = self.dataset[idx] - return seq.__getitem__(len(seq) - 1, self.train) - - -def pad_sequence_from_start(sequences: List[torch.Tensor], - seq_minimal_length: int, - seq_max_length: int = np.inf, - batch_first=True, - padding_value=0.0) -> torch.Tensor: - r""" - This function is quite similar to torch.nn.utils.rnn.pad_sequence except that we pad new values from the start of - the sequence. i.e., instead of extending [1,2,3] to [1,2,3,0,0], we extend it as [0,0,1,2,3]. Additionally, the - generated sequnece needs to have a length of at least seq_minimal_length - """ - - # assuming trailing dimensions and type of all the Tensors - # in sequences are same and fetching those from sequences[0] - max_size = sequences[0].size() - trailing_dims = max_size[1:] - max_len = min(max(max([s.size(0) for s in sequences]), seq_minimal_length), seq_max_length) - if seq_max_length > max_len: - seq_max_length = max_len - if batch_first: - out_dims = (len(sequences), max_len) + trailing_dims - else: - out_dims = (max_len, len(sequences)) + trailing_dims - - out_tensor = sequences[0].new_full(out_dims, padding_value) - for i, tensor in enumerate(sequences): - length = min(tensor.size(0), seq_max_length) - # use index notation to prevent duplicate references to the tensor - if batch_first: - out_tensor[i, -length:, ...] = tensor[-length:] - else: - out_tensor[-length:, i, ...] = tensor[-length:] - - return out_tensor - - -class PadSequenceCollector: - """ - A collector that transform the sequences from dataset. Since the sequences might contain different - length, they need to be padded with constant value. 
Given that target value might require special value to - fit the requirement of distribution, past_target will be padded with special values - - """ - - def __init__(self, window_size: int, target_padding_value: float = 0.0, seq_max_length: int = np.inf): - self.window_size = window_size - self.target_padding_value = target_padding_value - self.seq_max_length = seq_max_length - - def __call__(self, batch, padding_value=0.0): - elem = batch[0] - elem_type = type(elem) - if isinstance(elem, torch.Tensor): - seq = pad_sequence_from_start(batch, - seq_minimal_length=self.window_size, - seq_max_length=self.seq_max_length, - batch_first=True, padding_value=padding_value) # type: torch.Tensor - return seq - - elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \ - and elem_type.__name__ != 'string_': - if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap': - # array of string classes and object - if np_str_obj_array_pattern.search(elem.dtype.str) is not None: - raise TypeError(default_collate_err_msg_format.format(elem.dtype)) - - return default_collate([torch.as_tensor(b) for b in batch]) - elif elem.shape == (): # scalars - return torch.as_tensor(batch) - elif isinstance(elem, float): - return torch.tensor(batch, dtype=torch.float64) - elif isinstance(elem, int): - return torch.tensor(batch) - elif isinstance(elem, string_classes): - return batch - elif isinstance(elem, collections.abc.Mapping): - return {key: self([d[key] for d in batch]) if "past" not in key - else self([d[key] for d in batch], self.target_padding_value) for key in elem} - elif elem is None: - return None - raise TypeError(f"Unsupported data type {elem_type}") - - -class TimeSeriesSampler(SubsetRandomSampler): - def __init__(self, - indices: Sequence[int], - seq_lengths: Sequence[int], - num_instances_per_seqs: Optional[List[float]] = None, - min_start: int = 0, - generator: Optional[torch.Generator] = None) -> None: - """ - A sampler designed for time series sequence. For the sake of efficiency, it will not sample each possible - sequences from indices. Instead, it samples 'num_instances_per_seqs' for each sequence. This sampler samples - the instances in a Latin-Hypercube likewise way: we divide each sequence in to num_instances_per_seqs interval - and randomly sample one instance from each interval. 
If num_instances_per_seqs is not an integral, then the - first interval is selected with a certain probability: - for instance, if we want to sample 1.3 instance from a sequence [0,1,2,3,4,5], then we first divide the seuqence - into two parts: [0, 3] and [3, 6], one sample is sampled from the second part, while an expected value of 0.3 is - sampled from the first part (This part will be sampled in the very end with torch.multinomial) - - Parameters - ---------- - indices: Sequence[int] - The set of all the possible indices that can be sampled from - seq_lengths: Sequence[int] - lengths of each sequence, applied to unsqueeze indices - num_instances_per_seqs: Optional[List[int]]=None - expected number of instances to be sampled in each sequence, if it is None, all the sequences will be - sampled - min_start: int - the how many first instances we want to skip (the first few sequences need to be padded with 0) - generator: Optional[torch.Generator] - pytorch generator to control the randomness - """ - super(TimeSeriesSampler, self).__init__(indices, generator) - if num_instances_per_seqs is None: - self.iter_all_seqs = True - else: - self.iter_all_seqs = False - if len(seq_lengths) != len(num_instances_per_seqs): - raise ValueError(f'the lengths of seq_lengths must equal the lengths of num_instances_per_seqs.' - f'However, they are {len(seq_lengths)} versus {len(num_instances_per_seqs)}') - seq_intervals_int = [] - seq_intervals_decimal = [] - # seq_intervals_decimal_length = [] - num_expected_ins_decimal = [] - idx_tracker = 0 - for seq_idx, (num_instances, seq_length) in enumerate(zip(num_instances_per_seqs, seq_lengths)): - idx_end = idx_tracker + seq_length - idx_start = idx_tracker + min_start - if idx_start > idx_end: - idx_start = idx_tracker - - num_interval = int(np.ceil(num_instances)) - if num_interval > idx_end - idx_start or num_interval == 0: - interval = np.linspace(idx_start, idx_end, 2, endpoint=True, dtype=np.int) - # we consider - num_expected_ins_decimal.append(num_instances) - seq_intervals_decimal.append(interval[:2]) - seq_intervals_int.append(interval[1:]) - else: - interval = np.linspace(idx_start, idx_end, num_interval + 1, endpoint=True, dtype=np.int) - - num_expected_ins_decimal.append(np.modf(num_instances)[0]) - seq_intervals_decimal.append(interval[:2]) - - seq_intervals_int.append(interval[1:]) - idx_tracker += seq_length - - num_expected_ins_decimal = np.stack(num_expected_ins_decimal) - # seq_intervals_decimal_length = np.stack(seq_intervals_decimal_length) - self.seq_lengths = seq_lengths - self.seq_lengths_sum = np.sum(seq_lengths) - self.num_instances = int(np.round(np.sum(num_instances_per_seqs))) - - self.seq_intervals_decimal = torch.from_numpy(np.stack(seq_intervals_decimal)) - self.seq_intervals_int = seq_intervals_int - - self.num_expected_ins_decimal = torch.from_numpy(num_expected_ins_decimal) + 1e-8 - - def __iter__(self): - if self.iter_all_seqs: - return super().__iter__() - samples = torch.ones(self.num_instances, dtype=torch.int) - idx_samples_start = 0 - idx_samples_end = 0 - for idx_seq, (interval, seq_length) in enumerate(zip(self.seq_intervals_int, self.seq_lengths)): - if len(interval) == 1: - continue - num_samples = len(interval) - 1 - idx_samples_end = idx_samples_start + num_samples - - samples_shift = torch.rand(num_samples, generator=self.generator) * (interval[1:] - interval[:-1]) - samples_seq = torch.floor(samples_shift + interval[:-1]).int() - samples[idx_samples_start: idx_samples_end] = samples_seq - - idx_samples_start = 
idx_samples_end - num_samples_remain = self.num_instances - idx_samples_end - if num_samples_remain > 0: - if num_samples_remain > self.num_expected_ins_decimal[-1]: - replacement = True - else: - replacement = False - samples_idx = torch.multinomial(self.num_expected_ins_decimal, num_samples_remain, replacement) - seq_interval = self.seq_intervals_decimal[samples_idx] - - samples_shift = torch.rand(num_samples_remain, generator=self.generator) - samples_shift *= (seq_interval[:, 1] - seq_interval[:, 0]) - samples_seq_remain = torch.floor(samples_shift).int() + seq_interval[:, 0] - samples[-num_samples_remain:] = samples_seq_remain - - # sometimes if self.seq_lengths_sum is too large, float might not be accurate enough - samples = torch.where(samples == self.seq_lengths_sum, samples - 1, samples) - - yield from (samples[i] for i in torch.randperm(self.num_instances, generator=self.generator)) - - def __len__(self): - return self.num_instances - - -class SequentialSubSetSampler(SequentialSampler): - data_source: Sized - - def __init__(self, data_source: Sized, num_samples: int, generator: Optional[torch.Generator] = None) -> None: - super(SequentialSubSetSampler, self).__init__(data_source) - if num_samples > len(data_source): - self.eval_all_sequences = True - self.num_samples = len(data_source) - else: - self.eval_all_sequences = False - self.num_samples = num_samples - self.generator = generator - - def __iter__(self) -> Iterator[int]: - if self.eval_all_sequences: - return super(SequentialSubSetSampler, self).__iter__() - else: - yield from torch.randperm(len(self.data_source), generator=self.generator)[:self.num_samples] - - def __len__(self) -> int: - return self.num_samples - - -class ExpandTransformTimeSeries(object): - """Expand Dimensionality so tabular transformations see - a 2d Array, unlike the ExpandTransform defined under tabular dataset, the dimension is expanded - along the last axis - """ - - def __call__(self, data: np.ndarray) -> np.ndarray: - if len(data.shape) <= 1: - data = np.expand_dims(data, axis=-1) - return data - - -class SequenceBuilder(object): - """build a time sequence token from the given time sequence - it requires two hyperparameters: sample_interval and window size - let's assume we have a time sequence - x = [0 1 2 3 4 5 6 7 8 9 10].with window_size=3 and sample resolution=2 - then the extracted time series is [6, 8, 10] (or x[-5,-3,-1]) - if window_size=3 and sample_resolution=3 - then the extracted token is [4, 7, 10] (or x[-7,-4,-1]) - Parameters - ---------- - sample_interval : int, default=1 - sample resolution - window_size : int, default=1 - sliding window size - """ - - def __init__(self, sample_interval: int = 1, ): - """ - initialization - Args: - sample_interval: int: sample resolution - window_size: int: the size of the sliding window - """ - self.sample_interval = sample_interval - # assuming that subseq_length is 10, e.g., we can only start from -10. 
sample_interval = -4 - # we will sample the following indices: [-9,-5,-1] - # self.first_indices = -(self.sample_interval * ((subseq_length - 1) // self.sample_interval) + 1) - - def __call__(self, data: np.ndarray) -> np.ndarray: - if self.sample_interval == 1: - return data - else: - subseq_length = len(data) - first_indices = -(self.sample_interval * ((subseq_length - 1) // self.sample_interval) + 1) - sample_indices = np.arange(first_indices, 0, step=self.sample_interval) - - return data[sample_indices] +from autoPyTorch.pipeline.components.training.data_loader.time_series_util import ( + TestSequenceDataset, + PadSequenceCollector, + TimeSeriesSampler, + SequentialSubSetSampler, + ExpandTransformTimeSeries +) class TimeSeriesForecastingDataLoader(FeatureDataLoader): @@ -359,6 +77,7 @@ def __init__(self, self.statistic_features = None self.known_future_features = None + self._is_uni_variant = False def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: """ @@ -394,7 +113,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: max_lagged_value = max(X['dataset_properties'].get('lagged_value', [np.inf])) max_lagged_value += self.window_size + self.n_prediction_steps - self.padding_collector = PadSequenceCollector(self.window_size, padding_value, max_lagged_value) + self.padding_collector = PadSequenceCollector(self.window_size, sample_interval, padding_value, max_lagged_value) # this value corresponds to budget type num_sequence fraction_seq = X.get('fraction_seq', 1.0) @@ -422,6 +141,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: else: self.dataset_small_preprocess = False + self._is_uni_variant = X['dataset_properties']['uni_variant'] + train_dataset, val_dataset = datamanager.get_dataset_for_training(split_id=X['split_id']) train_split, test_split = datamanager.splits[X['split_id']] @@ -534,8 +255,6 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform # if 'test' in mode or not X['dataset_properties']['is_small_preprocess']: # candidate_transformations.extend(X['preprocess_transforms']) - if self.sample_interval > 1: - candidate_transformations.append(SequenceBuilder(sample_interval=self.sample_interval)) candidate_transformations.append(ExpandTransformTimeSeries()) if "test" in mode or not X['dataset_properties']['is_small_preprocess']: @@ -561,27 +280,29 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd # TODO consider other circumstances! 
y = X['past_targets'] X = X['features'] - - - - # TODO more supported inputs - if isinstance(X, (np.ndarray, torch.Tensor)): - if isinstance(X, torch.Tensor): - X = X.numpy() - if X.ndim == 1: - X = [X] - if isinstance(X, Sequence): + # TODO replace with strings + if isinstance(y, (np.ndarray, torch.Tensor)): + if isinstance(y, torch.Tensor): + y = y.numpy() + if self._is_uni_variant: + X = X.numpy() + if y.ndim == 1: + y = [y] + if self._is_uni_variant: + X = [X] + + if isinstance(y, Sequence): dataset = [] - if isinstance(X[0], TimeSeriesSequence): - for X_seq in X: - X_seq.update_transform(self.test_transform, train=False) - dataset.append(X_seq) + if isinstance(y[0], TimeSeriesSequence): + for y_seq in y: + y_seq.update_transform(self.test_transform, train=False) + dataset.append(y_seq) else: - if y is None: - for X_seq in X: + if self._is_uni_variant: + for y_seq in y: seq = TimeSeriesSequence( - X=X_seq, Y=y, + X=None, Y=y_seq, # This dataset is used for loading test data in a batched format train_transforms=self.test_transform, val_transforms=self.test_transform, @@ -676,7 +397,6 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, Oreshkin et al., N-BEATS: Neural basis expansion analysis for interpretable time series forecasting, ICLR 2020 https://arxiv.org/abs/1905.10437) Currently back_cast_period is only activate when back_cast is activate - TODO ablation study on whether this technique can be applied to other models Args: dataset_properties (Optional[Dict]): dataset properties batch_size (int): batch size diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py new file mode 100644 index 000000000..64669a0af --- /dev/null +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py @@ -0,0 +1,267 @@ +from typing import Optional, Sequence, List, Iterator, Sized + +import numpy as np + +import torch +import collections +from torch.utils.data.sampler import SubsetRandomSampler, SequentialSampler +from torch._six import string_classes +from torch.utils.data._utils.collate import np_str_obj_array_pattern, default_collate_err_msg_format, default_collate + +from autoPyTorch.datasets.base_dataset import TransformSubset +from autoPyTorch.datasets.time_series_dataset import TimeSeriesSequence + + +class TestSequenceDataset(TransformSubset): + def __init__(self, dataset: List[TimeSeriesSequence], train: bool = False) -> None: + self.dataset = dataset + self.indices = torch.arange(len(dataset)) + self.train = train + + def __getitem__(self, idx: int) -> np.ndarray: + # we only consider the entire sequence + seq = self.dataset[idx] + return seq.__getitem__(len(seq) - 1, self.train) + + +def pad_sequence_from_start(sequences: List[torch.Tensor], + seq_minimal_length: int, + seq_max_length: int = np.inf, + batch_first=True, + padding_value=0.0) -> torch.Tensor: + r""" + This function is quite similar to torch.nn.utils.rnn.pad_sequence except that we pad new values from the start of + the sequence. i.e., instead of extending [1,2,3] to [1,2,3,0,0], we extend it as [0,0,1,2,3]. 
Additionally, the + generated sequnece needs to have a length of at least seq_minimal_length + """ + + # assuming trailing dimensions and type of all the Tensors + # in sequences are same and fetching those from sequences[0] + max_size = sequences[0].size() + trailing_dims = max_size[1:] + max_len = min(max(max([s.size(0) for s in sequences]), seq_minimal_length), seq_max_length) + if seq_max_length > max_len: + seq_max_length = max_len + if batch_first: + out_dims = (len(sequences), max_len) + trailing_dims + else: + out_dims = (max_len, len(sequences)) + trailing_dims + + out_tensor = sequences[0].new_full(out_dims, padding_value) + for i, tensor in enumerate(sequences): + length = min(tensor.size(0), seq_max_length) + # use index notation to prevent duplicate references to the tensor + if batch_first: + out_tensor[i, -length:, ...] = tensor[-length:] + else: + out_tensor[-length:, i, ...] = tensor[-length:] + + return out_tensor + + +class PadSequenceCollector: + """ + A collector that transform the sequences from dataset. Since the sequences might contain different + length, they need to be padded with constant value. Given that target value might require special value to + fit the requirement of distribution, past_target will be padded with special values + + """ + + def __init__(self, window_size: int, sample_interval, target_padding_value: float = 0.0, seq_max_length: int = np.inf): + self.window_size = window_size + self.sample_interval = sample_interval + self.target_padding_value = target_padding_value + self.seq_max_length = seq_max_length + + def __call__(self, batch, sample_interval=1, padding_value=0.0): + elem = batch[0] + elem_type = type(elem) + if isinstance(elem, torch.Tensor): + seq = pad_sequence_from_start(batch, + seq_minimal_length=self.window_size, + seq_max_length=self.seq_max_length, + batch_first=True, padding_value=padding_value) # type: torch.Tensor + + if sample_interval > 1: + subseq_length = seq.shape[1] + first_indices = -(sample_interval * ((subseq_length - 1) // sample_interval) + 1) + sample_indices = torch.arange(first_indices, 0, step=sample_interval) + return seq[:, sample_indices] + else: + return seq + + elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \ + and elem_type.__name__ != 'string_': + if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap': + # array of string classes and object + if np_str_obj_array_pattern.search(elem.dtype.str) is not None: + raise TypeError(default_collate_err_msg_format.format(elem.dtype)) + + return default_collate([torch.as_tensor(b) for b in batch]) + elif elem.shape == (): # scalars + return torch.as_tensor(batch) + elif isinstance(elem, float): + return torch.tensor(batch, dtype=torch.float64) + elif isinstance(elem, int): + return torch.tensor(batch) + elif isinstance(elem, string_classes): + return batch + elif isinstance(elem, collections.abc.Mapping): + # only past targets and features needs to be transformed + return {key: self([d[key] for d in batch]) if "past" not in key + else self([d[key] for d in batch], self.sample_interval, self.target_padding_value) for key in elem} + elif elem is None: + return None + raise TypeError(f"Unsupported data type {elem_type}") + + +class TimeSeriesSampler(SubsetRandomSampler): + def __init__(self, + indices: Sequence[int], + seq_lengths: Sequence[int], + num_instances_per_seqs: Optional[List[float]] = None, + min_start: int = 0, + generator: Optional[torch.Generator] = None) -> None: + """ + A sampler designed for time series sequence. 
For the sake of efficiency, it will not sample each possible + sequences from indices. Instead, it samples 'num_instances_per_seqs' for each sequence. This sampler samples + the instances in a Latin-Hypercube likewise way: we divide each sequence in to num_instances_per_seqs interval + and randomly sample one instance from each interval. If num_instances_per_seqs is not an integral, then the + first interval is selected with a certain probability: + for instance, if we want to sample 1.3 instance from a sequence [0,1,2,3,4,5], then we first divide the seuqence + into two parts: [0, 3] and [3, 6], one sample is sampled from the second part, while an expected value of 0.3 is + sampled from the first part (This part will be sampled in the very end with torch.multinomial) + + Parameters + ---------- + indices: Sequence[int] + The set of all the possible indices that can be sampled from + seq_lengths: Sequence[int] + lengths of each sequence, applied to unsqueeze indices + num_instances_per_seqs: Optional[List[int]]=None + expected number of instances to be sampled in each sequence, if it is None, all the sequences will be + sampled + min_start: int + the how many first instances we want to skip (the first few sequences need to be padded with 0) + generator: Optional[torch.Generator] + pytorch generator to control the randomness + """ + super(TimeSeriesSampler, self).__init__(indices, generator) + if num_instances_per_seqs is None: + self.iter_all_seqs = True + else: + self.iter_all_seqs = False + if len(seq_lengths) != len(num_instances_per_seqs): + raise ValueError(f'the lengths of seq_lengths must equal the lengths of num_instances_per_seqs.' + f'However, they are {len(seq_lengths)} versus {len(num_instances_per_seqs)}') + seq_intervals_int = [] + seq_intervals_decimal = [] + # seq_intervals_decimal_length = [] + num_expected_ins_decimal = [] + idx_tracker = 0 + for seq_idx, (num_instances, seq_length) in enumerate(zip(num_instances_per_seqs, seq_lengths)): + idx_end = idx_tracker + seq_length + idx_start = idx_tracker + min_start + if idx_start > idx_end: + idx_start = idx_tracker + + num_interval = int(np.ceil(num_instances)) + if num_interval > idx_end - idx_start or num_interval == 0: + interval = np.linspace(idx_start, idx_end, 2, endpoint=True, dtype=np.int) + # we consider + num_expected_ins_decimal.append(num_instances) + seq_intervals_decimal.append(interval[:2]) + seq_intervals_int.append(interval[1:]) + else: + interval = np.linspace(idx_start, idx_end, num_interval + 1, endpoint=True, dtype=np.int) + + num_expected_ins_decimal.append(np.modf(num_instances)[0]) + seq_intervals_decimal.append(interval[:2]) + + seq_intervals_int.append(interval[1:]) + idx_tracker += seq_length + + num_expected_ins_decimal = np.stack(num_expected_ins_decimal) + # seq_intervals_decimal_length = np.stack(seq_intervals_decimal_length) + self.seq_lengths = seq_lengths + self.seq_lengths_sum = np.sum(seq_lengths) + self.num_instances = int(np.round(np.sum(num_instances_per_seqs))) + + self.seq_intervals_decimal = torch.from_numpy(np.stack(seq_intervals_decimal)) + self.seq_intervals_int = seq_intervals_int + + self.num_expected_ins_decimal = torch.from_numpy(num_expected_ins_decimal) + 1e-8 + + def __iter__(self): + if self.iter_all_seqs: + return super().__iter__() + samples = torch.ones(self.num_instances, dtype=torch.int) + idx_samples_start = 0 + idx_samples_end = 0 + for idx_seq, (interval, seq_length) in enumerate(zip(self.seq_intervals_int, self.seq_lengths)): + if len(interval) == 1: + continue + 
num_samples = len(interval) - 1 + idx_samples_end = idx_samples_start + num_samples + + samples_shift = torch.rand(num_samples, generator=self.generator) * (interval[1:] - interval[:-1]) + samples_seq = torch.floor(samples_shift + interval[:-1]).int() + samples[idx_samples_start: idx_samples_end] = samples_seq + + idx_samples_start = idx_samples_end + num_samples_remain = self.num_instances - idx_samples_end + if num_samples_remain > 0: + if num_samples_remain > self.num_expected_ins_decimal[-1]: + replacement = True + else: + replacement = False + samples_idx = torch.multinomial(self.num_expected_ins_decimal, num_samples_remain, replacement) + seq_interval = self.seq_intervals_decimal[samples_idx] + + samples_shift = torch.rand(num_samples_remain, generator=self.generator) + samples_shift *= (seq_interval[:, 1] - seq_interval[:, 0]) + samples_seq_remain = torch.floor(samples_shift).int() + seq_interval[:, 0] + samples[-num_samples_remain:] = samples_seq_remain + + # sometimes if self.seq_lengths_sum is too large, float might not be accurate enough + samples = torch.where(samples == self.seq_lengths_sum, samples - 1, samples) + + yield from (samples[i] for i in torch.randperm(self.num_instances, generator=self.generator)) + + def __len__(self): + return self.num_instances + + +class SequentialSubSetSampler(SequentialSampler): + data_source: Sized + + def __init__(self, data_source: Sized, num_samples: int, generator: Optional[torch.Generator] = None) -> None: + super(SequentialSubSetSampler, self).__init__(data_source) + if num_samples > len(data_source): + self.eval_all_sequences = True + self.num_samples = len(data_source) + else: + self.eval_all_sequences = False + self.num_samples = num_samples + self.generator = generator + + def __iter__(self) -> Iterator[int]: + if self.eval_all_sequences: + return super(SequentialSubSetSampler, self).__iter__() + else: + yield from torch.randperm(len(self.data_source), generator=self.generator)[:self.num_samples] + + def __len__(self) -> int: + return self.num_samples + + +class ExpandTransformTimeSeries(object): + """Expand Dimensionality so tabular transformations see + a 2d Array, unlike the ExpandTransform defined under tabular dataset, the dimension is expanded + along the last axis + """ + + def __call__(self, data: np.ndarray) -> np.ndarray: + if len(data.shape) <= 1: + data = np.expand_dims(data, axis=-1) + return data diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 576b445c0..eb9b9cef3 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -461,3 +461,5 @@ def predict(self, X: Union[Dict[str, np.ndarray], pd.DataFrame], batch_size: Opt torch.cuda.empty_cache() batch_size = batch_size // 2 return self.predict(X, batch_size=batch_size // 2) + else: + raise e From 510cc5a08da551ef811fcda9b7796ec00490af31 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 16 Feb 2022 13:45:24 +0100 Subject: [PATCH 158/347] maint --- autoPyTorch/api/time_series_forecasting.py | 15 ++-- .../configs/forecasting_init_cfgs.json | 49 ++++++------ autoPyTorch/datasets/time_series_dataset.py | 41 ++++++---- autoPyTorch/evaluation/abstract_evaluator.py | 3 +- ...time_series_forecasting_train_evaluator.py | 1 - .../setup/network/forecasting_network.py | 12 +-- .../time_series_forecasting_data_loader.py | 79 ++++++++++--------- 7 files changed, 107 insertions(+), 93 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py 
b/autoPyTorch/api/time_series_forecasting.py index efc6bb349..35a56e789 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -336,7 +336,7 @@ def search( def predict( self, - X_test: Union[Optional[Union[List[np.ndarray]], pd.DataFrame], Dict]=None, + X_test: Optional[Union[Union[List[np.ndarray]], pd.DataFrame, Dict]]=None, batch_size: Optional[int] = None, n_jobs: int = 1, past_targets: Optional[List[np.ndarray]] = None, @@ -345,12 +345,13 @@ def predict( target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, (used for multi-variable prediction), indicates which value needs to be predicted """ - if past_targets is None: - if not isinstance(X_test, Dict) or "past_targets" not in X_test: - raise ValueError("Past Targets must be given") - else: - X_test = {"features": X_test, - "past_targets": past_targets} + if not self.dataset.is_uni_variant: + if past_targets is None: + if not isinstance(X_test, Dict) or "past_targets" not in X_test: + raise ValueError("Past Targets must be given") + else: + X_test = {"features": X_test, + "past_targets": past_targets} flattened_res = super(TimeSeriesForecastingTask, self).predict(X_test, batch_size, n_jobs) if self.dataset.num_target == 1: return flattened_res.reshape([-1, self.dataset.n_prediction_steps]) diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index a246b5213..cb10f101d 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -4,7 +4,6 @@ "data_loader:backcast": false, "data_loader:sample_strategy": "SeqUniform", "data_loader:num_batches_per_epoch": 50, - "imputer:numerical_strategy": "median", "lr_scheduler:__choice__": "ReduceLROnPlateau", "lr_scheduler:ReduceLROnPlateau:mode": "max", "lr_scheduler:ReduceLROnPlateau:factor": 0.5, @@ -155,22 +154,22 @@ "network_backbone:NBEATSDecoder:normalization": "NoNorm", "network_backbone:NBEATSDecoder:activation": "relu", "network_backbone:NBEATSDecoder:n_beats_type": "I", - "network_backbone:NBEATSDecoder:use_dropout_I": true, - "network_backbone:NBEATSDecoder:num_stacks_I": 2, - "network_backbone:NBEATSDecoder:num_blocks_I_1": 3, - "network_backbone:NBEATSDecoder:num_layers_I_1": 2, - "network_backbone:NBEATSDecoder:width_I_1": 256, - "network_backbone:NBEATSDecoder:weight_sharing_I_1": true, - "network_backbone:NBEATSDecoder:stack_type_I_1": "trend", - "network_backbone:NBEATSDecoder:expansion_coefficient_length_I_trend_1": 3, - "network_backbone:NBEATSDecoder:dropout_I_1": 0.1, - "network_backbone:NBEATSDecoder:num_blocks_I_2": 3, - "network_backbone:NBEATSDecoder:num_layers_I_2": 2, - "network_backbone:NBEATSDecoder:width_I_2": 512, - "network_backbone:NBEATSDecoder:weight_sharing_I_2": true, - "network_backbone:NBEATSDecoder:stack_type_I_2": "seasonality", - "network_backbone:NBEATSDecoder:expansion_coefficient_length_I_seasonality_2": 7, - "network_backbone:NBEATSDecoder:dropout_I_2": 0.1 + "network_backbone:NBEATSDecoder:use_dropout_i": true, + "network_backbone:NBEATSDecoder:num_stacks_i": 2, + "network_backbone:NBEATSDecoder:num_blocks_i_1": 3, + "network_backbone:NBEATSDecoder:num_layers_i_1": 2, + "network_backbone:NBEATSDecoder:width_i_1": 256, + "network_backbone:NBEATSDecoder:weight_sharing_i_1": true, + "network_backbone:NBEATSDecoder:stack_type_i_1": "trend", + "network_backbone:NBEATSDecoder:expansion_coefficient_length_i_trend_1": 3, + "network_backbone:NBEATSDecoder:dropout_i_1": 0.1, + 
"network_backbone:NBEATSDecoder:num_blocks_i_2": 3, + "network_backbone:NBEATSDecoder:num_layers_i_2": 2, + "network_backbone:NBEATSDecoder:width_i_2": 512, + "network_backbone:NBEATSDecoder:weight_sharing_i_2": true, + "network_backbone:NBEATSDecoder:stack_type_i_2": "seasonality", + "network_backbone:NBEATSDecoder:expansion_coefficient_length_i_seasonality_2": 7, + "network_backbone:NBEATSDecoder:dropout_i_2": 0.1 }, "NBEATS-G": { "loss:__choice__": "RegressionLoss", @@ -181,14 +180,14 @@ "network_backbone:NBEATSDecoder:normalization": "NoNorm", "network_backbone:NBEATSDecoder:activation": "relu", "network_backbone:NBEATSDecoder:n_beats_type": "G", - "network_backbone:NBEATSDecoder:use_dropout_G": true, - "network_backbone:NBEATSDecoder:num_stacks_G": 30, - "network_backbone:NBEATSDecoder:num_blocks_G": 1, - "network_backbone:NBEATSDecoder:num_layers_G": 4, - "network_backbone:NBEATSDecoder:width_G": 512, - "network_backbone:NBEATSDecoder:weight_sharing_G": false, - "network_backbone:NBEATSDecoder:expansion_coefficient_length_G": 32, - "network_backbone:NBEATSDecoder:dropout_G": 0.1 + "network_backbone:NBEATSDecoder:use_dropout_g": true, + "network_backbone:NBEATSDecoder:num_stacks_g": 30, + "network_backbone:NBEATSDecoder:num_blocks_g": 1, + "network_backbone:NBEATSDecoder:num_layers_g": 4, + "network_backbone:NBEATSDecoder:width_g": 512, + "network_backbone:NBEATSDecoder:weight_sharing_g": false, + "network_backbone:NBEATSDecoder:expansion_coefficient_length_g": 32, + "network_backbone:NBEATSDecoder:dropout_g": 0.1 } } } \ No newline at end of file diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 443e45cfc..96dceac20 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -53,11 +53,12 @@ def __init__(self, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, - statistic_features: Optional[np.ndarray] = None, + static_features: Optional[np.ndarray] = None, n_prediction_steps: int = 1, sp: int = 1, known_future_features: Optional[List[Union[str, int]]] = None, only_has_past_targets: bool = False, + compute_mase_coefficient_value: bool = True, ): """ A dataset representing a time series sequence. 
@@ -78,18 +79,22 @@ def __init__(self, self.X_test = X_test self.Y_tet = Y_test - self.statistic_features = statistic_features + self.static_features = static_features # We also need to be able to transform the data, be it for pre-processing # or for augmentation self.train_transform = train_transforms self.val_transform = val_transforms self.sp = sp - if only_has_past_targets: - self.mase_coefficient = compute_mase_coefficient(self.Y, sp=self.sp, n_prediction_steps=n_prediction_steps) + if compute_mase_coefficient_value: + if only_has_past_targets: + self.mase_coefficient = compute_mase_coefficient(self.Y, sp=self.sp, + n_prediction_steps=n_prediction_steps) + else: + self.mase_coefficient = compute_mase_coefficient(self.Y[:-n_prediction_steps], sp=self.sp, + n_prediction_steps=n_prediction_steps) else: - self.mase_coefficient = compute_mase_coefficient(self.Y[:-n_prediction_steps], sp=self.sp, - n_prediction_steps=n_prediction_steps) + self.mase_coefficient = 1.0 self.only_has_past_targets = only_has_past_targets self.known_future_features = known_future_features @@ -138,10 +143,10 @@ def __getitem__(self, index: int, train: bool = True) \ past_target = targets[:index + 1] past_target = torch.from_numpy(past_target) - return {"past_target": past_target, + return {"past_targets": past_target, "past_features": past_features, "future_features": future_features, - "statistic_features": self.statistic_features, + "static_features": self.static_features, "mase_coefficient": self.mase_coefficient}, targets_future def __len__(self) -> int: @@ -184,13 +189,15 @@ def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": else: X = None return TimeSeriesSequence(X, - self.Y[:index + 1 + self.n_prediction_steps], + self.Y[:index + 1], train_transforms=self.train_transform, val_transforms=self.val_transform, n_prediction_steps=self.n_prediction_steps, - statistic_features=self.statistic_features, + static_features=self.static_features, known_future_features=self.known_future_features, - sp=self.sp) + sp=self.sp, + only_has_past_targets=True, + compute_mase_coefficient_value=False) def get_test_target(self, test_idx: int): if self.only_has_past_targets: @@ -225,7 +232,7 @@ def __init__(self, dataset_name: Optional[str] = None, shift_input_data: bool = True, normalize_y: bool = True, - statistic_features: Optional[np.ndarray] = None, + static_features: Optional[np.ndarray] = None, ): """ :param target_variables: Optional[Union[Tuple[int], int]] used for multi-variant forecasting @@ -242,7 +249,7 @@ def __init__(self, if y values needs to be normalized with mean 0 and variance 1 if the dataset is trained with log_prob losses, this needs to be specified in the very beginning such that the header's configspace can be built beforehand. - :param statistic_features: statistic features, invariant across different + :param static_features: statistic features, invariant across different """ assert X is not Y, "Training and Test data needs to belong two different object!!!" 
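
To make the renamed per-item keys above concrete, a rough sketch of unpacking one item from a sequence; seq is assumed to be a TimeSeriesSequence instance, and whether the feature entries are tensors or None depends on how the sequence was built:

    item, future_targets = seq[len(seq) - 1]

    past_targets = item["past_targets"]          # tensor holding the observed target history
    past_features = item["past_features"]        # typically None for uni-variant series
    future_features = item["future_features"]    # None unless known_future_features is given
    static_features = item["static_features"]
    mase_coefficient = item["mase_coefficient"]  # per-sequence scaling used by MASE-style metrics
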
@@ -299,7 +306,7 @@ def __init__(self, self.validator.fit(X_train=X, y_train=Y, X_test=X_test, y_test=Y_test, n_prediction_steps=n_prediction_steps) - self._is_uni_variant = self.validator._is_uni_variant + self.is_uni_variant = self.validator._is_uni_variant self.numerical_columns = self.validator.feature_validator.numerical_columns self.categorical_columns = self.validator.feature_validator.categorical_columns @@ -414,7 +421,7 @@ def __init__(self, "n_prediction_steps": n_prediction_steps, "sp": self.seasonality, "known_future_features": known_future_features, - "statistic_features": statistic_features} + "static_features": static_features} self.y_train_mean = [0] * len(self.sequence_lengths_train) self.y_train_std = [1] * len(self.sequence_lengths_train) @@ -428,7 +435,7 @@ def __init__(self, ConcatDataset.__init__(self, datasets=sequence_datasets) self.known_future_features = known_future_features - self.statistic_features = statistic_features + self.static_features = static_features self.seq_length_min = int(np.min(self.sequence_lengths_train)) self.seq_length_median = int(np.median(self.sequence_lengths_train)) @@ -750,7 +757,7 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> 'sequence_lengths_train': self.sequence_lengths_train, 'seq_length_max': self.seq_length_max, 'lagged_value': self.lagged_value, - 'uni_variant': self._is_uni_variant}) + 'uni_variant': self.is_uni_variant}) return dataset_properties def create_cross_val_splits( diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index fbd1dabb4..cf1e7b24e 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -325,7 +325,8 @@ def __init__(self, config: Configuration, def fit(self, X: Dict[str, Any], y: Any, sample_weight: Optional[np.ndarray] = None) -> object: self.n_prediction_steps = X['dataset_properties']['n_prediction_steps'] - return super(DummyTimeSeriesForecastingPipeline, self).fit(X, y) + y_train = subsampler(X['y_train'], X['train_indices']) + return DummyClassifier.fit(self, np.ones((y_train.shape[0], 1)), y_train,sample_weight) def _genreate_dummy_forecasting(self, X): if isinstance(X[0], TimeSeriesSequence): diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 106f1107b..654a4e118 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -89,7 +89,6 @@ def __init__(self, backend: Backend, queue: Queue, self.max_budget = max_budget self.min_num_test_instances = min_num_test_instances - def fit_predict_and_loss(self) -> None: """Fit, predict and compute the loss for cross-validation and holdout""" diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 5aeec5326..54a148cdd 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -887,18 +887,18 @@ def predict(self, loader: torch.utils.data.DataLoader, for i, (X_batch, Y_batch) in enumerate(loader): # Predict on batch - past_target = X_batch['past_target'] + past_targets = X_batch['past_targets'] past_features = X_batch['past_features'] future_features = X_batch["future_features"] - statistic_features = 
X_batch["statistic_features"] + static_features = X_batch["static_features"] - if past_target.ndim == 2: - past_target = past_target.unsqueeze(-1) + if past_targets.ndim == 2: + past_targets = past_targets.unsqueeze(-1) - pred_kwargs = {"past_target": past_target, + pred_kwargs = {"past_targets": past_targets, "past_features": past_features, "future_features": future_features, - "statistic_features": statistic_features} + "static_features": static_features} for key in pred_kwargs.keys(): if pred_kwargs[key] is not None: diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 19a04e913..9641678ea 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -75,7 +75,7 @@ def __init__(self, self.num_batches_per_epoch = num_batches_per_epoch if num_batches_per_epoch is not None else np.inf self.padding_collector = None - self.statistic_features = None + self.static_features = None self.known_future_features = None self._is_uni_variant = False @@ -99,7 +99,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: if self.backcast: self.window_size = self.backcast_period * self.n_prediction_steps - self.statistic_features = datamanager.statistic_features + self.static_features = datamanager.static_features self.known_future_features = datamanager.known_future_features # this value corresponds to budget type resolution @@ -272,9 +272,31 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd applying the transformations meant to validation objects This is a lazy loaded test set, each time only one piece of series """ - if isinstance(X, TimeSeriesSequence): - X.update_transform(self.test_transform, train=False) - dataset = [X] + if self._is_uni_variant: + if isinstance(X, TimeSeriesSequence): + X.update_transform(self.test_transform, train=False) + dataset = [X] + elif isinstance(X, Sequence): + dataset = [] + if isinstance(X[0], TimeSeriesSequence): + for X_seq in X: + X_seq.update_transform(self.test_transform, train=False) + dataset.append(X_seq) + else: + for X_seq in X: + seq = TimeSeriesSequence( + X=None, Y=X_seq, + # This dataset is used for loading test data in a batched format + train_transforms=self.test_transform, + val_transforms=self.test_transform, + n_prediction_steps=0, + static_features=self.static_features, + known_future_features=self.known_future_features, + only_has_past_targets=True, + ) + dataset.append(seq) + else: + raise NotImplementedError(f"Unsupported type of input: {type(y)}") else: if y is None: # TODO consider other circumstances! 
@@ -285,12 +307,11 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd if isinstance(y, (np.ndarray, torch.Tensor)): if isinstance(y, torch.Tensor): y = y.numpy() - if self._is_uni_variant: - X = X.numpy() + X = X.numpy() if y.ndim == 1: y = [y] - if self._is_uni_variant: - X = [X] + if X.ndim == 1: + X = [X] if isinstance(y, Sequence): dataset = [] @@ -299,34 +320,20 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd y_seq.update_transform(self.test_transform, train=False) dataset.append(y_seq) else: - if self._is_uni_variant: - for y_seq in y: - seq = TimeSeriesSequence( - X=None, Y=y_seq, - # This dataset is used for loading test data in a batched format - train_transforms=self.test_transform, - val_transforms=self.test_transform, - n_prediction_steps=0, - statistic_features=self.statistic_features, - known_future_features=self.known_future_features, - only_has_past_targets=True, - ) - dataset.append(seq) - else: - for X_seq, y_seq in zip(X, y): - seq = TimeSeriesSequence( - X=X_seq, Y=y_seq, - # This dataset is used for loading test data in a batched format - train_transforms=self.test_transform, - val_transforms=self.test_transform, - n_prediction_steps=0, - statistic_features=self.statistic_features, - known_future_features=self.known_future_features, - only_has_past_targets=True, - ) - dataset.append(seq) + for X_seq, y_seq in zip(X, y): + seq = TimeSeriesSequence( + X=X_seq, Y=y_seq, + # This dataset is used for loading test data in a batched format + train_transforms=self.test_transform, + val_transforms=self.test_transform, + n_prediction_steps=0, + static_features=self.static_features, + known_future_features=self.known_future_features, + only_has_past_targets=True, + ) + dataset.append(seq) else: - raise NotImplementedError(f"Unsupported type of input X: {type(X)}") + raise NotImplementedError(f"Unsupported type of input: {type(y)}") dataset_test = TestSequenceDataset(dataset, train=False) From 3806fe2323dee66f0f64482cc81302f7f667c1a2 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 16 Feb 2022 18:46:18 +0100 Subject: [PATCH 159/347] allow encoder to receive input from different sources --- autoPyTorch/datasets/time_series_dataset.py | 13 ++++++- .../InceptionTimeEncoder.py | 9 +++-- .../forecasting_backbone/MLPEncoder.py | 8 +++-- .../forecasting_backbone/NBEATSEncoder.py | 8 +++-- .../forecasting_backbone/RNNEncoder.py | 20 +++++++---- .../forecasting_backbone/TCNEncoder.py | 10 ++++-- .../TransformerEncoder.py | 19 ++++++---- .../base_forecasting_encoder.py | 35 ++++++++++--------- .../base_forecasting_decoder.py | 8 ----- 9 files changed, 80 insertions(+), 50 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 96dceac20..deb427133 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -454,7 +454,16 @@ def __init__(self, self.task_type: Optional[str] = None self.issparse: bool = issparse(self.train_tensors[0]) # TODO find a way to edit input shape! 
- self.input_shape: Tuple[int] = (self.seq_length_min, self.num_features) + self.input_shape: Tuple[int, int] = (self.seq_length_min, self.num_features) + if static_features is None: + self.static_features_shape: int = 0 + else: + self.static_features_shape: int = static_features.size + + if known_future_features is None: + self.future_feature_shape: Tuple[int, int] = (self.seq_length_min, 0) + else: + self.input_shape_future: Tuple[int, int] = (self.seq_length_min, len(known_future_features)) if len(self.train_tensors) == 2 and self.train_tensors[1] is not None: self.output_type: str = type_of_target(self.train_tensors[1][0]) @@ -469,6 +478,8 @@ def __init__(self, # self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1 num_target = Y.shape[-1] if Y.ndim > 1 else 1 self.output_shape = [self.n_prediction_steps, num_target] + else: + raise ValueError('Forecasting dataset must contain target values!') # TODO: Look for a criteria to define small enough to preprocess self.is_small_preprocess = True diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/InceptionTimeEncoder.py index 919c5d7b3..180db506a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/InceptionTimeEncoder.py @@ -137,11 +137,14 @@ class InceptionTimeEncoder(BaseForecastingEncoder): InceptionTime backbone for time series data (see https://arxiv.org/pdf/1909.04939.pdf). """ - def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: - encoder = _InceptionTime(in_features=input_shape[-1], + def build_encoder(self, targets_shape: Tuple[int, ...], + input_shape: Tuple[int, ...] = (0,), + static_feature_shape: int = 0) -> Tuple[nn.Module, int]: + in_features = input_shape[-1] + targets_shape[-1] + static_feature_shape + encoder = _InceptionTime(in_features=in_features, config=self.config) self._receptive_field = encoder.receptive_field - return encoder + return encoder, in_features @staticmethod def allowed_decoders(): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py index 900a8a8bc..d0111586a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py @@ -87,10 +87,12 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: # when resolution is smaller return super().fit(X, y) - def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: - in_features = input_shape[-1] * self.window_size + def build_encoder(self, targets_shape: Tuple[int, ...], + input_shape: Tuple[int, ...] 
= (0,), + static_feature_shape: int = 0) -> Tuple[nn.Module, int]: + in_features = (input_shape[-1] + targets_shape[-1] + static_feature_shape) feature_preprocessor = TimeSeriesMLPrecpocessor(window_size=self.window_size) - return nn.Sequential(feature_preprocessor, *self._build_backbone(in_features)) + return nn.Sequential(feature_preprocessor, *self._build_backbone(in_features * self.window_size)), in_features def _add_layer(self, layers: List[nn.Module], in_features: int, out_features: int, layer_id: int) -> None: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/NBEATSEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/NBEATSEncoder.py index 976296271..59506f23b 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/NBEATSEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/NBEATSEncoder.py @@ -46,9 +46,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.window_size = X["window_size"] return super().fit(X, y) - def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: + def build_encoder(self, + targets_shape: Tuple[int, ...], + input_shape: Tuple[int, ...] = (0,), + static_feature_shape: int = 0) -> Tuple[nn.Module, int]: + in_features = targets_shape[-1] + input_shape[-1] + static_feature_shape preprocessor = TimeSeriesMLPrecpocessor(window_size=self.window_size) - return preprocessor + return preprocessor, in_features @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py index cfe84492f..5d149e829 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py @@ -25,15 +25,17 @@ class _RNN(EncoderNetwork): def __init__(self, in_features: int, config: Dict[str, Any], - lagged_value: Optional[Union[List, np.ndarray]] = None): + lagged_value: Optional[List[int]] = None): super().__init__() + if lagged_value is None: + self.lagged_value = [0] + else: + self.lagged_value = lagged_value self.config = config if config['cell_type'] == 'lstm': cell_type = nn.LSTM else: cell_type = nn.GRU - self.lagged_value = lagged_value - in_features = in_features if self.lagged_value is None else len(self.lagged_value) * in_features self.lstm = cell_type(input_size=in_features, hidden_size=config["hidden_size"], num_layers=config["num_layers"], @@ -77,11 +79,15 @@ def __init__(self, **kwargs: Dict): super().__init__(**kwargs) self.lagged_value = [1, 2, 3, 4, 5, 6, 7] - def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: - encoder = _RNN(in_features=input_shape[-1], + def build_encoder(self, targets_shape: Tuple[int, ...], + input_shape: Tuple[int, ...] 
= (0,), + static_feature_shape: int = 0) -> Tuple[nn.Module, int]: + in_features = len(self.lagged_value) * targets_shape[-1] + input_shape[-1] + static_feature_shape + encoder = _RNN(in_features=in_features, config=self.config, - lagged_value=self.lagged_value) - return encoder + lagged_value=self.lagged_value, + ) + return encoder, in_features @staticmethod def allowed_decoders(): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py index a00bccfc6..83cdf16ed 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py @@ -118,7 +118,10 @@ class TCNEncoder(BaseForecastingEncoder): Temporal Convolutional Network backbone for time series data (see https://arxiv.org/pdf/1803.01271.pdf). """ - def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: + def build_encoder(self, + targets_shape: Tuple[int, ...], + input_shape: Tuple[int, ...] = (0,), + static_feature_shape: int = 0) -> Tuple[nn.Module, int]: num_channels = [self.config["num_filters_1"]] kernel_size = [self.config["kernel_size_1"]] dropout = [self.config[f"dropout_1"] if self.config["use_dropout"] else 0.0] @@ -126,13 +129,14 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: num_channels.append(self.config[f"num_filters_{i}"]) kernel_size.append(self.config[f"kernel_size_{i}"]) dropout.append(self.config[f"dropout_{i}"] if self.config["use_dropout"] else 0.0) - encoder = _TemporalConvNet(input_shape[-1], + in_features = input_shape[-1] + static_feature_shape + targets_shape[-1] + encoder = _TemporalConvNet(in_features, num_channels, kernel_size=kernel_size, dropout=dropout ) self._receptive_field = encoder.receptive_field - return encoder + return encoder, in_features @staticmethod def allowed_decoders(): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py index 800c9fc80..30f23f374 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py @@ -32,11 +32,12 @@ def __init__(self, use_layer_norm_output: bool, dropout_pe: float = 0.0, layer_norm_eps_output: Optional[float] = None, - lagged_value: Optional[Union[List, np.ndarray]] = None): + lagged_value: Optional[List[int]] = None): super().__init__() - self.lagged_value = lagged_value - in_features = in_features if self.lagged_value is None else len(self.lagged_value) * in_features - + if lagged_value is None: + self.lagged_value = [0] + else: + self.lagged_value = lagged_value self.input_layer = [nn.Linear(in_features, d_model, bias=False)] if use_positional_encoder: self.input_layer.append(PositionalEncoding(d_model, dropout_pe)) @@ -74,11 +75,15 @@ def __init__(self, **kwargs: Dict): super().__init__(**kwargs) self.lagged_value = [1, 2, 3, 4, 5, 6, 7] - def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: + def build_encoder(self, targets_shape: Tuple[int, ...], + input_shape: Tuple[int, ...] 
= (0,), + static_feature_shape: int = 0) -> Tuple[nn.Module, int]: + in_features = len(self.lagged_value) * targets_shape[-1] + input_shape[-1] + static_feature_shape + d_model = 2 ** self.config['d_model_log'] transformer_encoder_layers = build_transformer_layers(d_model=d_model, config=self.config, layer_type='encoder') - encoder = _TransformerEncoder(in_features=input_shape[-1], + encoder = _TransformerEncoder(in_features=in_features, d_model=d_model, num_layers=self.config['num_layers'], transformer_encoder_layers=transformer_encoder_layers, @@ -87,7 +92,7 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: dropout_pe=self.config.get('dropout_positional_encoder', 0.0), layer_norm_eps_output=self.config.get('layer_norm_eps_output', None), lagged_value=self.lagged_value) - return encoder + return encoder, in_features @staticmethod def allowed_decoders(): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/base_forecasting_encoder.py index af38b320d..5556ba431 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/base_forecasting_encoder.py @@ -11,7 +11,6 @@ from abc import abstractmethod from typing import Any, Dict, Iterable, Optional, Tuple, List - from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.pipeline.components.base_component import ( @@ -61,12 +60,13 @@ def _required_fit_arguments(self) -> List[FitRequirement]: return [ FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, - dataset_property=False), + dataset_property=False), FitRequirement('y_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, - dataset_property=False), - FitRequirement('uni_variant', (bool, ), user_defined=False, dataset_property=True), + dataset_property=False), + FitRequirement('uni_variant', (bool,), user_defined=False, dataset_property=True), FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), + FitRequirement('static_features_shape', (int, ), user_defined=True, dataset_property=True), ] def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: @@ -76,28 +76,27 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: input_shape = X["dataset_properties"]['input_shape'] output_shape = X["dataset_properties"]['output_shape'] + static_features_shape = X["dataset_properties"]["static_features_shape"] - if X["dataset_properties"]["uni_variant"]: + if not X["dataset_properties"]["uni_variant"]: if not X["dataset_properties"]["is_small_preprocess"]: # get input shape by transforming first two elements of the training set transforms = torchvision.transforms.Compose(X['preprocess_transforms']) X_train = X_train[:1, np.newaxis, ...] - y_train = y_train[:1, np.newaxis, ...] X_train = transforms(X_train) - input_shape = np.concatenate(X_train, y_train).shape[1:] - else: - y_train = y_train[:1, np.newaxis, ...] 
- input_shape = y_train.shape[1:] + input_shape = np.concatenate(X_train).shape[1:] if 'network_embedding' in X.keys(): input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) - input_shape = (*input_shape[:-1], input_shape[-1] + output_shape[-1]) - self.input_shape = input_shape - self.encoder = self.build_encoder( + self.encoder, in_features = self.build_encoder( + targets_shape=output_shape, input_shape=input_shape, + static_feature_shape=static_features_shape ) + self.input_shape = (X['window_size'], in_features) + return self @staticmethod @@ -111,12 +110,17 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: return X @abstractmethod - def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: + def build_encoder(self, + targets_shape: Tuple[int, ...], + input_shape: Tuple[int, ...] = (0,), + static_feature_shape: int = 0) -> Tuple[nn.Module, int]: """ Builds the backbone module and returns it Args: - input_shape (Tuple[int, ...]): shape of the input to the backbone + targets_shape (Tuple[int, ...]): shape of target + input_shape (Tuple[int, ...]): input feature shape + static_feature_shape (int): static feature shape. Returns: nn.Module: backbone module @@ -142,4 +146,3 @@ def encoder_properties(self): 'lagged_input': False, } return encoder_properties - diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 0c9094109..c1783b460 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -90,16 +90,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.n_prediction_heads = output_shape[0] encoder_properties = X['encoder_properties'] - fixed_input_seq_length = encoder_properties.get("fixed_input_seq_length", False) has_hidden_states = encoder_properties.get("has_hidden_states", False) - if fixed_input_seq_length: - input_shape = (X["window_size"], input_shape[-1]) - - if encoder_properties.get('lagged_input', False): - lagged_value = X['network_encoder'].lagged_value - input_shape = (X["window_size"], input_shape[-1] * len(lagged_value)) - self.decoder, self.n_decoder_output_features = self.build_decoder( input_shape=get_output_shape(X['network_encoder'], input_shape=input_shape, has_hidden_states=has_hidden_states), From 9251bbccc38d31e71eb4445785692ef84bab5b7d Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 18 Feb 2022 21:14:21 +0100 Subject: [PATCH 160/347] multi blocks hp design --- .../setup/network/forecasting_network.py | 4 +- .../forecasting_backbone/__init__.py | 256 ++++-------- .../forecasting_decoder/MLPDecoder.py | 6 +- .../forecasting_decoder/NBEATSDecoder.py | 6 +- .../forecasting_decoder/RNNDecoder.py | 7 +- .../forecasting_decoder/TransformerDecoder.py | 7 +- .../forecasting_decoder/__init__.py | 6 +- .../forecasting_encoder/__init__.py | 392 ++++++++++++++++++ .../base_forecasting_encoder.py | 0 .../flat_encoder}/MLPEncoder.py | 2 +- .../flat_encoder}/NBEATSEncoder.py | 6 +- .../flat_encoder/__init__.py | 54 +++ .../seq_encoder}/InceptionTimeEncoder.py | 4 +- .../seq_encoder}/RNNEncoder.py | 6 +- .../seq_encoder}/TCNEncoder.py | 4 +- .../seq_encoder}/TransformerEncoder.py | 6 +- 
.../seq_encoder/__init__.py | 379 +++++++++++++++++ .../forecasting_network_head/NBEATS_head.py | 3 +- .../pipeline/time_series_forecasting.py | 8 +- 19 files changed, 934 insertions(+), 222 deletions(-) create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py rename autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/{ => forecasting_encoder}/base_forecasting_encoder.py (100%) rename autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/{ => forecasting_encoder/flat_encoder}/MLPEncoder.py (99%) rename autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/{ => forecasting_encoder/flat_encoder}/NBEATSEncoder.py (93%) create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py rename autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/{ => forecasting_encoder/seq_encoder}/InceptionTimeEncoder.py (98%) rename autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/{ => forecasting_encoder/seq_encoder}/RNNEncoder.py (98%) rename autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/{ => forecasting_encoder/seq_encoder}/TCNEncoder.py (99%) rename autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/{ => forecasting_encoder/seq_encoder}/TransformerEncoder.py (98%) create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 54a148cdd..6218b6dc7 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -19,8 +19,8 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. 
\ base_target_scaler import BaseTargetScaler -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder \ - import EncoderNetwork +from autoPyTorch.pipeline.components.setup.network_backbone.\ + forecasting_backbone.forecasting_encoder.base_forecasting_encoder import EncoderNetwork from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index 13885acc4..357fe0a05 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -1,6 +1,6 @@ import os from collections import OrderedDict -from typing import Dict, Optional, List, Any +from typing import Dict, Optional, List, Any, Union, Tuple from sklearn.pipeline import Pipeline import ConfigSpace.hyperparameters as CSH @@ -16,24 +16,21 @@ ) from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import ( BaseForecastingEncoder, ) +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder \ + import FlatForecastingEncoderChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder import \ + SeqForecastingEncoderChoice from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import \ decoders, decoder_addons, add_decoder +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdate -directory = os.path.split(__file__)[0] -_encoders = find_components(__package__, - directory, - BaseForecastingEncoder) -_addons = ThirdPartyComponents(BaseForecastingEncoder) - -def add_encoder(encoder: BaseForecastingEncoder) -> None: - _addons.add_component(encoder) - - -class ForecastingBackboneChoice(autoPyTorchChoice): +class ForecastingNetworkChoice(autoPyTorchChoice): """ A network is composed of an encoder and decoder. In most of the case, the choice of decoder is heavily dependent on the choice of encoder. 
Thus here "choice" indicates the choice of encoder, then decoder will be determined by @@ -43,7 +40,8 @@ def __init__(self, **kwargs, ): super().__init__(**kwargs) - self.decoder_choice = None + self.include_components = None + self.exclude_components = None def get_components(self) -> Dict[str, autoPyTorchComponent]: """Returns the available backbone components @@ -56,21 +54,10 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: as choices for learning rate scheduling """ components = OrderedDict() - components.update(_encoders) - components.update(_addons.components) + components.update({"flat_encoder": FlatForecastingEncoderChoice, + "seq_encoder": SeqForecastingEncoderChoice}) return components - def get_decoder_components(self) -> Dict[str, autoPyTorchComponent]: - components = OrderedDict() - components.update(decoders) - components.update(decoder_addons.components) - return components - - @property - def additional_components(self): - # This function is deigned to add additional components rather than the components in __choice__ - return [self.get_decoder_components] - def get_available_components( self, dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, @@ -107,10 +94,34 @@ def get_available_components( available_comp = components if include is not None: + include_top = set() + self.include_components = {} for incl in include: if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) + for comp in available_comp.keys(): + self.include_components[comp] = [] + if incl.startswith(comp): + incl_sub = ":".join(incl.split(":")[1:]) + if comp in self.include_components: + self.include_components[comp].append(incl_sub) + else: + self.include_components[comp] = [incl_sub] + include_top.add(comp) + else: + include_top.add(incl) + if not include_top: + raise ValueError(f"Trying to include unknown component: {include}") + include = list(include_top) + elif exclude is not None: + self.exclude_components = {} + for excl in exclude: + for comp in available_comp.keys(): + if excl.startswith(comp): + excl_sub = ":".join(excl.split(":")[1:]) + if comp in self.exclude_components: + self.exclude_components[comp].append(excl_sub) + else: + self.exclude_components[comp] = [excl_sub] components_dict = OrderedDict() for name in available_comp: @@ -122,7 +133,7 @@ def get_available_components( entry = available_comp[name] # Exclude itself to avoid infinite loop - if entry == NetworkBackboneChoice or hasattr(entry, 'get_components'): + if entry == ForecastingNetworkChoice: continue task_type = str(dataset_properties['task_type']) @@ -144,6 +155,7 @@ def get_available_components( return components_dict + def get_hyperparameter_search_space( self, dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, @@ -159,30 +171,26 @@ def get_hyperparameter_search_space( include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive list, and will exclusively use this components. exclude: Optional[Dict[str, Any]]: which components to skip + network_type: type of the network, it determines how to handle the sequential data: flat networks + (FFNN and NBEATS) simply flat the input to a 2D input, whereas seq network receives sequential 3D inputs: + thus, seq networks could be stacked to form a larger network that is composed of different parts. 
Returns: ConfigurationSpace: the configuration space of the hyper-parameters of the chosen component """ - cs = ConfigurationSpace() - if dataset_properties is None: dataset_properties = {} + cs = ConfigurationSpace() + # Compile a list of legal preprocessors for this problem available_encoders = self.get_available_components( dataset_properties=dataset_properties, include=include, exclude=exclude) - available_decoders = self.get_available_components( - dataset_properties=dataset_properties, - include=None, exclude=None, - components=self.get_decoder_components()) - if len(available_encoders) == 0: raise ValueError("No Encoder found") - if len(available_decoders) == 0: - raise ValueError("No Decoder found") if default is None: defaults = self._defaults_network @@ -209,157 +217,43 @@ def get_hyperparameter_search_space( ) cs.add_hyperparameter(hp_encoder) - decoder2encoder = {key: [] for key in available_decoders.keys()} - encoder2decoder = {} - for encoder_name in hp_encoder.choices: - updates = self._get_search_space_updates(prefix=encoder_name) - config_space = available_encoders[encoder_name].get_hyperparameter_search_space(dataset_properties, # type: ignore - **updates) - parent_hyperparameter = {'parent': hp_encoder, 'value': encoder_name} + for name in hp_encoder.choices: + updates = self._get_search_space_updates(prefix=name) + include_encoder = None + exclude_encoder = None + if include is not None: + if name in self.include_components: + include_encoder = self.include_components[name] + if exclude is not None: + if name in self.exclude_components: + exclude_encoder = self.exclude_components[name] + import pdb + pdb.set_trace() + + config_space = available_encoders[name].get_hyperparameter_search_space( + dataset_properties=dataset_properties, # type: ignore + include=include_encoder, + exclude=exclude_encoder, + **updates) + parent_hyperparameter = {'parent': hp_encoder, 'value': name} cs.add_configuration_space( - encoder_name, + name, config_space, parent_hyperparameter=parent_hyperparameter ) - allowed_decoders = available_encoders[encoder_name].allowed_decoders() - if len(allowed_decoders) > 1: - if 'decoder_type' not in config_space: - raise ValueError('When a specific encoder has more than one allowed decoder, its ConfigSpace' - 'must contain the hyperparameter "decoder_type" ! 
Please check your encoder ' - 'setting!') - hp_decoder_choice = config_space.get_hyperparameter('decoder_type').choices - if not set(hp_decoder_choice).issubset(allowed_decoders): - raise ValueError('The encoder hyperparameter decoder_type must be a subset of the allowed_decoders') - allowed_decoders = hp_decoder_choice - for decoder_name in allowed_decoders: - decoder2encoder[decoder_name].append(encoder_name) - encoder2decoder[encoder_name] = allowed_decoders - - for decoder_name in available_decoders.keys(): - if not decoder2encoder[decoder_name]: - continue - updates = self._get_search_space_updates(prefix=decoder_name) - config_space = available_decoders[decoder_name].get_hyperparameter_search_space(dataset_properties, # type: ignore - **updates) - compatible_encoders = decoder2encoder[decoder_name] - encoders_with_multi_decoder = [] - encoder_with_uni_decoder = [] - # this could happen if its parent encoder is not part of - inactive_decoder = [] - for encoder in compatible_encoders: - if len(encoder2decoder[encoder]) > 1: - encoders_with_multi_decoder.append(encoder) - else: - encoder_with_uni_decoder.append(encoder) - - cs.add_configuration_space( - decoder_name, - config_space, - #parent_hyperparameter=parent_hyperparameter - ) - hps = cs.get_hyperparameters() # type: List[CSH.Hyperparameter] - conditions_to_add = [] - for hp in hps: - # TODO consider if this will raise any unexpected behavior - if hp.name.startswith(decoder_name): - # From the implementation of ConfigSpace - # Only add a condition if the parameter is a top-level - # parameter of the new configuration space (this will be some - # kind of tree structure). - if cs.get_parents_of(hp): - continue - or_cond = [] - for encoder_uni in encoder_with_uni_decoder: - or_cond.append(EqualsCondition(hp, - hp_encoder, - encoder_uni)) - for encode_multi in encoders_with_multi_decoder: - hp_decoder_type = cs.get_hyperparameter(f'{encode_multi}:decoder_type') - or_cond.append(EqualsCondition(hp, - hp_decoder_type, - decoder_name)) - if len(or_cond) == 0: - continue - elif len(or_cond) > 1: - conditions_to_add.append(OrConjunction(*or_cond)) - else: - conditions_to_add.append(or_cond[0]) - cs.add_conditions(conditions_to_add) self.configuration_space_ = cs self.dataset_properties_ = dataset_properties return cs - def set_hyperparameters(self, - configuration: Configuration, - init_params: Optional[Dict[str, Any]] = None - ) -> 'autoPyTorchChoice': - """ - Applies a configuration to the given component. - This method translate a hierarchical configuration key, - to an actual parameter of the autoPyTorch component. 
- - Args: - configuration (Configuration): - Which configuration to apply to the chosen component - init_params (Optional[Dict[str, any]]): - Optional arguments to initialize the chosen component - - Returns: - self: returns an instance of self - """ - new_params = {} - - params = configuration.get_dictionary() - choice = params['__choice__'] - del params['__choice__'] - - for param, value in params.items(): - param = param.replace(choice + ':', '') - new_params[param] = value - - if init_params is not None: - for param, value in init_params.items(): - param = param.replace(choice + ':', '') - new_params[param] = value - - decoder_components = self.get_decoder_components() - - decoder_type = None - - decoder_params = {} - decoder_params_names = [] - for param, value in new_params.items(): - if decoder_type is None: - for decoder_component in decoder_components.keys(): - if param.startswith(decoder_component): - decoder_type = decoder_component - decoder_params_names.append(param) - param = param.replace(decoder_type + ':', '') - decoder_params[param] = value - else: - if param.startswith(decoder_type): - decoder_params_names.append(param) - param = param.replace(decoder_type + ':', '') - decoder_params[param] = value - - for param_name in decoder_params_names: - del new_params[param_name] - - new_params['random_state'] = self.random_state - decoder_params['random_state'] = self.random_state - - self.new_params = new_params - self.choice = self.get_components()[choice](**new_params) - self.decoder_choice = decoder_components[decoder_type](**decoder_params) - self.pipe = Pipeline([('encoder', self.choice), ('decoder', self.decoder_choice)]) - return self + def _apply_search_space_update(self, hyperparameter_search_space_update: HyperparameterSearchSpaceUpdate) -> None: + self._cs_updates[hyperparameter_search_space_update.hyperparameter] = hyperparameter_search_space_update @property def _defaults_network(self): - return ['MLPEncoder', - 'RNNEncpder'] + return ['flat_network', + 'seq_network'] def fit(self, X: Dict[str, Any], y: Any) -> autoPyTorchComponent: """Handy method to check if a component is fitted @@ -372,15 +266,15 @@ def fit(self, X: Dict[str, Any], y: Any) -> autoPyTorchComponent: """ # Allows to use check_is_fitted on the choice object self.fitted_ = True - assert self.pipe is not None, "Cannot call fit without initializing the component" - return self.pipe.fit(X, y) + assert self.choice is not None, "Cannot call fit without initializing the component" + return self.choice.fit(X, y) #self.choice.fit(X, y) #self.choice.transform(X) #return self.choice def transform(self, X: Dict) -> Dict: - assert self.pipe is not None, "Cannot call transform before the object is initialized" - return self.pipe.transform(X) + assert self.choice is not None, "Cannot call transform before the object is initialized" + return self.choice.transform(X) @property def _defaults_network(self): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index bdfe9e0a5..5c044381e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -8,10 +8,10 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from 
autoPyTorch.pipeline.components.setup.network_head.utils import _activations -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter +from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.base_forecasting_decoder import \ - BaseForecastingDecoder +from autoPyTorch.pipeline.components.setup.network_backbone.\ + forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder class ForecastingMLPDecoder(BaseForecastingDecoder): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index ae769c1b5..0a6611191 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -3,7 +3,7 @@ from ConfigSpace import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter, \ UniformFloatHyperparameter -from ConfigSpace.conditions import GreaterThanCondition, InCondition, EqualsCondition, AndConjunction +from ConfigSpace.conditions import GreaterThanCondition, EqualsCondition, AndConjunction from typing import Dict, Optional, Tuple, Union, Any @@ -13,8 +13,8 @@ from autoPyTorch.pipeline.components.setup.network_head.utils import _activations from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.base_forecasting_decoder import \ - BaseForecastingDecoder +from autoPyTorch.pipeline.components.setup.network_backbone.\ + forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder # TODO we need to rewrite NBEATS part to make it neater!!! 
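For reference, the encoders touched by this series now follow the build_encoder() contract introduced in the previous commit: an encoder receives the target shape, the past-feature shape and the static-feature width from separate sources, and returns both the encoder module and the resulting input width. The sketch below illustrates that contract in isolation; it is not part of the patch, ToyEncoder and build_toy_encoder are hypothetical names, and a real component would subclass BaseForecastingEncoder rather than use a bare nn.Module.

from typing import Tuple

import torch
from torch import nn


class ToyEncoder(nn.Module):
    """Stand-in encoder body: a single linear projection along the feature axis."""

    def __init__(self, in_features: int, d_model: int = 32):
        super().__init__()
        self.proj = nn.Linear(in_features, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x is expected to have shape [batch, time, in_features]
        return self.proj(x)


def build_toy_encoder(targets_shape: Tuple[int, ...],
                      input_shape: Tuple[int, ...] = (0,),
                      static_feature_shape: int = 0,
                      lagged_value: Tuple[int, ...] = ()) -> Tuple[nn.Module, int]:
    # Past targets, past features and static features are concatenated along
    # the feature axis, so the encoder width is the sum of their last
    # dimensions. Encoders that consume lagged targets (RNN, Transformer)
    # multiply the target width by the number of lags instead.
    n_targets = targets_shape[-1]
    if lagged_value:
        n_targets *= len(lagged_value)
    in_features = n_targets + input_shape[-1] + static_feature_shape
    return ToyEncoder(in_features=in_features), in_features


if __name__ == "__main__":
    encoder, width = build_toy_encoder(targets_shape=(20, 1),
                                       input_shape=(20, 5),
                                       static_feature_shape=2,
                                       lagged_value=(1, 2, 3))
    out = encoder(torch.randn(8, 20, width))
    print(width, out.shape)  # 10, torch.Size([8, 20, 32])

Returning the width alongside the module is what lets the base encoder set its input_shape to (window_size, in_features), which in turn spares the decoders the window-size and lagged-value bookkeeping removed from base_forecasting_decoder above.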
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index aec3874f3..e819915a4 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -1,5 +1,4 @@ from typing import Any, Dict, Optional, Tuple, List, Union -import warnings import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace @@ -13,10 +12,10 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.base_forecasting_decoder import \ - BaseForecastingDecoder, RecurrentDecoderNetwork +from autoPyTorch.pipeline.components.setup.network_backbone.\ + forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder, RecurrentDecoderNetwork -from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter, FitRequirement +from autoPyTorch.utils.common import FitRequirement class RNN_Module(RecurrentDecoderNetwork): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index b3d3a6ae9..ae75a2809 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -1,6 +1,5 @@ from typing import Any, Dict, Optional, Tuple, List, Union -import warnings import torch from torch import nn import numpy as np @@ -15,10 +14,10 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter +from autoPyTorch.utils.common import add_hyperparameter -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.base_forecasting_decoder import \ - BaseForecastingDecoder, RecurrentDecoderNetwork +from autoPyTorch.pipeline.components.setup.network_backbone.\ + forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder, RecurrentDecoderNetwork from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.transformer_util import \ PositionalEncoding, build_transformer_layers diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py index 69178763f..032a60401 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py @@ -11,9 +11,9 @@ ) from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice -from 
autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.base_forecasting_decoder import ( - BaseForecastingDecoder, -) +from autoPyTorch.pipeline.components.setup.network_backbone.\ + forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder + from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py new file mode 100644 index 000000000..b027b730e --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -0,0 +1,392 @@ +import os +from collections import OrderedDict +from typing import Dict, Optional, List, Any +from abc import abstractmethod +from sklearn.pipeline import Pipeline + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace, Configuration +from ConfigSpace.conditions import EqualsCondition, OrConjunction + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType + +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( + BaseForecastingEncoder, +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import \ + decoders, decoder_addons, add_decoder + +directory = os.path.split(__file__)[0] +_encoders = find_components(__package__, + directory, + BaseForecastingEncoder) +_addons = ThirdPartyComponents(BaseForecastingEncoder) + + +def add_encoder(encoder: BaseForecastingEncoder) -> None: + _addons.add_component(encoder) + + +class AbstractForecastingEncoderChoice(autoPyTorchChoice): + """ + A network is composed of an encoder and decoder. In most of the case, the choice of decoder is heavily dependent on + the choice of encoder. Thus here "choice" indicates the choice of encoder, then decoder will be determined by + the encoder. 
+ """ + def __init__(self, + + **kwargs, + ): + super().__init__(**kwargs) + self.encoder_choice = None + self.decoder_choice = None + + @abstractmethod + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available backbone components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all basebackbone components available + as choices for learning rate scheduling + """ + raise NotImplementedError + components = OrderedDict() + components.update(_encoders) + components.update(_addons.components) + return components + + def get_decoder_components(self) -> Dict[str, autoPyTorchComponent]: + components = OrderedDict() + components.update(decoders) + components.update(decoder_addons.components) + return components + + @property + def additional_components(self): + # This function is deigned to add additional components rather than the components in __choice__ + return [self.get_decoder_components] + + def get_available_components( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + include: List[str] = None, + exclude: List[str] = None, + components: Optional[Dict[str, autoPyTorchComponent]] = None + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics + of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of learning + rate backbones + + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + if components is None: + available_comp = self.get_components() + else: + available_comp = components + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == NetworkBackboneChoice or hasattr(entry, 'get_components'): + continue + + task_type = str(dataset_properties['task_type']) + properties = entry.get_properties() + if 'tabular' in task_type and not bool(properties['handles_tabular']): + continue + elif 'image' in task_type and not bool(properties['handles_image']): + continue + elif 'time_series' in task_type and not bool(properties['handles_time_series']): + continue + + # target_type = dataset_properties['target_type'] + # Apply some automatic filtering here for + # backbones based on the dataset! 
+ # TODO: Think if there is any case where a backbone + # is not recommended for a certain dataset + + components_dict[name] = entry + + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on + default (Optional[str]): Default backbone to use + include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive + list, and will exclusively use this components. + exclude: Optional[Dict[str, Any]]: which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal preprocessors for this problem + available_encoders = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + available_decoders = self.get_available_components( + dataset_properties=dataset_properties, + include=None, exclude=None, + components=self.get_decoder_components()) + + if len(available_encoders) == 0: + raise ValueError("No Encoder found") + if len(available_decoders) == 0: + raise ValueError("No Decoder found") + + if default is None: + defaults = self._defaults_network + for default_ in defaults: + if default_ in available_encoders: + default = default_ + break + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_encoders): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_encoders, + choice_hyperparameter.value_range)) + hp_encoder = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + hp_encoder = CSH.CategoricalHyperparameter( + '__choice__', + list(available_encoders.keys()), + default_value=default + ) + cs.add_hyperparameter(hp_encoder) + + decoder2encoder = {key: [] for key in available_decoders.keys()} + encoder2decoder = {} + for encoder_name in hp_encoder.choices: + updates = self._get_search_space_updates(prefix=encoder_name) + config_space = available_encoders[encoder_name].get_hyperparameter_search_space(dataset_properties, # type: ignore + **updates) + parent_hyperparameter = {'parent': hp_encoder, 'value': encoder_name} + cs.add_configuration_space( + encoder_name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + allowed_decoders = available_encoders[encoder_name].allowed_decoders() + if len(allowed_decoders) > 1: + if 'decoder_type' not in config_space: + raise ValueError('When a specific encoder has more than one allowed decoder, its ConfigSpace' + 'must contain the hyperparameter "decoder_type" ! 
Please check your encoder ' + 'setting!') + hp_decoder_choice = config_space.get_hyperparameter('decoder_type').choices + if not set(hp_decoder_choice).issubset(allowed_decoders): + raise ValueError('The encoder hyperparameter decoder_type must be a subset of the allowed_decoders') + allowed_decoders = hp_decoder_choice + for decoder_name in allowed_decoders: + decoder2encoder[decoder_name].append(encoder_name) + encoder2decoder[encoder_name] = allowed_decoders + + for decoder_name in available_decoders.keys(): + if not decoder2encoder[decoder_name]: + continue + updates = self._get_search_space_updates(prefix=decoder_name) + config_space = available_decoders[decoder_name].get_hyperparameter_search_space(dataset_properties, # type: ignore + **updates) + compatible_encoders = decoder2encoder[decoder_name] + encoders_with_multi_decoder = [] + encoder_with_uni_decoder = [] + # this could happen if its parent encoder is not part of + inactive_decoder = [] + for encoder in compatible_encoders: + if len(encoder2decoder[encoder]) > 1: + encoders_with_multi_decoder.append(encoder) + else: + encoder_with_uni_decoder.append(encoder) + + cs.add_configuration_space( + decoder_name, + config_space, + #parent_hyperparameter=parent_hyperparameter + ) + hps = cs.get_hyperparameters() # type: List[CSH.Hyperparameter] + conditions_to_add = [] + for hp in hps: + # TODO consider if this will raise any unexpected behavior + if hp.name.startswith(decoder_name): + # From the implementation of ConfigSpace + # Only add a condition if the parameter is a top-level + # parameter of the new configuration space (this will be some + # kind of tree structure). + if cs.get_parents_of(hp): + continue + or_cond = [] + for encoder_uni in encoder_with_uni_decoder: + or_cond.append(EqualsCondition(hp, + hp_encoder, + encoder_uni)) + for encode_multi in encoders_with_multi_decoder: + hp_decoder_type = cs.get_hyperparameter(f'{encode_multi}:decoder_type') + or_cond.append(EqualsCondition(hp, + hp_decoder_type, + decoder_name)) + if len(or_cond) == 0: + continue + elif len(or_cond) > 1: + conditions_to_add.append(OrConjunction(*or_cond)) + else: + conditions_to_add.append(or_cond[0]) + cs.add_conditions(conditions_to_add) + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def set_hyperparameters(self, + configuration: Configuration, + init_params: Optional[Dict[str, Any]] = None + ) -> 'autoPyTorchChoice': + """ + Applies a configuration to the given component. + This method translate a hierarchical configuration key, + to an actual parameter of the autoPyTorch component. 
+ + Args: + configuration (Configuration): + Which configuration to apply to the chosen component + init_params (Optional[Dict[str, any]]): + Optional arguments to initialize the chosen component + + Returns: + self: returns an instance of self + """ + new_params = {} + + params = configuration.get_dictionary() + choice = params['__choice__'] + del params['__choice__'] + + for param, value in params.items(): + param = param.replace(choice + ':', '') + new_params[param] = value + + if init_params is not None: + for param, value in init_params.items(): + param = param.replace(choice + ':', '') + new_params[param] = value + + decoder_components = self.get_decoder_components() + + decoder_type = None + + decoder_params = {} + decoder_params_names = [] + for param, value in new_params.items(): + if decoder_type is None: + for decoder_component in decoder_components.keys(): + if param.startswith(decoder_component): + decoder_type = decoder_component + decoder_params_names.append(param) + param = param.replace(decoder_type + ':', '') + decoder_params[param] = value + else: + if param.startswith(decoder_type): + decoder_params_names.append(param) + param = param.replace(decoder_type + ':', '') + decoder_params[param] = value + + for param_name in decoder_params_names: + del new_params[param_name] + + new_params['random_state'] = self.random_state + decoder_params['random_state'] = self.random_state + + self.new_params = new_params + self.encoder_choice = self.get_components()[choice](**new_params) + self.decoder_choice = decoder_components[decoder_type](**decoder_params) + self.choice = Pipeline([('encoder', self.encoder_choice), ('decoder', self.decoder_choice)]) + return self + + @property + def _defaults_network(self): + return ['MLPEncoder'] + + def fit(self, X: Dict[str, Any], y: Any) -> autoPyTorchComponent: + """Handy method to check if a component is fitted + + Args: + X (X: Dict[str, Any]): + Dependencies needed by current component to perform fit + y (Any): + not used. 
To comply with sklearn API + """ + # Allows to use check_is_fitted on the choice object + self.fitted_ = True + assert self.choice is not None, "Cannot call fit without initializing the component" + return self.choice.fit(X, y) + #self.choice.fit(X, y) + #self.choice.transform(X) + #return self.choice + + def transform(self, X: Dict) -> Dict: + assert self.choice is not None, "Cannot call transform before the object is initialized" + return self.choice.transform(X) + + @property + def _defaults_network(self): + return ['MLPEncoder'] + + diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py similarity index 100% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/base_forecasting_encoder.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py similarity index 99% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py index d0111586a..a6e04c17a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py @@ -7,7 +7,7 @@ from ConfigSpace.hyperparameters import CategoricalHyperparameter from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( BaseForecastingEncoder, EncoderNetwork ) from autoPyTorch.pipeline.components.base_component import BaseEstimator diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/NBEATSEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py similarity index 93% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/NBEATSEncoder.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py index 59506f23b..9d77b6e36 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/NBEATSEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py @@ -4,13 +4,13 @@ from ConfigSpace import ConfigurationSpace -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder import ( - BaseForecastingEncoder, EncoderNetwork +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( + BaseForecastingEncoder ) from autoPyTorch.pipeline.components.base_component import BaseEstimator 
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.MLPEncoder import \ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder.MLPEncoder import \ TimeSeriesMLPrecpocessor diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py new file mode 100644 index 000000000..b025f5d08 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py @@ -0,0 +1,54 @@ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import \ + AbstractForecastingEncoderChoice + + +import os +from collections import OrderedDict +from typing import Dict, Union, Optional + +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + base_forecasting_encoder import BaseForecastingEncoder + +directory = os.path.split(__file__)[0] +_encoders = find_components(__package__, + directory, + BaseForecastingEncoder) +_addons = ThirdPartyComponents(BaseForecastingEncoder) + + +def add_encoder(encoder: BaseForecastingEncoder) -> None: + _addons.add_component(encoder) + + +class FlatForecastingEncoderChoice(AbstractForecastingEncoderChoice): + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available backbone components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all basebackbone components available + as choices for learning rate scheduling + """ + components = OrderedDict() + components.update(_encoders) + components.update(_addons.components) + return components + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'SeqEncoder', + 'name': 'SeqEncoder', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py similarity index 98% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/InceptionTimeEncoder.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py index 180db506a..d1f8dee6f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py @@ -9,8 +9,8 @@ from torch import nn from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder import ( - BaseForecastingEncoder, 
EncoderNetwork +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( + BaseForecastingEncoder ) from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py similarity index 98% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py index 5d149e829..c111b7626 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py @@ -1,6 +1,4 @@ -from typing import Any, Dict, Optional, Tuple, List, Union -import warnings -import numpy as np +from typing import Any, Dict, Optional, Tuple, List import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace @@ -14,7 +12,7 @@ from torch import nn from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( BaseForecastingEncoder, EncoderNetwork ) from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py similarity index 99% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py index 83cdf16ed..d3d0059c9 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py @@ -11,11 +11,11 @@ import torch from torch import nn from torch.nn.utils import weight_norm -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( BaseForecastingEncoder, EncoderNetwork ) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter +from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter # _Chomp1d, _TemporalBlock and _TemporalConvNet copied from diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py similarity index 98% rename from 
autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index 30f23f374..82d6b3fc5 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -1,6 +1,4 @@ -from typing import Any, Dict, Optional, Tuple, List, Union -import warnings -import numpy as np +from typing import Any, Dict, Optional, Tuple, List import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace @@ -14,7 +12,7 @@ from torch import nn from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.base_forecasting_encoder import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( BaseForecastingEncoder, EncoderNetwork ) from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py new file mode 100644 index 000000000..67944c2a1 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -0,0 +1,379 @@ +import os +from collections import OrderedDict +from typing import Dict, Optional, List, Any, Union +import numpy as np +from sklearn.pipeline import Pipeline + +from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter +from ConfigSpace.configuration_space import ConfigurationSpace, Configuration +from ConfigSpace.conditions import EqualsCondition, OrConjunction, GreaterThanCondition + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType + +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( + BaseForecastingEncoder, +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import \ + decoders, decoder_addons, add_decoder +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import \ + AbstractForecastingEncoderChoice + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. 
\ + base_forecasting_encoder import BaseForecastingEncoder + +directory = os.path.split(__file__)[0] +_encoders = find_components(__package__, + directory, + BaseForecastingEncoder) +_addons = ThirdPartyComponents(BaseForecastingEncoder) + + +def add_encoder(encoder: BaseForecastingEncoder) -> None: + _addons.add_component(encoder) + + +class ForecastingNetworkStructure(autoPyTorchComponent): + def __init__(self, random_state: Optional[np.random.RandomState] = None, + num_blocks:int = 1, + variable_selection: bool = False, + skip_connection: bool = False) -> None: + super().__init__() + self.num_blocks = num_blocks + self.random_state = random_state + self.variable_selection = variable_selection + self.skip_connection = skip_connection + + def fit(self, X: Dict[str, Any], y: Any = None) -> "ForecastingNetworkStructure": + self.check_requirements(X, y) + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({ + 'num_blocks': self.num_blocks, + 'variable_selection': self.variable_selection, + 'skip_connection': self.skip_connection + }) + return X + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + **kwargs: Any + ) -> ConfigurationSpace: + return ConfigurationSpace() + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'EarlyPreprocessing', + 'name': 'Early Preprocessing Node', + } + + def __str__(self) -> str: + """ Allow a nice understanding of what components where used """ + string = self.__class__.__name__ + return string + + +class SeqForecastingEncoderChoice(AbstractForecastingEncoderChoice): + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available backbone components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all basebackbone components available + as choices for learning rate scheduling + """ + components = OrderedDict() + components.update(_encoders) + components.update(_addons.components) + return components + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", + value_range=(1, 2), + default_value=1), + variable_selection: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="variable_selection", + value_range=(True, False), + default_value=False + ), + skip_connection: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="skip_connection", + value_range=(True, False), + default_value=False), + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on + num_blocks: HyperparameterSearchSpace: number of encoder-decoder structure blocks + variable_selection: HyperparameterSearchSpace: if variable selection is applied, if True, then the first + block will be attached with a variable selection block while the following will be enriched with static + features. + skip_connection: HyperparameterSearchSpace: if skip connection is applied to + default (Optional[str]): Default backbone to use + include: Optional[Dict[str, Any]]: what components to include. 
It is an exhaustive + list, and will exclusively use this components. + exclude: Optional[Dict[str, Any]]: which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + if dataset_properties is None: + dataset_properties = {} + + cs = ConfigurationSpace() + add_hyperparameter(cs, variable_selection, CategoricalHyperparameter) + add_hyperparameter(cs, skip_connection, CategoricalHyperparameter) + + min_num_blocks, max_num_blcoks = num_blocks.value_range + + num_blocks = get_hyperparameter(num_blocks, UniformIntegerHyperparameter) + cs.add_hyperparameter(num_blocks) + + # Compile a list of legal preprocessors for this problem + available_encoders = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + available_decoders = self.get_available_components( + dataset_properties=dataset_properties, + include=None, exclude=None, + components=self.get_decoder_components()) + + if len(available_encoders) == 0: + raise ValueError("No Encoder found") + if len(available_decoders) == 0: + raise ValueError("No Decoder found") + + if default is None: + defaults = self._defaults_network + for default_ in defaults: + if default_ in available_encoders: + default = default_ + break + updates_choice = self._get_search_space_updates() + + for i in range(1, int(max_num_blcoks) + 1): + block_prefix = f'block_{i}:' + + if '__choice__' in updates_choice.keys(): + choice_hyperparameter = updates_choice['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_encoders): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_encoders, + choice_hyperparameter.value_range)) + hp_encoder = CategoricalHyperparameter(block_prefix + '__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + hp_encoder = CategoricalHyperparameter( + block_prefix + '__choice__', + list(available_encoders.keys()), + default_value=default + ) + cs.add_hyperparameter(hp_encoder) + if i > int(min_num_blocks): + cs.add_condition( + GreaterThanCondition(hp_encoder, num_blocks, i - 1) + ) + + decoder2encoder = {key: [] for key in available_decoders.keys()} + encoder2decoder = {} + for encoder_name in hp_encoder.choices: + updates = self._get_search_space_updates(prefix=block_prefix + encoder_name) + config_space = available_encoders[encoder_name].get_hyperparameter_search_space(dataset_properties, + # type: ignore + **updates) + parent_hyperparameter = {'parent': hp_encoder, 'value': encoder_name} + cs.add_configuration_space( + block_prefix + encoder_name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + allowed_decoders = available_encoders[encoder_name].allowed_decoders() + if len(allowed_decoders) > 1: + if 'decoder_type' not in config_space: + raise ValueError('When a specific encoder has more than one allowed decoder, its ConfigSpace' + 'must contain the hyperparameter "decoder_type" ! 
Please check your encoder ' + 'setting!') + hp_decoder_choice = config_space.get_hyperparameter('decoder_type').choices + if not set(hp_decoder_choice).issubset(allowed_decoders): + raise ValueError( + 'The encoder hyperparameter decoder_type must be a subset of the allowed_decoders') + allowed_decoders = hp_decoder_choice + for decoder_name in allowed_decoders: + decoder2encoder[decoder_name].append(encoder_name) + encoder2decoder[encoder_name] = allowed_decoders + + for decoder_name in available_decoders.keys(): + if not decoder2encoder[decoder_name]: + continue + updates = self._get_search_space_updates(prefix=block_prefix + decoder_name) + config_space = available_decoders[decoder_name].get_hyperparameter_search_space(dataset_properties, + # type: ignore + **updates) + compatible_encoders = decoder2encoder[decoder_name] + encoders_with_multi_decoder = [] + encoder_with_uni_decoder = [] + # this could happen if its parent encoder is not part of + inactive_decoder = [] + for encoder in compatible_encoders: + if len(encoder2decoder[encoder]) > 1: + encoders_with_multi_decoder.append(encoder) + else: + encoder_with_uni_decoder.append(encoder) + + cs.add_configuration_space( + block_prefix + decoder_name, + config_space, + # parent_hyperparameter=parent_hyperparameter + ) + hps = cs.get_hyperparameters() # type: List[CSH.Hyperparameter] + conditions_to_add = [] + for hp in hps: + # TODO consider if this will raise any unexpected behavior + if hp.name.startswith(block_prefix + decoder_name): + # From the implementation of ConfigSpace + # Only add a condition if the parameter is a top-level + # parameter of the new configuration space (this will be some + # kind of tree structure). + if cs.get_parents_of(hp): + continue + or_cond = [] + for encoder_uni in encoder_with_uni_decoder: + or_cond.append(EqualsCondition(hp, + hp_encoder, + encoder_uni)) + for encode_multi in encoders_with_multi_decoder: + hp_decoder_type = cs.get_hyperparameter(f'{block_prefix + encode_multi}:decoder_type') + or_cond.append(EqualsCondition(hp, + hp_decoder_type, + decoder_name)) + if len(or_cond) == 0: + continue + elif len(or_cond) > 1: + conditions_to_add.append(OrConjunction(*or_cond)) + else: + conditions_to_add.append(or_cond[0]) + cs.add_conditions(conditions_to_add) + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def set_hyperparameters(self, + configuration: Configuration, + init_params: Optional[Dict[str, Any]] = None + ) -> 'autoPyTorchChoice': + """ + Applies a configuration to the given component. + This method translate a hierarchical configuration key, + to an actual parameter of the autoPyTorch component. 
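
The per-block search space assembled above combines three ConfigSpace mechanisms: an integer num_blocks hyperparameter, one categorical __choice__ hyperparameter per block that is only active once num_blocks is large enough (GreaterThanCondition), and decoder hyperparameters that are activated through an OrConjunction of EqualsConditions over the compatible encoders. A minimal standalone sketch of that conditioning pattern follows; the hyperparameter names and ranges are invented for illustration and are not the ones registered by the components in this patch.

    from ConfigSpace.configuration_space import ConfigurationSpace
    from ConfigSpace.conditions import EqualsCondition, GreaterThanCondition, OrConjunction
    from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter

    cs = ConfigurationSpace()

    # top-level structural hyperparameter
    num_blocks = UniformIntegerHyperparameter("num_blocks", lower=1, upper=2, default_value=1)

    # encoder choice of the second block (illustrative choices only)
    block_2_encoder = CategoricalHyperparameter("block_2:__choice__",
                                                ["RNNEncoder", "TCNEncoder"],
                                                default_value="RNNEncoder")

    # a made-up decoder hyperparameter that both encoders above are compatible with
    decoder_units = UniformIntegerHyperparameter("block_2:MLPDecoder:units_layer_1",
                                                 lower=16, upper=512, default_value=64)

    cs.add_hyperparameters([num_blocks, block_2_encoder, decoder_units])

    # block 2 only becomes active once more than one block is requested
    cs.add_condition(GreaterThanCondition(block_2_encoder, num_blocks, 1))

    # the decoder hyperparameter is active whenever any compatible encoder is chosen
    cs.add_condition(OrConjunction(
        EqualsCondition(decoder_units, block_2_encoder, "RNNEncoder"),
        EqualsCondition(decoder_units, block_2_encoder, "TCNEncoder"),
    ))

    print(cs.sample_configuration())
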
+ + Args: + configuration (Configuration): + Which configuration to apply to the chosen component + init_params (Optional[Dict[str, any]]): + Optional arguments to initialize the chosen component + + Returns: + self: returns an instance of self + """ + new_params = {} + + params = configuration.get_dictionary() + + num_blocks = params['num_blocks'] + variable_selection = params['variable_selection'] + skip_connection = params['skip_connection'] + del params['num_blocks'] + del params['variable_selection'] + del params['skip_connection'] + + pipeline_steps = [ForecastingNetworkStructure(random_state=self.random_state, + num_blocks=num_blocks, + variable_selection=variable_selection, + skip_connection=skip_connection)] + self.encoder_choice = [] + self.decoder_choice = [] + + for i in range(1, num_blocks + 1): + block_prefix = f'block_{i}:' + choice = params[block_prefix + '__choice__'] + del params[block_prefix + '__choice__'] + + for param, value in params.items(): + if param.startswith(block_prefix): + param = param.replace(block_prefix + choice + ':', '') + new_params[param] = value + + if init_params is not None: + for param, value in init_params.items(): + if param.startswith(block_prefix): + param = param.replace(block_prefix + choice + ':', '') + new_params[param] = value + + decoder_components = self.get_decoder_components() + + decoder_type = None + + decoder_params = {} + decoder_params_names = [] + for param, value in new_params.items(): + if decoder_type is None: + for decoder_component in decoder_components.keys(): + if param.startswith(block_prefix + decoder_component): + decoder_type = decoder_component + decoder_params_names.append(param) + param = param.replace(block_prefix + decoder_type + ':', '') + decoder_params[param] = value + else: + if param.startswith(block_prefix + decoder_type): + decoder_params_names.append(param) + param = param.replace(block_prefix + decoder_type + ':', '') + decoder_params[param] = value + + for param_name in decoder_params_names: + del new_params[param_name] + + new_params['random_state'] = self.random_state + decoder_params['random_state'] = self.random_state + encoder = self.get_components()[choice](**new_params) + decoder = decoder_components[decoder_type](**decoder_params) + pipeline_steps.extend([(f'encoder_{i}', encoder), f'decoder_{i}', decoder]) + self.encoder_choice.append(encoder) + self.decoder_choice.append(decoder) + + self.choice = Pipeline(pipeline_steps) + return self + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'SeqEncoder', + 'name': 'SeqEncoder', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py index b4b9214ab..3e21d91db 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py @@ -6,7 +6,8 @@ import numpy as np from torch import nn -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.NBEATSDecoder import NBEATSBLock +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.\ + forecasting_decoder.NBEATSDecoder import NBEATSBLock class 
TransposeLinear(nn.Module): diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index eb9b9cef3..f211af861 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -27,9 +27,7 @@ from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent from autoPyTorch.pipeline.components.setup.network_embedding import NetworkEmbeddingChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone import ForecastingBackboneChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import \ - ForecastingDecoderChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone import ForecastingNetworkChoice from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead from autoPyTorch.pipeline.components.setup.network_initializer import ( NetworkInitializerChoice @@ -374,8 +372,8 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L ("data_loader", TimeSeriesForecastingDataLoader(random_state=self.random_state)), ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties, random_state=self.random_state)), - ("network_backbone", ForecastingBackboneChoice(dataset_properties=default_dataset_properties, - random_state=self.random_state)), + ("network_backbone", ForecastingNetworkChoice(dataset_properties=default_dataset_properties, + random_state=self.random_state)), ("network_head", ForecastingHead(random_state=self.random_state)), ("network", ForecastingNetworkComponent(random_state=self.random_state)), ("network_init", NetworkInitializerChoice(default_dataset_properties, From 5617db6ea0cadcb71833bb6168bbd3b592d0fc1d Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 20 Feb 2022 16:48:21 +0100 Subject: [PATCH 161/347] maint --- .../forecasting_training_loss/QuantileLoss.py | 49 +++++++++++ .../forecasting_backbone/__init__.py | 87 ++++++++++++++----- .../forecasting_encoder/__init__.py | 14 +-- .../flat_encoder/MLPEncoder.py | 2 +- .../flat_encoder/__init__.py | 1 + .../seq_encoder/InceptionTimeEncoder.py | 4 +- .../seq_encoder/TCNEncoder.py | 2 +- .../seq_encoder/__init__.py | 17 ++-- 8 files changed, 133 insertions(+), 43 deletions(-) create mode 100644 autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py new file mode 100644 index 000000000..3fac5e68b --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py @@ -0,0 +1,49 @@ +from typing import Optional, Dict, Union, Any +import numpy as np + +from ConfigSpace import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter + +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import \ + ForecastingLossComponents +from autoPyTorch.pipeline.components.training.losses import LogProbLoss +from autoPyTorch.utils.common import 
HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter, FitRequirement + + +class QuantileLoss(): + loss = LogProbLoss + required_net_out_put_type = 'quantile' + + def __init__(self, + random_state: Optional[np.random.RandomState] = None, + ): + super(QuantileLoss, self).__init__() + self.random_state = random_state + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'QuantileLoss', + 'name': 'QuantileLoss', + "handles_tabular": False, + "handles_image": False, + "handles_time_series": True, + 'handles_regression': False, + 'handles_classification': False + } + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + required_padding_value = ALL_DISTRIBUTIONS[self.dist_cls].value_in_support + X.update({"dist_cls": self.dist_cls, + "required_padding_value": required_padding_value}) + return super().transform(X) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index 357fe0a05..77dc9d025 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -36,6 +36,7 @@ class ForecastingNetworkChoice(autoPyTorchChoice): the choice of encoder. Thus here "choice" indicates the choice of encoder, then decoder will be determined by the encoder. """ + def __init__(self, **kwargs, ): @@ -43,6 +44,12 @@ def __init__(self, self.include_components = None self.exclude_components = None + self.default_components = OrderedDict( + {"flat_encoder": FlatForecastingEncoderChoice(dataset_properties=self.dataset_properties, + random_state=self.random_state), + "seq_encoder": SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties, + random_state=self.random_state)}) + def get_components(self) -> Dict[str, autoPyTorchComponent]: """Returns the available backbone components @@ -53,17 +60,14 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: Dict[str, autoPyTorchComponent]: all basebackbone components available as choices for learning rate scheduling """ - components = OrderedDict() - components.update({"flat_encoder": FlatForecastingEncoderChoice, - "seq_encoder": SeqForecastingEncoderChoice}) - return components + return self.default_components def get_available_components( - self, - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - include: List[str] = None, - exclude: List[str] = None, - components: Optional[Dict[str, autoPyTorchComponent]] = None + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + include: List[str] = None, + exclude: List[str] = None, + components: Optional[Dict[str, autoPyTorchComponent]] = None ) -> Dict[str, autoPyTorchComponent]: """Filters out components based on user provided include/exclude directives, as well as the dataset properties @@ -155,13 +159,12 @@ def get_available_components( return components_dict - def get_hyperparameter_search_space( - self, - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = 
None, - exclude: Optional[List[str]] = None, + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, ) -> ConfigurationSpace: """Returns the configuration space of the current chosen components @@ -207,8 +210,8 @@ def get_hyperparameter_search_space( available_encoders, choice_hyperparameter.value_range)) hp_encoder = CSH.CategoricalHyperparameter('__choice__', - choice_hyperparameter.value_range, - default_value=choice_hyperparameter.default_value) + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) else: hp_encoder = CSH.CategoricalHyperparameter( '__choice__', @@ -227,8 +230,6 @@ def get_hyperparameter_search_space( if exclude is not None: if name in self.exclude_components: exclude_encoder = self.exclude_components[name] - import pdb - pdb.set_trace() config_space = available_encoders[name].get_hyperparameter_search_space( dataset_properties=dataset_properties, # type: ignore @@ -244,11 +245,49 @@ def get_hyperparameter_search_space( self.configuration_space_ = cs self.dataset_properties_ = dataset_properties + return cs - def _apply_search_space_update(self, hyperparameter_search_space_update: HyperparameterSearchSpaceUpdate) -> None: - self._cs_updates[hyperparameter_search_space_update.hyperparameter] = hyperparameter_search_space_update + def set_hyperparameters(self, + configuration: Configuration, + init_params: Optional[Dict[str, Any]] = None + ) -> 'autoPyTorchChoice': + new_params = {} + + params = configuration.get_dictionary() + choice = params['__choice__'] + del params['__choice__'] + + for param, value in params.items(): + param = param.replace(choice + ':', '') + new_params[param] = value + if init_params is not None: + for param, value in init_params.items(): + param = param.replace(choice + ':', '') + new_params[param] = value + + choice_component = self.get_components()[choice] + + self.new_params = new_params + sub_configuration_space = choice_component.get_hyperparameter_search_space( # type: ignore[call-arg] + self.dataset_properties, + ) + + sub_configuration = Configuration(sub_configuration_space, + values=new_params) + self.choice = choice_component.set_hyperparameters(sub_configuration) + + return self + + def _apply_search_space_update(self, hyperparameter_search_space_update: HyperparameterSearchSpaceUpdate) -> None: + sub_module_name = hyperparameter_search_space_update.hyperparameter.split(':') + if sub_module_name[1] == '__choice__': + super()._apply_search_space_update(hyperparameter_search_space_update) + else: + # TODO create a new update and consider special HPs for seq encoder!!! 
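
The set_hyperparameters implementation above relies purely on the naming convention of the flattened configuration keys: every key that belongs to the selected sub-choice carries that choice's name as a prefix, and the prefix is stripped before the values are handed down. A self-contained sketch of that key-routing step, with invented keys (this only illustrates the convention, it is not the component code):

    from typing import Any, Dict, Tuple

    def split_by_choice(params: Dict[str, Any]) -> Tuple[str, Dict[str, Any]]:
        """Strip the chosen sub-component's prefix from a flattened configuration dict."""
        choice = params.pop("__choice__")
        sub_params = {}
        for name, value in params.items():
            # e.g. "seq_encoder:block_1:__choice__" -> "block_1:__choice__"
            sub_params[name.replace(f"{choice}:", "", 1)] = value
        return choice, sub_params

    flat = {
        "__choice__": "seq_encoder",
        "seq_encoder:num_blocks": 1,
        "seq_encoder:block_1:__choice__": "RNNEncoder",
        "seq_encoder:block_1:RNNEncoder:hidden_size": 64,   # hypothetical hyperparameter name
    }
    choice, sub = split_by_choice(dict(flat))
    print(choice)  # seq_encoder
    print(sub)     # keys are now relative to the seq_encoder sub-space
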
+ update_sub_module = hyperparameter_search_space_update.get_search_space(sub_module_name[0]) + self.get_components()[sub_module_name]._apply_search_space_update(update_sub_module) @property def _defaults_network(self): @@ -268,9 +307,9 @@ def fit(self, X: Dict[str, Any], y: Any) -> autoPyTorchComponent: self.fitted_ = True assert self.choice is not None, "Cannot call fit without initializing the component" return self.choice.fit(X, y) - #self.choice.fit(X, y) - #self.choice.transform(X) - #return self.choice + # self.choice.fit(X, y) + # self.choice.transform(X) + # return self.choice def transform(self, X: Dict) -> Dict: assert self.choice is not None, "Cannot call transform before the object is initialized" diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index b027b730e..5ed64d0eb 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -45,7 +45,7 @@ def __init__(self, **kwargs, ): super().__init__(**kwargs) - self.encoder_choice = None + self.pipeline = None self.decoder_choice = None @abstractmethod @@ -355,9 +355,9 @@ def set_hyperparameters(self, decoder_params['random_state'] = self.random_state self.new_params = new_params - self.encoder_choice = self.get_components()[choice](**new_params) + self.choice = self.get_components()[choice](**new_params) self.decoder_choice = decoder_components[decoder_type](**decoder_params) - self.choice = Pipeline([('encoder', self.encoder_choice), ('decoder', self.decoder_choice)]) + self.pipeline = Pipeline([('encoder', self.choice), ('decoder', self.decoder_choice)]) return self @property @@ -375,15 +375,15 @@ def fit(self, X: Dict[str, Any], y: Any) -> autoPyTorchComponent: """ # Allows to use check_is_fitted on the choice object self.fitted_ = True - assert self.choice is not None, "Cannot call fit without initializing the component" - return self.choice.fit(X, y) + assert self.pipeline is not None, "Cannot call fit without initializing the component" + return self.pipeline.fit(X, y) #self.choice.fit(X, y) #self.choice.transform(X) #return self.choice def transform(self, X: Dict) -> Dict: - assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) + assert self.pipeline is not None, "Cannot call transform before the object is initialized" + return self.pipeline.transform(X) @property def _defaults_network(self): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py index a6e04c17a..69e0d9697 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py @@ -129,7 +129,7 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_groups: HyperparameterSearchSpace = 
HyperparameterSearchSpace(hyperparameter="num_groups", - value_range=(1, 10), + value_range=(1, 5), default_value=3, ), activation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="activation", diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py index b025f5d08..01c0ddd82 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py @@ -52,3 +52,4 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT 'handles_image': False, 'handles_time_series': True, } + diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py index d1f8dee6f..0eeea102a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py @@ -172,8 +172,8 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", - value_range=(1, 10), - default_value=5, + value_range=(1, 5), + default_value=3, ), num_filters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_filters", value_range=(4, 64), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py index d3d0059c9..b74198935 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py @@ -164,7 +164,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", - value_range=(1, 6), + value_range=(1, 4), default_value=3), num_filters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_filters", value_range=(4, 64), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index 67944c2a1..b66911d77 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ 
-43,7 +43,7 @@ def add_encoder(encoder: BaseForecastingEncoder) -> None: class ForecastingNetworkStructure(autoPyTorchComponent): def __init__(self, random_state: Optional[np.random.RandomState] = None, - num_blocks:int = 1, + num_blocks: int = 1, variable_selection: bool = False, skip_connection: bool = False) -> None: super().__init__() @@ -61,7 +61,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: 'num_blocks': self.num_blocks, 'variable_selection': self.variable_selection, 'skip_connection': self.skip_connection - }) + }) return X @staticmethod @@ -310,10 +310,10 @@ def set_hyperparameters(self, del params['variable_selection'] del params['skip_connection'] - pipeline_steps = [ForecastingNetworkStructure(random_state=self.random_state, - num_blocks=num_blocks, - variable_selection=variable_selection, - skip_connection=skip_connection)] + pipeline_steps = [('net_structure', ForecastingNetworkStructure(random_state=self.random_state, + num_blocks=num_blocks, + variable_selection=variable_selection, + skip_connection=skip_connection))] self.encoder_choice = [] self.decoder_choice = [] @@ -360,11 +360,12 @@ def set_hyperparameters(self, decoder_params['random_state'] = self.random_state encoder = self.get_components()[choice](**new_params) decoder = decoder_components[decoder_type](**decoder_params) - pipeline_steps.extend([(f'encoder_{i}', encoder), f'decoder_{i}', decoder]) + pipeline_steps.extend([(f'encoder_{i}', encoder), (f'decoder_{i}', decoder)]) self.encoder_choice.append(encoder) self.decoder_choice.append(decoder) - self.choice = Pipeline(pipeline_steps) + self.pipeline = Pipeline(pipeline_steps) + self.choice = self.encoder_choice[0] return self @staticmethod From d04cb04b5ca3bc0f3193de774474c69225c56b82 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 20 Feb 2022 17:01:26 +0100 Subject: [PATCH 162/347] correct hp updates --- .../forecasting_backbone/__init__.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index 77dc9d025..dbc04f25a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -281,12 +281,19 @@ def set_hyperparameters(self, return self def _apply_search_space_update(self, hyperparameter_search_space_update: HyperparameterSearchSpaceUpdate) -> None: - sub_module_name = hyperparameter_search_space_update.hyperparameter.split(':') - if sub_module_name[1] == '__choice__': + sub_module_name_component = hyperparameter_search_space_update.hyperparameter.split(':') + if len(sub_module_name_component) <= 2: super()._apply_search_space_update(hyperparameter_search_space_update) else: + sub_module_name = sub_module_name_component[0] # TODO create a new update and consider special HPs for seq encoder!!! 
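
This change to _apply_search_space_update dispatches on the depth of the hyperparameter name: the name is split on ':', names of at most two segments are handled by the choice itself, and deeper names are re-packaged with the leading sub-module segment stripped and forwarded to that sub-module. A rough standalone sketch of the dispatch idea, using a stand-in dataclass instead of AutoPyTorch's HyperparameterSearchSpaceUpdate and an invented hyperparameter name:

    from dataclasses import dataclass, replace
    from typing import Any, Tuple

    @dataclass
    class Update:            # stand-in for HyperparameterSearchSpaceUpdate
        node_name: str
        hyperparameter: str
        value_range: Tuple
        default_value: Any

    def route_update(update: Update) -> Tuple[str, Update]:
        """Return (target sub-module, update rewritten relative to that sub-module)."""
        parts = update.hyperparameter.split(":")
        if len(parts) <= 2:
            return "self", update                      # handled by this choice directly
        sub_module = parts[0]
        stripped = update.hyperparameter.replace(f"{sub_module}:", "", 1)
        return sub_module, replace(update, hyperparameter=stripped)

    u = Update("network_backbone", "seq_encoder:block_1:RNNEncoder:hidden_size", (32, 128), 64)
    print(route_update(u))
    # -> ('seq_encoder', Update(..., hyperparameter='block_1:RNNEncoder:hidden_size', ...))
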
- update_sub_module = hyperparameter_search_space_update.get_search_space(sub_module_name[0]) + update_sub_module = HyperparameterSearchSpaceUpdate( + hyperparameter_search_space_update.node_name, + hyperparameter=hyperparameter_search_space_update.hyperparameter.replace(f'{sub_module_name}:', ''), + value_range=hyperparameter_search_space_update.value_range, + default_value=hyperparameter_search_space_update.default_value, + log=hyperparameter_search_space_update.log + ) self.get_components()[sub_module_name]._apply_search_space_update(update_sub_module) @property @@ -307,14 +314,7 @@ def fit(self, X: Dict[str, Any], y: Any) -> autoPyTorchComponent: self.fitted_ = True assert self.choice is not None, "Cannot call fit without initializing the component" return self.choice.fit(X, y) - # self.choice.fit(X, y) - # self.choice.transform(X) - # return self.choice def transform(self, X: Dict) -> Dict: assert self.choice is not None, "Cannot call transform before the object is initialized" return self.choice.transform(X) - - @property - def _defaults_network(self): - return ['MLPEncoder'] From 7881bb5561d16702a46da54703c9ed0d1e8d67b8 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 21 Feb 2022 19:59:28 +0100 Subject: [PATCH 163/347] first trial on nested conjunction --- .../forecasting_decoder/MLPDecoder.py | 23 +- .../base_forecasting_decoder.py | 1 - .../forecasting_encoder/__init__.py | 6 +- .../seq_encoder/__init__.py | 204 +++++++++++------- 4 files changed, 141 insertions(+), 93 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index 5c044381e..bdfb4250e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -22,7 +22,7 @@ def _build_decoder(self, layers = [] in_features = input_shape[-1] num_decoder_output_features = in_features - if self.config["num_layers"] > 0: + if 'num_layers' in self.config and self.config["num_layers"] > 0: for i in range(1, self.config["num_layers"]): layers.append(nn.Linear(in_features=in_features, out_features=self.config[f"units_layer_{i}"])) @@ -71,9 +71,7 @@ def get_hyperparameter_search_space( value_range=tuple(_activations.keys()), default_value=list(_activations.keys())[ 0]), - auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="auto_regressive", - value_range=(True, False), - default_value=False), + auto_regressive: bool = False, has_local_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='has_local_layer', value_range=(True, False), default_value=True), @@ -102,7 +100,8 @@ def get_hyperparameter_search_space( could start from 0) units_layer (HyperparameterSearchSpace): number of units of each layer (except for the last layer) activation (HyperparameterSearchSpace): activation function - auto_regressive (HyperparameterSearchSpace): if the model acts as a DeepAR model + auto_regressive (bool): if the model acts as a DeepAR model, the corresponding hyperparaemter is + controlled by seq_encoder has_local_layer (HyperparameterSearchSpace): if local MLP layer is applied, if not, the output of the network will be directly attached with different heads units_local_layer (HyperparameterSearchSpace): number of units of local 
layer. The size of this layer is @@ -147,14 +146,12 @@ def get_hyperparameter_search_space( # add_hyperparameter(cs, units_final_layer, UniformIntegerHyperparameter) - # TODO let dataset_properties decide if auto_regressive models is applicable - auto_regressive = get_hyperparameter(auto_regressive, CategoricalHyperparameter) - has_local_layer = get_hyperparameter(has_local_layer, CategoricalHyperparameter) - units_local_layer = get_hyperparameter(units_local_layer, UniformIntegerHyperparameter) + if not auto_regressive: + has_local_layer = get_hyperparameter(has_local_layer, CategoricalHyperparameter) + units_local_layer = get_hyperparameter(units_local_layer, UniformIntegerHyperparameter) - cond_use_local_layer = EqualsCondition(has_local_layer, auto_regressive, False) - cond_units_local_layer = EqualsCondition(units_local_layer, has_local_layer, True) - cs.add_hyperparameters([auto_regressive, has_local_layer, units_local_layer]) - cs.add_conditions([cond_use_local_layer, cond_units_local_layer]) + cond_units_local_layer = EqualsCondition(units_local_layer, has_local_layer, True) + cs.add_hyperparameters([has_local_layer, units_local_layer]) + cs.add_conditions([cond_units_local_layer]) return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index c1783b460..9094b679d 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -36,7 +36,6 @@ def __init__(self, **kwargs: Any): super().__init__() self.add_fit_requirements(self._required_fit_requirements) - self.auto_regressive = kwargs.get('auto_regressive', False) self.config = kwargs self.decoder: Optional[nn.Module] = None diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index 5ed64d0eb..16091477d 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -250,8 +250,7 @@ def get_hyperparameter_search_space( compatible_encoders = decoder2encoder[decoder_name] encoders_with_multi_decoder = [] encoder_with_uni_decoder = [] - # this could happen if its parent encoder is not part of - inactive_decoder = [] + for encoder in compatible_encoders: if len(encoder2decoder[encoder]) > 1: encoders_with_multi_decoder.append(encoder) @@ -377,9 +376,6 @@ def fit(self, X: Dict[str, Any], y: Any) -> autoPyTorchComponent: self.fitted_ = True assert self.pipeline is not None, "Cannot call fit without initializing the component" return self.pipeline.fit(X, y) - #self.choice.fit(X, y) - #self.choice.transform(X) - #return self.choice def transform(self, X: Dict) -> Dict: assert self.pipeline is not None, "Cannot call transform before the object is initialized" diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index b66911d77..911073e89 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -6,7 +6,9 @@ from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter from ConfigSpace.configuration_space import ConfigurationSpace, Configuration -from ConfigSpace.conditions import EqualsCondition, OrConjunction, GreaterThanCondition +from ConfigSpace.conditions import ( + EqualsCondition, OrConjunction, GreaterThanCondition, NotEqualsCondition, AndConjunction +) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType @@ -45,12 +47,14 @@ class ForecastingNetworkStructure(autoPyTorchComponent): def __init__(self, random_state: Optional[np.random.RandomState] = None, num_blocks: int = 1, variable_selection: bool = False, - skip_connection: bool = False) -> None: + skip_connection: bool = False, + auto_regressive: str = 'not_applied') -> None: super().__init__() self.num_blocks = num_blocks self.random_state = random_state self.variable_selection = variable_selection self.skip_connection = skip_connection + self.auto_regressive = auto_regressive def fit(self, X: Dict[str, Any], y: Any = None) -> "ForecastingNetworkStructure": self.check_requirements(X, y) @@ -60,7 +64,8 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({ 'num_blocks': self.num_blocks, 'variable_selection': self.variable_selection, - 'skip_connection': self.skip_connection + 'skip_connection': self.skip_connection, + 'auto_regressive': self.auto_regressive, }) return X @@ -86,6 +91,8 @@ def __str__(self) -> str: class SeqForecastingEncoderChoice(AbstractForecastingEncoderChoice): + deepAR_decoder_name = 'MLPDecoder' + deepAR_decoder_prefix = 'deepAR_decoder' def get_components(self) -> Dict[str, autoPyTorchComponent]: """Returns the available backbone components @@ -115,6 +122,11 @@ def get_hyperparameter_search_space( skip_connection: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="skip_connection", value_range=(True, False), default_value=False), + auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="auto_regressive", + value_range=('encoder', 'decoder'), + default_value='encoder', + ), default: Optional[str] = None, include: Optional[List[str]] = None, exclude: Optional[List[str]] = None, @@ -127,7 +139,10 @@ def get_hyperparameter_search_space( variable_selection: HyperparameterSearchSpace: if variable selection is applied, if True, then the first block will be attached with a variable selection block while the following will be enriched with static features. - skip_connection: HyperparameterSearchSpace: if skip connection is applied to + skip_connection: HyperparameterSearchSpace: if skip connection is applied + auto_regressive: HyperparameterSearchSpace: if auto-regressive is applied, depending on the choice, + auto-regressive strategy is applied to either encoder (DeepAR model), decoder or not applied (in which case + ) default (Optional[str]): Default backbone to use include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive list, and will exclusively use this components. 
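
Both the per-block encoder sub-spaces and, further below, the DeepAR decoder attached under auto_regressive == 'encoder' rely on ConfigSpace's add_configuration_space with a parent_hyperparameter: the nested space is prefixed with the component name and is only active while the parent categorical takes the given value. A minimal sketch of that nesting mechanism, with a made-up hyperparameter:

    from ConfigSpace.configuration_space import ConfigurationSpace
    from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter

    cs = ConfigurationSpace()
    encoder_choice = CategoricalHyperparameter("block_1:__choice__", ["RNNEncoder", "TCNEncoder"])
    cs.add_hyperparameter(encoder_choice)

    # sub-space of one component; hyperparameter name is illustrative only
    rnn_space = ConfigurationSpace()
    rnn_space.add_hyperparameter(UniformIntegerHyperparameter("hidden_size", 16, 512, default_value=64))

    # nesting: the name becomes "block_1:RNNEncoder:hidden_size" and the whole sub-space
    # is only active while the parent categorical equals "RNNEncoder"
    cs.add_configuration_space(
        "block_1:RNNEncoder",
        rnn_space,
        parent_hyperparameter={"parent": encoder_choice, "value": "RNNEncoder"},
    )
    print(cs.sample_configuration())
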
@@ -147,7 +162,8 @@ def get_hyperparameter_search_space( min_num_blocks, max_num_blcoks = num_blocks.value_range num_blocks = get_hyperparameter(num_blocks, UniformIntegerHyperparameter) - cs.add_hyperparameter(num_blocks) + auto_regressive = get_hyperparameter(auto_regressive, CategoricalHyperparameter) + cs.add_hyperparameters([num_blocks, auto_regressive]) # Compile a list of legal preprocessors for this problem available_encoders = self.get_available_components( @@ -226,59 +242,81 @@ def get_hyperparameter_search_space( decoder2encoder[decoder_name].append(encoder_name) encoder2decoder[encoder_name] = allowed_decoders - for decoder_name in available_decoders.keys(): - if not decoder2encoder[decoder_name]: - continue - updates = self._get_search_space_updates(prefix=block_prefix + decoder_name) + if len(auto_regressive.choices) > 1 or auto_regressive.choices[0] != 'encoder': + for decoder_name in available_decoders.keys(): + if not decoder2encoder[decoder_name]: + continue + updates = self._get_search_space_updates(prefix=block_prefix + decoder_name) + config_space = available_decoders[decoder_name].get_hyperparameter_search_space(dataset_properties, + # type: ignore + **updates) + compatible_encoders = decoder2encoder[decoder_name] + encoders_with_multi_decoder = [] + encoder_with_single_decoder = [] + for encoder in compatible_encoders: + if len(encoder2decoder[encoder]) > 1: + encoders_with_multi_decoder.append(encoder) + else: + encoder_with_single_decoder.append(encoder) + + cs.add_configuration_space( + block_prefix + decoder_name, + config_space, + # parent_hyperparameter=parent_hyperparameter + ) + + hps = cs.get_hyperparameters() # type: List[CSH.Hyperparameter] + conditions_to_add = [] + for hp in hps: + # TODO consider if this will raise any unexpected behavior + if hp.name.startswith(block_prefix + decoder_name): + # From the implementation of ConfigSpace + # Only add a condition if the parameter is a top-level + # parameter of the new configuration space (this will be some + # kind of tree structure). 
+ if cs.get_parents_of(hp): + continue + or_cond = [] + for encoder_single in encoder_with_single_decoder: + or_cond.append(EqualsCondition(hp, + hp_encoder, + encoder_single)) + for encode_multi in encoders_with_multi_decoder: + hp_decoder_type = cs.get_hyperparameter(f'{block_prefix + encode_multi}:decoder_type') + or_cond.append(EqualsCondition(hp, hp_decoder_type, decoder_name)) + if len(or_cond) == 1: + conditions_to_add.append( + AndConjunction(or_cond[0], + NotEqualsCondition(hp, auto_regressive, 'encoder')) + ) + elif len(or_cond) > 1: + conditions_to_add.append( + AndConjunction(OrConjunction(*or_cond), + NotEqualsCondition(hp, auto_regressive, 'encoder')) + ) + + cs.add_conditions(conditions_to_add) + + if 'encoder' in auto_regressive.choices: + decoder_name = self.deepAR_decoder_name + if decoder_name in available_decoders: + updates = self._get_search_space_updates(prefix=self.deepAR_decoder_prefix + decoder_name) + updates['auto_regressive'] = True config_space = available_decoders[decoder_name].get_hyperparameter_search_space(dataset_properties, # type: ignore **updates) - compatible_encoders = decoder2encoder[decoder_name] - encoders_with_multi_decoder = [] - encoder_with_uni_decoder = [] - # this could happen if its parent encoder is not part of - inactive_decoder = [] - for encoder in compatible_encoders: - if len(encoder2decoder[encoder]) > 1: - encoders_with_multi_decoder.append(encoder) - else: - encoder_with_uni_decoder.append(encoder) - + parent_hyperparameter = {'parent': auto_regressive, 'value': 'encoder'} cs.add_configuration_space( - block_prefix + decoder_name, + self.deepAR_decoder_prefix + decoder_name, config_space, - # parent_hyperparameter=parent_hyperparameter + parent_hyperparameter=parent_hyperparameter ) - hps = cs.get_hyperparameters() # type: List[CSH.Hyperparameter] - conditions_to_add = [] - for hp in hps: - # TODO consider if this will raise any unexpected behavior - if hp.name.startswith(block_prefix + decoder_name): - # From the implementation of ConfigSpace - # Only add a condition if the parameter is a top-level - # parameter of the new configuration space (this will be some - # kind of tree structure). 
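
The new conjunctions encode "this decoder hyperparameter is active only if a compatible encoder is selected AND the auto-regressive mode is not 'encoder'": in DeepAR ('encoder') mode the per-block decoders are switched off entirely. A short standalone sketch of that AndConjunction/NotEqualsCondition combination, reduced to a single compatible encoder and made-up names:

    from ConfigSpace.configuration_space import ConfigurationSpace
    from ConfigSpace.conditions import AndConjunction, EqualsCondition, NotEqualsCondition
    from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter

    cs = ConfigurationSpace()
    auto_regressive = CategoricalHyperparameter("auto_regressive", ["encoder", "decoder"])
    encoder = CategoricalHyperparameter("block_1:__choice__", ["RNNEncoder", "TCNEncoder"])
    decoder_units = UniformIntegerHyperparameter("block_1:MLPDecoder:units_layer_1", 16, 512)
    cs.add_hyperparameters([auto_regressive, encoder, decoder_units])

    # active only when the compatible encoder is chosen and we are not in DeepAR mode
    cs.add_condition(AndConjunction(
        EqualsCondition(decoder_units, encoder, "RNNEncoder"),
        NotEqualsCondition(decoder_units, auto_regressive, "encoder"),
    ))
    print(cs.sample_configuration())
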
- if cs.get_parents_of(hp): - continue - or_cond = [] - for encoder_uni in encoder_with_uni_decoder: - or_cond.append(EqualsCondition(hp, - hp_encoder, - encoder_uni)) - for encode_multi in encoders_with_multi_decoder: - hp_decoder_type = cs.get_hyperparameter(f'{block_prefix + encode_multi}:decoder_type') - or_cond.append(EqualsCondition(hp, - hp_decoder_type, - decoder_name)) - if len(or_cond) == 0: - continue - elif len(or_cond) > 1: - conditions_to_add.append(OrConjunction(*or_cond)) - else: - conditions_to_add.append(or_cond[0]) - cs.add_conditions(conditions_to_add) + self.configuration_space_ = cs self.dataset_properties_ = dataset_properties + + cs.get_parent_conditions_of('block_1:MLPDecoder:has_local_layer') + return cs def set_hyperparameters(self, @@ -306,6 +344,7 @@ def set_hyperparameters(self, num_blocks = params['num_blocks'] variable_selection = params['variable_selection'] skip_connection = params['skip_connection'] + auto_regressive = params['auto_regressive'] del params['num_blocks'] del params['variable_selection'] del params['skip_connection'] @@ -313,10 +352,13 @@ def set_hyperparameters(self, pipeline_steps = [('net_structure', ForecastingNetworkStructure(random_state=self.random_state, num_blocks=num_blocks, variable_selection=variable_selection, - skip_connection=skip_connection))] + skip_connection=skip_connection, + auto_regressive=auto_regressive,))] self.encoder_choice = [] self.decoder_choice = [] + decoder_components = self.get_decoder_components() + for i in range(1, num_blocks + 1): block_prefix = f'block_{i}:' choice = params[block_prefix + '__choice__'] @@ -333,35 +375,49 @@ def set_hyperparameters(self, param = param.replace(block_prefix + choice + ':', '') new_params[param] = value - decoder_components = self.get_decoder_components() - - decoder_type = None - - decoder_params = {} - decoder_params_names = [] - for param, value in new_params.items(): - if decoder_type is None: - for decoder_component in decoder_components.keys(): - if param.startswith(block_prefix + decoder_component): - decoder_type = decoder_component + if auto_regressive != 'encoder': + decoder_type = None + + decoder_params = {} + decoder_params_names = [] + for param, value in new_params.items(): + if decoder_type is None: + for decoder_component in decoder_components.keys(): + if param.startswith(block_prefix + decoder_component): + decoder_type = decoder_component + decoder_params_names.append(param) + param = param.replace(block_prefix + decoder_type + ':', '') + decoder_params[param] = value + else: + if param.startswith(block_prefix + decoder_type): decoder_params_names.append(param) param = param.replace(block_prefix + decoder_type + ':', '') decoder_params[param] = value - else: - if param.startswith(block_prefix + decoder_type): - decoder_params_names.append(param) - param = param.replace(block_prefix + decoder_type + ':', '') - decoder_params[param] = value - for param_name in decoder_params_names: - del new_params[param_name] + for param_name in decoder_params_names: + del new_params[param_name] + new_params['random_state'] = self.random_state + decoder_params['random_state'] = self.random_state + encoder = self.get_components()[choice](**new_params) + decoder = decoder_components[decoder_type](**decoder_params) + pipeline_steps.extend([(f'encoder_{i}', encoder), (f'decoder_{i}', decoder)]) + self.encoder_choice.append(encoder) + self.decoder_choice.append(decoder) + else: + new_params['random_state'] = self.random_state + encoder = 
self.get_components()[choice](**new_params) + pipeline_steps.extend([(f'encoder_{i}', encoder)]) + self.encoder_choice.append(encoder) - new_params['random_state'] = self.random_state + if auto_regressive == 'encoder': + decoder_params = {} + for param, value in new_params.items(): + if param.startswith(self.deepAR_decoder_prefix + self.deepAR_decoder_name): + param = param.replace(self.deepAR_decoder_prefix + self.deepAR_decoder_name + ':', '') + decoder_params[param] = value decoder_params['random_state'] = self.random_state - encoder = self.get_components()[choice](**new_params) - decoder = decoder_components[decoder_type](**decoder_params) - pipeline_steps.extend([(f'encoder_{i}', encoder), (f'decoder_{i}', decoder)]) - self.encoder_choice.append(encoder) + decoder = decoder_components[self.deepAR_decoder_name](**decoder_params) + pipeline_steps.extend([(f'decoder', decoder)]) self.decoder_choice.append(decoder) self.pipeline = Pipeline(pipeline_steps) From d7bff6e44ae731f11a3ab7a4312afbedd2286e35 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 21 Feb 2022 20:00:11 +0100 Subject: [PATCH 164/347] maint --- .../forecasting_encoder/seq_encoder/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index 911073e89..fdc6aad87 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -315,8 +315,6 @@ def get_hyperparameter_search_space( self.configuration_space_ = cs self.dataset_properties_ = dataset_properties - cs.get_parent_conditions_of('block_1:MLPDecoder:has_local_layer') - return cs def set_hyperparameters(self, From 2153bc244a60fb1ec9e12eb5f0e4faab60716a41 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 21 Feb 2022 20:35:38 +0100 Subject: [PATCH 165/347] fit for deep AR model (needs to be reverted when the issue in ConfigSpace is fixed!!!) 
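
The "deep AR" mode referenced in this commit is the auto_regressive == 'encoder' setting: the network is trained for one-step-ahead prediction and, at forecast time, is applied step by step with each prediction appended to the input window. A toy sketch of that rollout loop (not the project's implementation; shapes and the dummy network are placeholders so the example runs end-to-end):

    import torch
    from torch import nn

    def autoregressive_rollout(net: nn.Module, past: torch.Tensor, horizon: int) -> torch.Tensor:
        """past: [batch, time, features] -> forecast: [batch, horizon, features]."""
        window, preds = past, []
        for _ in range(horizon):
            next_step = net(window)[:, -1:, :]                 # one-step-ahead prediction
            preds.append(next_step)
            window = torch.cat([window, next_step], dim=1)     # feed the prediction back in
        return torch.cat(preds, dim=1)

    dummy = nn.Identity()  # stand-in for a fitted forecasting network
    print(autoregressive_rollout(dummy, torch.zeros(4, 10, 3), horizon=5).shape)  # [4, 5, 3]
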
--- .../forecasting_decoder/MLPDecoder.py | 24 +- .../base_forecasting_decoder.py | 4 +- .../seq_encoder/__init__.py | 211 ++++++++---------- 3 files changed, 108 insertions(+), 131 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index bdfb4250e..21ba716bf 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -71,7 +71,11 @@ def get_hyperparameter_search_space( value_range=tuple(_activations.keys()), default_value=list(_activations.keys())[ 0]), - auto_regressive: bool = False, + can_be_auto_regressive:bool = False, + auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="auto_regressive", + value_range=(True, False), + default_value=False, + ), has_local_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='has_local_layer', value_range=(True, False), default_value=True), @@ -145,13 +149,19 @@ def get_hyperparameter_search_space( cs.add_condition(GreaterThanCondition(num_units_hp, num_layers_hp, i - 1)) # add_hyperparameter(cs, units_final_layer, UniformIntegerHyperparameter) + has_local_layer = get_hyperparameter(has_local_layer, CategoricalHyperparameter) + units_local_layer = get_hyperparameter(units_local_layer, UniformIntegerHyperparameter) + + cond_units_local_layer = EqualsCondition(units_local_layer, has_local_layer, True) + + cs.add_hyperparameters([has_local_layer, units_local_layer]) + cs.add_conditions([cond_units_local_layer]) - if not auto_regressive: - has_local_layer = get_hyperparameter(has_local_layer, CategoricalHyperparameter) - units_local_layer = get_hyperparameter(units_local_layer, UniformIntegerHyperparameter) + if can_be_auto_regressive: + auto_regressive = get_hyperparameter(auto_regressive, CategoricalHyperparameter) - cond_units_local_layer = EqualsCondition(units_local_layer, has_local_layer, True) - cs.add_hyperparameters([has_local_layer, units_local_layer]) - cs.add_conditions([cond_units_local_layer]) + cond_use_local_layer = EqualsCondition(has_local_layer, auto_regressive, False) + cs.add_hyperparameters([auto_regressive]) + cs.add_conditions([cond_use_local_layer]) return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 9094b679d..1534dcb1f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -33,10 +33,10 @@ class BaseForecastingDecoder(autoPyTorchComponent): _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"] def __init__(self, - **kwargs: Any): + **kwargs: Dict[str, Any]): super().__init__() self.add_fit_requirements(self._required_fit_requirements) - + self.auto_regressive = kwargs.get('auto_regressive', False) self.config = kwargs self.decoder: Optional[nn.Module] = None self.n_decoder_output_features = None diff --git 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index fdc6aad87..d5999544c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -9,6 +9,7 @@ from ConfigSpace.conditions import ( EqualsCondition, OrConjunction, GreaterThanCondition, NotEqualsCondition, AndConjunction ) +from ConfigSpace.forbidden import ForbiddenInClause, ForbiddenEqualsClause, ForbiddenAndConjunction from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType @@ -47,14 +48,12 @@ class ForecastingNetworkStructure(autoPyTorchComponent): def __init__(self, random_state: Optional[np.random.RandomState] = None, num_blocks: int = 1, variable_selection: bool = False, - skip_connection: bool = False, - auto_regressive: str = 'not_applied') -> None: + skip_connection: bool = False) -> None: super().__init__() self.num_blocks = num_blocks self.random_state = random_state self.variable_selection = variable_selection self.skip_connection = skip_connection - self.auto_regressive = auto_regressive def fit(self, X: Dict[str, Any], y: Any = None) -> "ForecastingNetworkStructure": self.check_requirements(X, y) @@ -65,7 +64,6 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: 'num_blocks': self.num_blocks, 'variable_selection': self.variable_selection, 'skip_connection': self.skip_connection, - 'auto_regressive': self.auto_regressive, }) return X @@ -92,7 +90,8 @@ def __str__(self) -> str: class SeqForecastingEncoderChoice(AbstractForecastingEncoderChoice): deepAR_decoder_name = 'MLPDecoder' - deepAR_decoder_prefix = 'deepAR_decoder' + deepAR_decoder_prefix = 'block_1' + def get_components(self) -> Dict[str, autoPyTorchComponent]: """Returns the available backbone components @@ -122,11 +121,6 @@ def get_hyperparameter_search_space( skip_connection: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="skip_connection", value_range=(True, False), default_value=False), - auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter="auto_regressive", - value_range=('encoder', 'decoder'), - default_value='encoder', - ), default: Optional[str] = None, include: Optional[List[str]] = None, exclude: Optional[List[str]] = None, @@ -159,11 +153,10 @@ def get_hyperparameter_search_space( add_hyperparameter(cs, variable_selection, CategoricalHyperparameter) add_hyperparameter(cs, skip_connection, CategoricalHyperparameter) - min_num_blocks, max_num_blcoks = num_blocks.value_range + min_num_blocks, max_num_blocks = num_blocks.value_range num_blocks = get_hyperparameter(num_blocks, UniformIntegerHyperparameter) - auto_regressive = get_hyperparameter(auto_regressive, CategoricalHyperparameter) - cs.add_hyperparameters([num_blocks, auto_regressive]) + cs.add_hyperparameters([num_blocks]) # Compile a list of legal preprocessors for this problem available_encoders = self.get_available_components( @@ -188,7 +181,7 @@ def get_hyperparameter_search_space( break updates_choice = self._get_search_space_updates() - for i in range(1, int(max_num_blcoks) + 1): + for i in range(1, int(max_num_blocks) + 1): block_prefix = f'block_{i}:' if '__choice__' in updates_choice.keys(): @@ -242,79 +235,72 @@ def 
get_hyperparameter_search_space( decoder2encoder[decoder_name].append(encoder_name) encoder2decoder[encoder_name] = allowed_decoders - if len(auto_regressive.choices) > 1 or auto_regressive.choices[0] != 'encoder': - for decoder_name in available_decoders.keys(): - if not decoder2encoder[decoder_name]: - continue - updates = self._get_search_space_updates(prefix=block_prefix + decoder_name) - config_space = available_decoders[decoder_name].get_hyperparameter_search_space(dataset_properties, - # type: ignore - **updates) - compatible_encoders = decoder2encoder[decoder_name] - encoders_with_multi_decoder = [] - encoder_with_single_decoder = [] - for encoder in compatible_encoders: - if len(encoder2decoder[encoder]) > 1: - encoders_with_multi_decoder.append(encoder) - else: - encoder_with_single_decoder.append(encoder) - - cs.add_configuration_space( - block_prefix + decoder_name, - config_space, - # parent_hyperparameter=parent_hyperparameter - ) - - hps = cs.get_hyperparameters() # type: List[CSH.Hyperparameter] - conditions_to_add = [] - for hp in hps: - # TODO consider if this will raise any unexpected behavior - if hp.name.startswith(block_prefix + decoder_name): - # From the implementation of ConfigSpace - # Only add a condition if the parameter is a top-level - # parameter of the new configuration space (this will be some - # kind of tree structure). - if cs.get_parents_of(hp): - continue - or_cond = [] - for encoder_single in encoder_with_single_decoder: - or_cond.append(EqualsCondition(hp, - hp_encoder, - encoder_single)) - for encode_multi in encoders_with_multi_decoder: - hp_decoder_type = cs.get_hyperparameter(f'{block_prefix + encode_multi}:decoder_type') - or_cond.append(EqualsCondition(hp, hp_decoder_type, decoder_name)) - if len(or_cond) == 1: - conditions_to_add.append( - AndConjunction(or_cond[0], - NotEqualsCondition(hp, auto_regressive, 'encoder')) - ) - elif len(or_cond) > 1: - conditions_to_add.append( - AndConjunction(OrConjunction(*or_cond), - NotEqualsCondition(hp, auto_regressive, 'encoder')) - ) - - cs.add_conditions(conditions_to_add) - - if 'encoder' in auto_regressive.choices: - decoder_name = self.deepAR_decoder_name - if decoder_name in available_decoders: - updates = self._get_search_space_updates(prefix=self.deepAR_decoder_prefix + decoder_name) - updates['auto_regressive'] = True + for decoder_name in available_decoders.keys(): + if not decoder2encoder[decoder_name]: + continue + updates = self._get_search_space_updates(prefix=block_prefix + decoder_name) + if i == 1 and decoder_name == self.deepAR_decoder_name: + # TODO this is only a temporary solution, a fix on ConfigSpace needs to be implemented + updates['can_be_auto_regressive'] = True config_space = available_decoders[decoder_name].get_hyperparameter_search_space(dataset_properties, # type: ignore **updates) - parent_hyperparameter = {'parent': auto_regressive, 'value': 'encoder'} + compatible_encoders = decoder2encoder[decoder_name] + encoders_with_multi_decoder = [] + encoder_with_single_decoder = [] + for encoder in compatible_encoders: + if len(encoder2decoder[encoder]) > 1: + encoders_with_multi_decoder.append(encoder) + else: + encoder_with_single_decoder.append(encoder) + cs.add_configuration_space( - self.deepAR_decoder_prefix + decoder_name, + block_prefix + decoder_name, config_space, - parent_hyperparameter=parent_hyperparameter + # parent_hyperparameter=parent_hyperparameter ) - self.configuration_space_ = cs - self.dataset_properties_ = dataset_properties - + hps = 
cs.get_hyperparameters() # type: List[CSH.Hyperparameter] + conditions_to_add = [] + for hp in hps: + # TODO consider if this will raise any unexpected behavior + if hp.name.startswith(block_prefix + decoder_name): + # From the implementation of ConfigSpace + # Only add a condition if the parameter is a top-level + # parameter of the new configuration space (this will be some + # kind of tree structure). + if cs.get_parents_of(hp): + continue + or_cond = [] + for encoder_single in encoder_with_single_decoder: + or_cond.append(EqualsCondition(hp, + hp_encoder, + encoder_single)) + for encode_multi in encoders_with_multi_decoder: + hp_decoder_type = cs.get_hyperparameter(f'{block_prefix + encode_multi}:decoder_type') + or_cond.append(EqualsCondition(hp, hp_decoder_type, decoder_name)) + if len(or_cond) == 0: + continue + elif len(or_cond) > 1: + conditions_to_add.append(OrConjunction(*or_cond)) + else: + conditions_to_add.append(or_cond[0]) + + cs.add_conditions(conditions_to_add) + if self.deepAR_decoder_name in available_decoders: + deep_ar_hp = ':'.join([self.deepAR_decoder_prefix, self.deepAR_decoder_name, 'auto_regressive']) + if deep_ar_hp in cs: + deep_ar_hp = cs.get_hyperparameter(deep_ar_hp) + forbidden_ar = ForbiddenEqualsClause(deep_ar_hp, True) + if min_num_blocks == 1: + if max_num_blocks - min_num_blocks > 1: + forbidden = ForbiddenAndConjunction( + ForbiddenInClause(num_blocks, list(range(1, max_num_blocks))), + forbidden_ar + ) + else: + forbidden = ForbiddenAndConjunction(ForbiddenEqualsClause(num_blocks, 2), forbidden_ar) + cs.add_forbidden_clause(forbidden) return cs def set_hyperparameters(self, @@ -342,7 +328,6 @@ def set_hyperparameters(self, num_blocks = params['num_blocks'] variable_selection = params['variable_selection'] skip_connection = params['skip_connection'] - auto_regressive = params['auto_regressive'] del params['num_blocks'] del params['variable_selection'] del params['skip_connection'] @@ -350,8 +335,7 @@ def set_hyperparameters(self, pipeline_steps = [('net_structure', ForecastingNetworkStructure(random_state=self.random_state, num_blocks=num_blocks, variable_selection=variable_selection, - skip_connection=skip_connection, - auto_regressive=auto_regressive,))] + skip_connection=skip_connection,))] self.encoder_choice = [] self.decoder_choice = [] @@ -373,49 +357,32 @@ def set_hyperparameters(self, param = param.replace(block_prefix + choice + ':', '') new_params[param] = value - if auto_regressive != 'encoder': - decoder_type = None - - decoder_params = {} - decoder_params_names = [] - for param, value in new_params.items(): - if decoder_type is None: - for decoder_component in decoder_components.keys(): - if param.startswith(block_prefix + decoder_component): - decoder_type = decoder_component - decoder_params_names.append(param) - param = param.replace(block_prefix + decoder_type + ':', '') - decoder_params[param] = value - else: - if param.startswith(block_prefix + decoder_type): - decoder_params_names.append(param) - param = param.replace(block_prefix + decoder_type + ':', '') - decoder_params[param] = value + decoder_type = None - for param_name in decoder_params_names: - del new_params[param_name] - new_params['random_state'] = self.random_state - decoder_params['random_state'] = self.random_state - encoder = self.get_components()[choice](**new_params) - decoder = decoder_components[decoder_type](**decoder_params) - pipeline_steps.extend([(f'encoder_{i}', encoder), (f'decoder_{i}', decoder)]) - self.encoder_choice.append(encoder) - 
self.decoder_choice.append(decoder) - else: - new_params['random_state'] = self.random_state - encoder = self.get_components()[choice](**new_params) - pipeline_steps.extend([(f'encoder_{i}', encoder)]) - self.encoder_choice.append(encoder) - - if auto_regressive == 'encoder': decoder_params = {} + decoder_params_names = [] for param, value in new_params.items(): - if param.startswith(self.deepAR_decoder_prefix + self.deepAR_decoder_name): - param = param.replace(self.deepAR_decoder_prefix + self.deepAR_decoder_name + ':', '') - decoder_params[param] = value + if decoder_type is None: + for decoder_component in decoder_components.keys(): + if param.startswith(block_prefix + decoder_component): + decoder_type = decoder_component + decoder_params_names.append(param) + param = param.replace(block_prefix + decoder_type + ':', '') + decoder_params[param] = value + else: + if param.startswith(block_prefix + decoder_type): + decoder_params_names.append(param) + param = param.replace(block_prefix + decoder_type + ':', '') + decoder_params[param] = value + + for param_name in decoder_params_names: + del new_params[param_name] + new_params['random_state'] = self.random_state decoder_params['random_state'] = self.random_state - decoder = decoder_components[self.deepAR_decoder_name](**decoder_params) - pipeline_steps.extend([(f'decoder', decoder)]) + encoder = self.get_components()[choice](**new_params) + decoder = decoder_components[decoder_type](**decoder_params) + pipeline_steps.extend([(f'encoder_{i}', encoder), (f'decoder_{i}', decoder)]) + self.encoder_choice.append(encoder) self.decoder_choice.append(decoder) self.pipeline = Pipeline(pipeline_steps) From b2063e7bcf481cc9047ce628f403b4fcab9a0792 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 23 Feb 2022 19:39:10 +0100 Subject: [PATCH 166/347] adjust backbones to fit new structure --- autoPyTorch/datasets/time_series_dataset.py | 7 +- .../forecasting_decoder/MLPDecoder.py | 72 +++++++++++++++---- .../forecasting_decoder/NBEATSDecoder.py | 15 ++-- .../forecasting_decoder/RNNDecoder.py | 19 ++--- .../forecasting_decoder/TransformerDecoder.py | 21 +++--- .../base_forecasting_decoder.py | 69 ++++++++++++------ .../base_forecasting_encoder.py | 62 ++++++++++------ .../flat_encoder/MLPEncoder.py | 8 +-- .../flat_encoder/NBEATSEncoder.py | 7 +- .../seq_encoder/InceptionTimeEncoder.py | 46 ++++++------ .../seq_encoder/RNNEncoder.py | 8 +-- .../seq_encoder/TCNEncoder.py | 35 ++++----- .../seq_encoder/TransformerEncoder.py | 8 +-- .../seq_encoder/__init__.py | 63 ++++++++++++++-- .../forecasting_network_head/distribution.py | 10 +-- 15 files changed, 286 insertions(+), 164 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index deb427133..cb154b20e 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -461,9 +461,9 @@ def __init__(self, self.static_features_shape: int = static_features.size if known_future_features is None: - self.future_feature_shape: Tuple[int, int] = (self.seq_length_min, 0) + self.future_feature_shapes: Tuple[int, int] = (self.seq_length_min, 0) else: - self.input_shape_future: Tuple[int, int] = (self.seq_length_min, len(known_future_features)) + self.future_feature_shapes: Tuple[int, int] = (self.seq_length_min, len(known_future_features)) if len(self.train_tensors) == 2 and self.train_tensors[1] is not None: self.output_type: str = type_of_target(self.train_tensors[1][0]) @@ -767,7 +767,10 @@ def 
get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> 'freq': self.freq, 'sequence_lengths_train': self.sequence_lengths_train, 'seq_length_max': self.seq_length_max, + 'input_shape':self.input_shape, 'lagged_value': self.lagged_value, + 'static_features': self.static_features, + 'future_feature_shapes': self.future_feature_shapes, 'uni_variant': self.is_uni_variant}) return dataset_properties diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index 21ba716bf..3056c436f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -1,5 +1,7 @@ from typing import Dict, Optional, Tuple, Union, Any +import numpy as np +import torch from torch import nn from ConfigSpace.configuration_space import ConfigurationSpace @@ -10,33 +12,67 @@ from autoPyTorch.pipeline.components.setup.network_head.utils import _activations from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter -from autoPyTorch.pipeline.components.setup.network_backbone.\ - forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder +from autoPyTorch.pipeline.components.setup.network_backbone. \ + forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder, DecoderNetwork + + +class MLPDecoderModule(DecoderNetwork): + def __init__(self, + global_layers: nn.Module, + local_layers: Optional[nn.Module], + auto_regressive: bool = False + ): + self.global_layers = global_layers + self.local_layers = local_layers + self.auto_regressive = auto_regressive + + def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor): + if x_future is not None or self.auto_regressive: + # for auto-regressive model, x_future is fed to the encoders + x = self.global_layers(encoder_output) + if self.local_layers is None: + return x + else: + # auto regressive model does not have local layers + return self.local_layers(x) + if self.local_layers is None: + x = torch.concat([encoder_output, x_future.flatten(-2)], dim=-1) + return self.global_layers(x) + x = self.global_layers(encoder_output) + x = self.local_layers(x) + return torch.concat([x, x_future], dim=-1) class ForecastingMLPDecoder(BaseForecastingDecoder): def _build_decoder(self, - input_shape: Tuple[int, ...], + encoder_output_shape: Tuple[int, ...], + future_variable_input: Tuple[int, ...], n_prediction_heads: int, dataset_properties: Dict) -> Tuple[nn.Module, int]: - layers = [] - in_features = input_shape[-1] + global_layers = [] + in_features = encoder_output_shape[-1] num_decoder_output_features = in_features if 'num_layers' in self.config and self.config["num_layers"] > 0: + in_features += int(np.prod(future_variable_input)) for i in range(1, self.config["num_layers"]): - layers.append(nn.Linear(in_features=in_features, - out_features=self.config[f"units_layer_{i}"])) - layers.append(_activations[self.config["activation"]]()) + global_layers.append(nn.Linear(in_features=in_features, + out_features=self.config[f"units_layer_{i}"])) + global_layers.append(_activations[self.config["activation"]]()) in_features = self.config[f"units_layer_{i}"] num_decoder_output_features = in_features if 
'units_local_layer' in self.config: - layers.append(nn.Linear(in_features=in_features, - out_features=self.config['units_local_layer'] * n_prediction_heads)) + local_layers = [nn.Linear(in_features=in_features, + out_features=self.config['units_local_layer'] * n_prediction_heads)] if 'activation' in self.config: - layers.append(_activations[self.config["activation"]]()) - num_decoder_output_features = self.config['units_local_layer'] + local_layers.append(_activations[self.config["activation"]]()) + local_layers.append(nn.Unflatten(-1, (n_prediction_heads, self.config['units_local_layer']))) + num_decoder_output_features = self.config['units_local_layer'] + future_variable_input[-1] + else: + local_layers = None - return nn.Sequential(*layers), num_decoder_output_features + return MLPDecoderModule(global_layers=nn.Sequential(*global_layers), + local_layers=nn.Sequential(*local_layers) if local_layers is not None else None, + auto_regressive=self.auto_regressive), num_decoder_output_features @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None @@ -60,6 +96,7 @@ def fitted_encoder(self): @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + can_be_auto_regressive: bool = False, num_layers: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_layers", value_range=(0, 3), default_value=1), @@ -71,7 +108,6 @@ def get_hyperparameter_search_space( value_range=tuple(_activations.keys()), default_value=list(_activations.keys())[ 0]), - can_be_auto_regressive:bool = False, auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="auto_regressive", value_range=(True, False), default_value=False, @@ -100,6 +136,7 @@ def get_hyperparameter_search_space( Args: dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): Dataset Properties + can_be_auto_regressive: bool: if this decoder is allowed to be auto-regressive num_layers (HyperparameterSearchSpace): number of decoder layers (the last layer is not included, thus it could start from 0) units_layer (HyperparameterSearchSpace): number of units of each layer (except for the last layer) @@ -113,6 +150,13 @@ def get_hyperparameter_search_space( Returns: cs (ConfigurationSpace): ConfigurationSpace """ + if dataset_properties is not None: + num_in_features = dataset_properties.get('input_shape', (0,)) + future_feature_shapes = dataset_properties.get('future_feature_shapes', (0,)) + if num_in_features[-1] != future_feature_shapes[-1]: + # deepAR model cannot be applied + auto_regressive.value_range = False + cs = ConfigurationSpace() min_num_layers: int = num_layers.value_range[0] # type: ignore diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 0a6611191..885911ee3 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -17,9 +17,6 @@ forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder -# TODO we need to rewrite NBEATS part to make it neater!!! 
- - class NBEATSBLock(nn.Module): def __init__(self, n_in_features: int, @@ -92,16 +89,20 @@ class NBEATSDecoder(BaseForecastingDecoder): fill_lower_resolution_seq = False fill_kwargs = {} - def decoder_properties(self): - decoder_properties = super().decoder_properties() + @staticmethod + def decoder_properties(): + decoder_properties = BaseForecastingDecoder.decoder_properties() decoder_properties.update({ 'multi_blocks': True }) return decoder_properties - def _build_decoder(self, input_shape: Tuple[int, ...], n_prediction_heads: int, + def _build_decoder(self, + encoder_output_shape: Tuple[int, ...], + future_variable_input: Tuple[int, ...], + n_prediction_heads: int, dataset_properties: Dict) -> Tuple[nn.Module, int]: - in_features = input_shape[-1] + in_features = encoder_output_shape[-1] n_beats_type = self.config['n_beats_type'] if n_beats_type == 'G': stacks = [[] for _ in range(self.config['num_stacks_g'])] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index e819915a4..e1175620f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -13,12 +13,12 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.\ - forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder, RecurrentDecoderNetwork + forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder, DecoderNetwork from autoPyTorch.utils.common import FitRequirement -class RNN_Module(RecurrentDecoderNetwork): +class RNN_Module(DecoderNetwork): def __init__(self, in_features: int, hidden_size: int, @@ -41,10 +41,10 @@ def __init__(self, batch_first=True) def forward(self, x_future: torch.Tensor, - features_latent: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> Tuple[torch.Tensor, ...]: + encoder_output: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> Tuple[torch.Tensor, ...]: if x_future.ndim == 2: x_future = x_future.unsqueeze(1) - outputs, hidden_state, = self.lstm(x_future, features_latent) + outputs, hidden_state, = self.lstm(x_future, encoder_output) return outputs, hidden_state @@ -56,7 +56,6 @@ class ForecastingRNNDecoder(BaseForecastingDecoder): def __init__(self, **kwargs: Dict): super().__init__(**kwargs) # RNN is naturally auto-regressive. However, we will not consider it as a decoder for deep AR model - self.auto_regressive = True self.rnn_kwargs = None self.lagged_value = [0, 1, 2, 3, 4, 5, 6, 7] @@ -67,7 +66,8 @@ def _required_fit_requirements(self) -> List[FitRequirement]: return fit_requirement def _build_decoder(self, - input_shape: Tuple[int, ...], + encoder_output_shape: Tuple[int, ...], + future_variable_input: Tuple[int, ...], n_prediction_heads: int, dataset_properties: Dict) -> Tuple[nn.Module, int]: # RNN decoder only allows RNN encoder, these parameters need to exists. 
@@ -76,7 +76,7 @@ def _build_decoder(self, 'num_layers'] cell_type = self.rnn_kwargs['cell_type'] dropout = self.rnn_kwargs['dropout'] - decoder = RNN_Module(in_features=dataset_properties['output_shape'][-1], + decoder = RNN_Module(in_features=future_variable_input[-1], hidden_size=hidden_size, num_layers=num_layers, cell_type=cell_type, @@ -89,8 +89,9 @@ def _build_decoder(self, def fitted_encoder(self): return ['RNNEncoder'] - def decoder_properties(self): - decoder_properties = super().decoder_properties() + @staticmethod + def decoder_properties(): + decoder_properties = BaseForecastingDecoder.decoder_properties() decoder_properties.update({'has_hidden_states': True, 'recurrent': True, 'lagged_input': True, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index ae75a2809..5f0910130 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -17,7 +17,7 @@ from autoPyTorch.utils.common import add_hyperparameter from autoPyTorch.pipeline.components.setup.network_backbone.\ - forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder, RecurrentDecoderNetwork + forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder, DecoderNetwork from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.transformer_util import \ PositionalEncoding, build_transformer_layers @@ -25,7 +25,7 @@ from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter, FitRequirement -class _TransformerDecoder(RecurrentDecoderNetwork): +class _TransformerDecoder(DecoderNetwork): def __init__(self, in_features: int, d_model: int, @@ -55,13 +55,13 @@ def __init__(self, num_layers=num_layers, norm=norm) - def forward(self, x_future: torch.Tensor, features_latent: torch.Tensor, + def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor, tgt_mask: Optional[torch.Tensor] = None, memory_mask: Optional[torch.Tensor] = None, tgt_key_padding_mask: Optional[torch.Tensor] = None, memory_key_padding_mask: Optional[torch.Tensor] = None): output = self.input_layer(x_future) - output = self.transformer_decoder_layers(output, features_latent, tgt_mask=tgt_mask, + output = self.transformer_decoder_layers(output, encoder_output, tgt_mask=tgt_mask, memory_mask=memory_mask, tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask) @@ -72,18 +72,18 @@ class ForecastingTransformerDecoder(BaseForecastingDecoder): def __init__(self, **kwargs: Dict): super().__init__(**kwargs) # RNN is naturally auto-regressive. 
However, we will not consider it as a decoder for deep AR model - self.auto_regressive = True self.transformer_encoder_kwargs = None self.lagged_value = [0, 1, 2, 3, 4, 5, 6, 7] def _build_decoder(self, - input_shape: Tuple[int, ...], + encoder_output_shape: Tuple[int, ...], + future_variable_input: Tuple[int, ...], n_prediction_heads: int, - dataset_properties: Dict) -> nn.Module: + dataset_properties: Dict) -> Tuple[nn.Module, int]: d_model = 2 ** self.transformer_encoder_kwargs['d_model_log'] transformer_decoder_layers = build_transformer_layers(d_model=d_model, config=self.config, layer_type='decoder') - decoder = _TransformerDecoder(in_features=dataset_properties['output_shape'][-1], + decoder = _TransformerDecoder(in_features=future_variable_input[-1], d_model=d_model, num_layers=self.config['num_layers'], transformer_decoder_layers=transformer_decoder_layers, @@ -102,8 +102,9 @@ def _required_fit_requirements(self) -> List[FitRequirement]: dataset_property=False)) return fit_requirement - def decoder_properties(self): - decoder_properties = super().decoder_properties() + @staticmethod + def decoder_properties(): + decoder_properties = BaseForecastingDecoder.decoder_properties() decoder_properties.update({'recurrent': True, 'lagged_input': True, 'mask_on_future_target': True, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 1534dcb1f..602709fb3 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -9,15 +9,15 @@ from autoPyTorch.pipeline.components.base_component import BaseEstimator, autoPyTorchComponent -class RecurrentDecoderNetwork(nn.Module): - def forward(self, x_future: torch.Tensor, features_latent: torch.Tensor): +class DecoderNetwork(nn.Module): + def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor): """ Base forecasting Decoder Network, its output needs to be a 3-d Tensor: Args: - x_future torch.Tensor(B, L_future, N_out), the future features - features_latent: torch.Tensor(B, L_encoder, N), output of the encoder network, or the hidden states + x_future: torch.Tensor(B, L_future, N_out), the future features + encoder_output: torch.Tensor(B, L_encoder, N), output of the encoder network, or the hidden states Returns: net_output: torch.Tensor with shape either (B, L_future, N) @@ -33,14 +33,17 @@ class BaseForecastingDecoder(autoPyTorchComponent): _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"] def __init__(self, + block_number: int = 1, + auto_regressive: bool = False, **kwargs: Dict[str, Any]): super().__init__() self.add_fit_requirements(self._required_fit_requirements) - self.auto_regressive = kwargs.get('auto_regressive', False) + self.auto_regressive = auto_regressive self.config = kwargs self.decoder: Optional[nn.Module] = None self.n_decoder_output_features = None self.n_prediction_heads = 1 + self.block_number = block_number @property def _required_fit_requirements(self) -> List[FitRequirement]: @@ -48,14 +51,17 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), 
FitRequirement('network_encoder', (nn.Module,), user_defined=False, dataset_property=False), FitRequirement('encoder_properties', (Dict,), user_defined=False, dataset_property=False), + FitRequirement('future_feature_shapes', (Tuple,), user_defined=False, dataset_property=True), FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), + FitRequirement('seq_output_shape', (Tuple,), user_defined=False, dataset_property=False) ] @property def fitted_encoder(self): return [] - def decoder_properties(self): + @staticmethod + def decoder_properties(): decoder_properties = {'has_hidden_states': False, 'has_local_layer': True, 'recurrent': False, @@ -76,24 +82,38 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: Self """ self.check_requirements(X, y) - input_shape = X['dataset_properties']['input_shape'] output_shape = X['dataset_properties']['output_shape'] + static_features_shape = X["dataset_properties"]["static_features_shape"] - auto_regressive = self.auto_regressive + encoder_output_shape = X['encoder_output_shape'] - X.update({"auto_regressive": auto_regressive}) + auto_regressive = self.auto_regressive if auto_regressive: self.n_prediction_heads = 1 else: self.n_prediction_heads = output_shape[0] - encoder_properties = X['encoder_properties'] - has_hidden_states = encoder_properties.get("has_hidden_states", False) + variable_selection = X.get("variable_selection", False) + future_feature_shapes = X['dataset_properties']['future_feature_shapes'] + + future_in_features = future_feature_shapes[-1] + static_features_shape + if variable_selection: + # TODO + pass + else: + if auto_regressive: + if self.decoder_properties()["lagged_input"] and hasattr(self, 'lagged_value'): + future_in_features += len(self.lagged_value) * output_shape[-1] + else: + future_in_features += output_shape[-1] + future_variable_input = (self.n_prediction_heads, future_in_features) + + # TODO consider decoder auto regressive and fill in decoder part self.decoder, self.n_decoder_output_features = self.build_decoder( - input_shape=get_output_shape(X['network_encoder'], input_shape=input_shape, - has_hidden_states=has_hidden_states), + encoder_output_shape=encoder_output_shape, + future_variable_input=future_variable_input, n_prediction_heads=self.n_prediction_heads, dataset_properties=X['dataset_properties'] ) @@ -113,36 +133,45 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'decoder_properties': self.decoder_properties(), 'network_decoder': self.decoder, 'n_prediction_heads': self.n_prediction_heads, - 'n_decoder_output_features': self.n_decoder_output_features}) + 'n_decoder_output_features': self.n_decoder_output_features, + 'auto_regressive': self.auto_regressive}) return X def build_decoder(self, - input_shape: Tuple[int, ...], + encoder_output_shape: Tuple[int, ...], + future_variable_input: Tuple[int, ...], n_prediction_heads: int, dataset_properties: Dict) -> Tuple[nn.Module, int]: """ Builds the head module and returns it Args: - input_shape (Tuple[int, ...]): shape of the input to the head (usually the shape of the backbone output) + encoder_output_shape (Tuple[int, ...]): shape of the input to the decoder, this value is the encoder output + future_variable_input (Tuple[int, ...]): shape of the known future input values n_prediction_heads (int): how many prediction heads the network has, used for final forecasting heads dataset_properties (Dict): dataset properties Returns: nn.Module: head module """ - decoder, n_decoder_features = 
self._build_decoder(input_shape, n_prediction_heads, dataset_properties) + decoder, n_decoder_features = self._build_decoder(encoder_output_shape, future_variable_input, + n_prediction_heads, dataset_properties) return decoder, n_decoder_features @abstractmethod - def _build_decoder(self, input_shape: Tuple[int, ...], n_prediction_heads: int, + def _build_decoder(self, + encoder_output_shape: Tuple[int, ...], + future_variable_input: Tuple[int, ...], + n_prediction_heads: int, dataset_properties:Dict) -> Tuple[nn.Module, int]: """ Builds the head module and returns it Args: - input_shape (Tuple[int, ...]): shape of the input to the head (usually the shape of the backbone output) - n_prediction_heads (int): how many prediction heads will be generated after the encoder + encoder_output_shape (Tuple[int, ...]): shape of the input to the decoder, this value is the encoder output + future_variable_input (Tuple[int, ...]): shape of the known future input values + n_prediction_heads (int): how many prediction heads the network has, used for final forecasting heads + dataset_properties (Dict): dataset properties Returns: decoder (nn.Module): decoder module diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index 5556ba431..c36934353 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -46,6 +46,7 @@ class BaseForecastingEncoder(autoPyTorchComponent): _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"] def __init__(self, + block_number: int = 1, **kwargs: Any): autoPyTorchComponent.__init__(self) self.add_fit_requirements( @@ -54,6 +55,8 @@ def __init__(self, self.encoder: nn.Module = None self.config = kwargs self.input_shape: Optional[Iterable] = None + self.block_number = block_number + self.encoder_output_shape: Optional[Iterable] = None @property def _required_fit_arguments(self) -> List[FitRequirement]: @@ -66,7 +69,7 @@ def _required_fit_arguments(self) -> List[FitRequirement]: FitRequirement('uni_variant', (bool,), user_defined=False, dataset_property=True), FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), - FitRequirement('static_features_shape', (int, ), user_defined=True, dataset_property=True), + FitRequirement('static_features_shape', (int,), user_defined=True, dataset_property=True), ] def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: @@ -78,24 +81,41 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: output_shape = X["dataset_properties"]['output_shape'] static_features_shape = X["dataset_properties"]["static_features_shape"] - if not X["dataset_properties"]["uni_variant"]: - if not X["dataset_properties"]["is_small_preprocess"]: - # get input shape by transforming first two elements of the training set - transforms = torchvision.transforms.Compose(X['preprocess_transforms']) - X_train = X_train[:1, np.newaxis, ...] 
- X_train = transforms(X_train) - input_shape = np.concatenate(X_train).shape[1:] - - if 'network_embedding' in X.keys(): - input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) - - self.encoder, in_features = self.build_encoder( - targets_shape=output_shape, + if self.block_number == 1: + if not X["dataset_properties"]["uni_variant"]: + if not X["dataset_properties"]["is_small_preprocess"]: + # get input shape by transforming first two elements of the training set + transforms = torchvision.transforms.Compose(X['preprocess_transforms']) + X_train = X_train[:1, np.newaxis, ...] + X_train = transforms(X_train) + input_shape = np.concatenate(X_train).shape[1:] + + if 'network_embedding' in X.keys(): + input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) + + in_features = input_shape[-1] + + variable_selection = X.get("variable_selection", False) + if variable_selection: + # TODO + pass + elif self.encoder_properties()["lagged_input"] and hasattr(self, 'lagged_value'): + in_features = len(self.lagged_value) * output_shape[-1] + input_shape[-1] + static_features_shape + else: + in_features = output_shape[-1] + input_shape[-1] + static_features_shape + + input_shape = (X['window_size'], in_features) + else: + input_shape = X['encoder_output_shape'] + + self.encoder = self.build_encoder( input_shape=input_shape, - static_feature_shape=static_features_shape ) - self.input_shape = (X['window_size'], in_features) + self.input_shape = input_shape + + has_hidden_states = self.encoder_properties().get("has_hidden_states", False) + self.encoder_output_shape = get_output_shape(input_shape, has_hidden_states) return self @@ -105,15 +125,15 @@ def allowed_decoders(): def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X['dataset_properties'].update({'input_shape': self.input_shape}) - X.update({'network_encoder': self.encoder}) - X.update({'encoder_properties': self.encoder_properties()}) + X.update({'network_encoder': self.encoder, + 'encoder_properties': self.encoder_properties(), + 'encoder_output_shape': self.encoder_output_shape + }) return X @abstractmethod def build_encoder(self, - targets_shape: Tuple[int, ...], - input_shape: Tuple[int, ...] = (0,), - static_feature_shape: int = 0) -> Tuple[nn.Module, int]: + input_shape: Tuple[int, ...]) -> nn.Module: """ Builds the backbone module and returns it diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py index 69e0d9697..ef40104df 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py @@ -87,12 +87,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: # when resolution is smaller return super().fit(X, y) - def build_encoder(self, targets_shape: Tuple[int, ...], - input_shape: Tuple[int, ...] 
= (0,), - static_feature_shape: int = 0) -> Tuple[nn.Module, int]: - in_features = (input_shape[-1] + targets_shape[-1] + static_feature_shape) + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: + in_features = input_shape[-1] feature_preprocessor = TimeSeriesMLPrecpocessor(window_size=self.window_size) - return nn.Sequential(feature_preprocessor, *self._build_backbone(in_features * self.window_size)), in_features + return nn.Sequential(feature_preprocessor, *self._build_backbone(in_features * self.window_size)) def _add_layer(self, layers: List[nn.Module], in_features: int, out_features: int, layer_id: int) -> None: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py index 9d77b6e36..64498fda2 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py @@ -47,12 +47,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: return super().fit(X, y) def build_encoder(self, - targets_shape: Tuple[int, ...], - input_shape: Tuple[int, ...] = (0,), - static_feature_shape: int = 0) -> Tuple[nn.Module, int]: - in_features = targets_shape[-1] + input_shape[-1] + static_feature_shape + input_shape: Tuple[int, ...]) -> nn.Module: preprocessor = TimeSeriesMLPrecpocessor(window_size=self.window_size) - return preprocessor, in_features + return preprocessor @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py index 0eeea102a..f3cc6e981 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py @@ -137,14 +137,12 @@ class InceptionTimeEncoder(BaseForecastingEncoder): InceptionTime backbone for time series data (see https://arxiv.org/pdf/1909.04939.pdf). """ - def build_encoder(self, targets_shape: Tuple[int, ...], - input_shape: Tuple[int, ...] = (0,), - static_feature_shape: int = 0) -> Tuple[nn.Module, int]: - in_features = input_shape[-1] + targets_shape[-1] + static_feature_shape + def build_encoder(self, input_shape: Tuple[int, ...] 
= (0,)) -> nn.Module: + in_features = input_shape[-1] encoder = _InceptionTime(in_features=in_features, - config=self.config) + config=self.config) self._receptive_field = encoder.receptive_field - return encoder, in_features + return encoder @staticmethod def allowed_decoders(): @@ -170,26 +168,26 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @staticmethod def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", - value_range=(1, 5), - default_value=3, - ), - num_filters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_filters", - value_range=(4, 64), - default_value=32, - log=True, - ), - kernel_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="kernel_size", - value_range=(4, 64), - default_value=32, - log=True, - ), - bottleneck_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="bottleneck_size", - value_range=(16, 64), + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", + value_range=(1, 5), + default_value=3, + ), + num_filters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_filters", + value_range=(4, 64), default_value=32, - log=True + log=True, ), + kernel_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="kernel_size", + value_range=(4, 64), + default_value=32, + log=True, + ), + bottleneck_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="bottleneck_size", + value_range=(16, 64), + default_value=32, + log=True + ), ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py index c111b7626..917ed0d2f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py @@ -77,15 +77,13 @@ def __init__(self, **kwargs: Dict): super().__init__(**kwargs) self.lagged_value = [1, 2, 3, 4, 5, 6, 7] - def build_encoder(self, targets_shape: Tuple[int, ...], - input_shape: Tuple[int, ...] 
= (0,), - static_feature_shape: int = 0) -> Tuple[nn.Module, int]: - in_features = len(self.lagged_value) * targets_shape[-1] + input_shape[-1] + static_feature_shape + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: + in_features = input_shape[-1] encoder = _RNN(in_features=in_features, config=self.config, lagged_value=self.lagged_value, ) - return encoder, in_features + return encoder @staticmethod def allowed_decoders(): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py index b74198935..fef6db303 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py @@ -92,7 +92,7 @@ def __init__(self, num_inputs: int, num_channels: List[int], kernel_size: List[i stride=stride, dilation=dilation_size, padding=(kernel_size[i] - 1) * dilation_size, - dropout=dropout[i])] + dropout=dropout)] # receptive_field_block = 1 + (kernel_size - 1) * dilation_size * \ # (int(np.prod(stride_values[:-2])) * (1 + stride_values[-2])) # stride = 1, we ignore stride computation @@ -118,18 +118,14 @@ class TCNEncoder(BaseForecastingEncoder): Temporal Convolutional Network backbone for time series data (see https://arxiv.org/pdf/1803.01271.pdf). """ - def build_encoder(self, - targets_shape: Tuple[int, ...], - input_shape: Tuple[int, ...] = (0,), - static_feature_shape: int = 0) -> Tuple[nn.Module, int]: + def build_encoder(self, input_shape: Tuple[int, ...]) -> Tuple[nn.Module, int]: num_channels = [self.config["num_filters_1"]] kernel_size = [self.config["kernel_size_1"]] - dropout = [self.config[f"dropout_1"] if self.config["use_dropout"] else 0.0] + dropout = self.config[f"dropout"] if self.config["use_dropout"] else 0.0 for i in range(2, self.config["num_blocks"] + 1): num_channels.append(self.config[f"num_filters_{i}"]) kernel_size.append(self.config[f"kernel_size_{i}"]) - dropout.append(self.config[f"dropout_{i}"] if self.config["use_dropout"] else 0.0) - in_features = input_shape[-1] + static_feature_shape + targets_shape[-1] + in_features = input_shape[-1] encoder = _TemporalConvNet(in_features, num_channels, kernel_size=kernel_size, @@ -190,6 +186,14 @@ def get_hyperparameter_search_space( use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) cs.add_hyperparameter(use_dropout) + + dropout_hp = get_hyperparameter(dropout, UniformFloatHyperparameter) + cs.add_hyperparameter(dropout_hp) + + dropout_condition = CS.EqualsCondition(dropout_hp, use_dropout, True) + + cs.add_condition(dropout_condition) + for i in range(1, int(max_num_blocks) + 1): num_filter_search_space = HyperparameterSearchSpace(f"num_filters_{i}", value_range=num_filters.value_range, @@ -209,19 +213,4 @@ def get_hyperparameter_search_space( CS.GreaterThanCondition(kernel_size_hp, num_blocks, i - 1) ]) - dropout_search_space = HyperparameterSearchSpace(hyperparameter='dropout_%d' % i, - value_range=dropout.value_range, - default_value=dropout.default_value, - log=dropout.log) - dropout_hp = get_hyperparameter(dropout_search_space, UniformFloatHyperparameter) - cs.add_hyperparameter(dropout_hp) - - dropout_condition_1 = CS.EqualsCondition(dropout_hp, use_dropout, True) - - if i > int(min_num_blocks): - 
dropout_condition_2 = CS.GreaterThanCondition(dropout_hp, num_blocks, i - 1) - cs.add_condition(CS.AndConjunction(dropout_condition_1, dropout_condition_2)) - else: - cs.add_condition(dropout_condition_1) - return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index 82d6b3fc5..254f2be28 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -73,10 +73,8 @@ def __init__(self, **kwargs: Dict): super().__init__(**kwargs) self.lagged_value = [1, 2, 3, 4, 5, 6, 7] - def build_encoder(self, targets_shape: Tuple[int, ...], - input_shape: Tuple[int, ...] = (0,), - static_feature_shape: int = 0) -> Tuple[nn.Module, int]: - in_features = len(self.lagged_value) * targets_shape[-1] + input_shape[-1] + static_feature_shape + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: + in_features = input_shape[-1] d_model = 2 ** self.config['d_model_log'] transformer_encoder_layers = build_transformer_layers(d_model=d_model, config=self.config, layer_type='encoder') @@ -90,7 +88,7 @@ def build_encoder(self, targets_shape: Tuple[int, ...], dropout_pe=self.config.get('dropout_positional_encoder', 0.0), layer_norm_eps_output=self.config.get('layer_norm_eps_output', None), lagged_value=self.lagged_value) - return encoder, in_features + return encoder @staticmethod def allowed_decoders(): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index d5999544c..313edb89f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -118,6 +118,11 @@ def get_hyperparameter_search_space( value_range=(True, False), default_value=False ), + decoder_auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="decoder_auto_regressive", + value_range=(True, False), + default_value=False, + ), skip_connection: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="skip_connection", value_range=(True, False), default_value=False), @@ -134,9 +139,8 @@ def get_hyperparameter_search_space( block will be attached with a variable selection block while the following will be enriched with static features. skip_connection: HyperparameterSearchSpace: if skip connection is applied - auto_regressive: HyperparameterSearchSpace: if auto-regressive is applied, depending on the choice, - auto-regressive strategy is applied to either encoder (DeepAR model), decoder or not applied (in which case - ) + decoder_auto_regressive: HyperparameterSearchSpace: if decoder is auto_regressive, e.g., if the decoder + receives the output of its input, this only works for auto_regressive decoder models default (Optional[str]): Default backbone to use include: Optional[Dict[str, Any]]: what components to include. 
It is an exhaustive list, and will exclusively use this components. @@ -149,14 +153,29 @@ def get_hyperparameter_search_space( if dataset_properties is None: dataset_properties = {} + # TODO + static_features_shape = dataset_properties.get("static_features_shape", 0) + future_feature_shapes = dataset_properties.get("future_feature_shapes", (0,)) + cs = ConfigurationSpace() - add_hyperparameter(cs, variable_selection, CategoricalHyperparameter) add_hyperparameter(cs, skip_connection, CategoricalHyperparameter) min_num_blocks, max_num_blocks = num_blocks.value_range + variable_selection = get_hyperparameter(variable_selection, CategoricalHyperparameter) + decoder_auto_regressive = get_hyperparameter(decoder_auto_regressive, CategoricalHyperparameter) num_blocks = get_hyperparameter(num_blocks, UniformIntegerHyperparameter) - cs.add_hyperparameters([num_blocks]) + cs.add_hyperparameters([num_blocks, decoder_auto_regressive, variable_selection]) + + if static_features_shape + future_feature_shapes[-1] == 0: + if False in variable_selection.choices and True in decoder_auto_regressive.choices: + if variable_selection.num_choices == 1 and decoder_auto_regressive.num_choices == 1: + raise ValueError("When no future information is available, it is not possible to disable variable" + "selection and enable auto-regressive decoder model") + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(variable_selection, False), + ForbiddenEqualsClause(decoder_auto_regressive, True) + )) # Compile a list of legal preprocessors for this problem available_encoders = self.get_available_components( @@ -181,6 +200,13 @@ def get_hyperparameter_search_space( break updates_choice = self._get_search_space_updates() + forbiddens_decoder_auto_regressive = [] + + if False in decoder_auto_regressive.choices: + forbidden_decoder_ar = ForbiddenEqualsClause(decoder_auto_regressive, True) + else: + forbidden_decoder_ar = None + for i in range(1, int(max_num_blocks) + 1): block_prefix = f'block_{i}:' @@ -259,6 +285,20 @@ def get_hyperparameter_search_space( config_space, # parent_hyperparameter=parent_hyperparameter ) + if not available_decoders[decoder_name].decoder_properties()["recurrent"]: + hp_encoder_choice = cs.get_hyperparameter(block_prefix + '__choice__') + for encoder_single in encoder_with_single_decoder: + if encoder_single in hp_encoder_choice.choices: + forbiddens_decoder_auto_regressive.append(ForbiddenAndConjunction( + forbidden_decoder_ar, + ForbiddenEqualsClause(hp_encoder_choice, encoder_single) + )) + for encode_multi in encoders_with_multi_decoder: + hp_decoder_type = cs.get_hyperparameter(f"{block_prefix}{encode_multi}:decoder_type") + forbiddens_decoder_auto_regressive.append(ForbiddenAndConjunction( + forbidden_decoder_ar, + ForbiddenEqualsClause(hp_decoder_type, decoder_name) + )) hps = cs.get_hyperparameters() # type: List[CSH.Hyperparameter] conditions_to_add = [] @@ -287,6 +327,8 @@ def get_hyperparameter_search_space( conditions_to_add.append(or_cond[0]) cs.add_conditions(conditions_to_add) + + cs.add_forbidden_clauses(forbiddens_decoder_auto_regressive) if self.deepAR_decoder_name in available_decoders: deep_ar_hp = ':'.join([self.deepAR_decoder_prefix, self.deepAR_decoder_name, 'auto_regressive']) if deep_ar_hp in cs: @@ -301,6 +343,8 @@ def get_hyperparameter_search_space( else: forbidden = ForbiddenAndConjunction(ForbiddenEqualsClause(num_blocks, 2), forbidden_ar) cs.add_forbidden_clause(forbidden) + + return cs def set_hyperparameters(self, @@ -328,14 +372,16 @@ def 
set_hyperparameters(self, num_blocks = params['num_blocks'] variable_selection = params['variable_selection'] skip_connection = params['skip_connection'] + decoder_auto_regressive = params['decoder_auto_regressive'] del params['num_blocks'] del params['variable_selection'] del params['skip_connection'] + del params['decoder_auto_regressive'] pipeline_steps = [('net_structure', ForecastingNetworkStructure(random_state=self.random_state, num_blocks=num_blocks, variable_selection=variable_selection, - skip_connection=skip_connection,))] + skip_connection=skip_connection))] self.encoder_choice = [] self.decoder_choice = [] @@ -378,7 +424,12 @@ def set_hyperparameters(self, for param_name in decoder_params_names: del new_params[param_name] new_params['random_state'] = self.random_state + new_params['block_number'] = i decoder_params['random_state'] = self.random_state + decoder_params['block_number'] = i + # for mlp decoder, to avoid decoder's auto_regressive being overwritten by decoder_auto_regressive + if 'auto_regressive' not in decoder_params: + decoder_params['auto_regressive'] = decoder_auto_regressive encoder = self.get_components()[choice](**new_params) decoder = decoder_components[decoder_type](**decoder_params) pipeline_steps.extend([(f'encoder_{i}', encoder), (f'decoder_{i}', decoder)]) diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py index a8087f274..cbc6281e6 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py @@ -69,14 +69,8 @@ def build_single_proj_layer(arg_dim): """ if decoder_has_local_layer: - if auto_regressive: - unflatten_layer = [] - else: - # we need to unflatten the input from 2D to 3D such that local MLP can be applied to each prediction - # separately - unflatten_layer = [nn.Unflatten(-1, (n_prediction_heads, num_in_features))] - return nn.Sequential(*unflatten_layer, - nn.Linear(num_in_features, np.prod(output_shape).item() * arg_dim), + + return nn.Sequential(nn.Linear(num_in_features, np.prod(output_shape).item() * arg_dim), nn.Unflatten(-1, (*output_shape, arg_dim))) else: return nn.Sequential( From 59cee13dda34379d45ba930ccb967619b9471d40 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 28 Feb 2022 09:08:52 +0100 Subject: [PATCH 167/347] further API changes --- .../setup/network/forecasting_network.py | 82 ++++++---- ...transformer_util.py => components_util.py} | 52 ++++++- .../forecasting_decoder/MLPDecoder.py | 4 +- .../forecasting_decoder/NBEATSDecoder.py | 10 +- .../forecasting_decoder/RNNDecoder.py | 16 +- .../forecasting_decoder/TransformerDecoder.py | 15 +- .../base_forecasting_decoder.py | 68 ++++++--- .../forecasting_encoder/__init__.py | 20 +-- .../base_forecasting_encoder.py | 100 +++++++++++-- .../flat_encoder/MLPEncoder.py | 12 +- .../flat_encoder/NBEATSEncoder.py | 11 +- .../seq_encoder/InceptionTimeEncoder.py | 2 +- .../seq_encoder/RNNEncoder.py | 11 +- .../seq_encoder/TransformerEncoder.py | 12 +- .../seq_encoder/__init__.py | 140 +++++++++--------- .../forecasting_head.py | 12 +- 16 files changed, 364 insertions(+), 203 deletions(-) rename autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/{transformer_util.py => components_util.py} (51%) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py 
b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 6218b6dc7..d88240dd9 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -1,3 +1,4 @@ +from collections import OrderedDict from typing import Any, Dict, Optional, Union, Tuple, List from ConfigSpace.configuration_space import ConfigurationSpace @@ -20,7 +21,19 @@ from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. \ base_target_scaler import BaseTargetScaler from autoPyTorch.pipeline.components.setup.network_backbone.\ - forecasting_backbone.forecasting_encoder.base_forecasting_encoder import EncoderNetwork + forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( + EncoderNetwork, + NetworkStructure, + EncoderBlockInfo, + NetworkStructure, + EncoderProperties +) +from autoPyTorch.pipeline.components.setup.network_backbone.\ + forecasting_backbone.forecasting_decoder.base_forecasting_decoder import ( + DecoderBlockInfo, + DecoderProperties +) + from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter @@ -148,15 +161,14 @@ class ForecastingNet(nn.Module): future_target_required = False def __init__(self, + network_structure: NetworkStructure, network_embedding: nn.Module, # TODO consider embedding for past, future and static features - network_encoder: EncoderNetwork, - network_decoder: nn.Module, + network_encoder: Dict[str, EncoderBlockInfo], + network_decoder: Dict[str, DecoderBlockInfo], network_head: Optional[nn.Module], window_size: int, target_scaler: BaseTargetScaler, dataset_properties: Dict, - encoder_properties: Dict, - decoder_properties: Dict, output_type: str = 'regression', forecast_strategy: Optional[str] = 'mean', num_samples: Optional[int] = 100, @@ -185,11 +197,27 @@ def __init__(self, aggregation (str): how the samples are aggregated. We could take their mean or median values. 
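                An illustrative sketch (an editorial example, not code from this patch) of the
                sampling-based strategy described above: the distribution predicted by the head
                is drawn num_samples times and the draws are reduced with the chosen
                aggregation, roughly

                    samples = dist.sample((num_samples,))   # [num_samples, B, T, ...]
                    if aggregation == 'mean':
                        forecast = samples.mean(dim=0)
                    else:                                    # 'median'
                        forecast = samples.median(dim=0)[0]

                where dist stands for the fitted torch distribution; the exact predict logic
                of this class may differ in detail.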
""" super(ForecastingNet, self).__init__() + self.network_structure = network_structure self.embedding = network_embedding - self.encoder = network_encoder # type:EncoderNetwork - self.decoder = network_decoder self.head = network_head + encoders = OrderedDict() + decoders = OrderedDict() + + first_decoder = 0 + for i in range(1, network_structure.num_blocks + 1): + block_number = f'block_{i}' + encoders[block_number] = network_encoder[block_number].encoder + if block_number in decoders: + if first_decoder == 0: + first_decoder = block_number + decoders[block_number] = network_decoder[block_number].decoder + + if first_decoder == 0: + raise ValueError("At least one decoder must be specified!") + self.encoder = nn.ModuleDict(encoders) + self.decoder = nn.ModuleDict(decoders) + self.target_scaler = target_scaler self.n_prediction_steps = dataset_properties['n_prediction_steps'] # type: int @@ -200,17 +228,15 @@ def __init__(self, self.num_samples = num_samples self.aggregation = aggregation - if decoder_properties['has_hidden_states']: - if not encoder_properties['has_hidden_states']: - raise ValueError('when decoder contains hidden states, encoder must provide the hidden states ' - 'for decoder!') - self.encoder_has_hidden_states = encoder_properties['has_hidden_states'] - self.decoder_has_hidden_states = decoder_properties['has_hidden_states'] # self.mask_futur_features = decoder_properties['mask_future_features'] self._device = torch.device('cpu') - self.encoder_lagged_input = encoder_properties['lagged_input'] - self.decoder_lagged_input = decoder_properties['lagged_input'] + if not network_structure.variable_selection: + self.encoder_lagged_input = network_encoder['block_1'].encoder_properties.lagged_input + self.decoder_lagged_input = network_decoder[f'block_{first_decoder}'].decoder_properties.lagged_input + else: + self.encoder_lagged_input = False + self.decoder_lagged_input = False if self.encoder_lagged_input: self.cached_lag_mask_encoder = None @@ -267,7 +293,6 @@ def forward(self, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, hidden_states: Optional[Tuple[torch.Tensor]] = None): - # TODO We need to replace thus None with empty tensors to avoid checking if they are None every time! 
if self.encoder_lagged_input: past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler(past_targets[:, -self.window_size:]) past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) @@ -558,7 +583,7 @@ def __init__(self, """ super(ForecastingDeepARNet, self).__init__(**kwargs) # this determines the training targets - self.encoder_bijective_seq_output = kwargs['encoder_properties']['bijective_seq_output'] + self.encoder_bijective_seq_output = kwargs['network_encoder']['block_1'].encoder_properties.bijective_seq_output self.cached_lag_mask_encoder_test = None self.only_generate_future_dist = False @@ -816,14 +841,16 @@ def _required_fit_requirements(self): return [ FitRequirement('dataset_properties', (Dict,), user_defined=False, dataset_property=True), FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), + FitRequirement('network_structure', (Dict,), user_defined=False, dataset_property=False), FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), - FitRequirement("network_encoder", (torch.nn.Module,), user_defined=False, dataset_property=False), - FitRequirement("network_decoder", (torch.nn.Module,), user_defined=False, dataset_property=False), + FitRequirement("network_encoder", (Dict[str, EncoderBlockInfo]), user_defined=False, + dataset_property=False), + FitRequirement("network_decoder", (Dict[str, DecoderBlockInfo]), user_defined=False, + dataset_property=False), FitRequirement("network_head", (Optional[torch.nn.Module],), user_defined=False, dataset_property=False), FitRequirement("target_scaler", (BaseTargetScaler,), user_defined=False, dataset_property=False), FitRequirement("required_net_out_put_type", (str,), user_defined=False, dataset_property=False), - FitRequirement("encoder_properties", (Dict,), user_defined=False, dataset_property=False), - FitRequirement("decoder_properties", (Dict,), user_defined=False, dataset_property=False), + FitRequirement("encoder_properties_1", (Dict,), user_defined=False, dataset_property=False), ] def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: @@ -836,14 +863,17 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: f"loss function. 
However, net_out_type is {self.net_out_type} and " f"required_net_out_put_type is {X['required_net_out_put_type']}") - network_init_kwargs = dict(network_embedding=X['network_embedding'], - network_encoder=X['network_encoder'], - network_decoder=X['network_decoder'], + network_structure = X['network_structure'] + network_encoder = X['network_encoder'] + network_decoder = X['network_decoder'] + + network_init_kwargs = dict(network_structure=network_structure, + network_embedding=X['network_embedding'], + network_encoder=network_encoder, + network_decoder=network_decoder, network_head=X['network_head'], window_size=X['window_size'], dataset_properties=X['dataset_properties'], - encoder_properties=X['encoder_properties'], - decoder_properties=X['decoder_properties'], target_scaler=X['target_scaler'], output_type=self.net_out_type, forecast_strategy=self.forecast_strategy, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py similarity index 51% rename from autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py rename to autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py index 538f5ed9c..34f8e8ad3 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/transformer_util.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py @@ -1,7 +1,13 @@ -from typing import Dict, Any +from typing import Dict, Any, Optional import torch from torch import nn +from functools import partial +import torch.nn.functional as F import math +from pytorch_forecasting.models.temporal_fusion_transformer.sub_modules import ( + TimeDistributed, TimeDistributedInterpolation, GatedLinearUnit, ResampleNorm, AddNorm, GateAddNorm, + GatedResidualNetwork, VariableSelectionNetwork, InterpretableMultiHeadAttention +) def build_transformer_layers(d_model: int, config: Dict[str, Any], layer_type='encoder'): @@ -22,6 +28,50 @@ def build_transformer_layers(d_model: int, config: Dict[str, Any], layer_type='e raise ValueError('layer_type must be encoder or decoder!') +class TunableAddNorm(AddNorm): + def __init__(self, input_size: int, skip_size: int = None, trainable_add: bool = True, + layer_norm_eps: float = 1e-5): + super(TunableAddNorm, self).__init__(input_size, skip_size, trainable_add) + self.norm = nn.LayerNorm(self.input_size, eps=layer_norm_eps) + + +class TunableGateAddNorm(GateAddNorm): + def __init__(self, input_size: int, hidden_size: int = None, skip_size: int = None, trainable_add: bool = False, + dropout: Optional[float] = None, layer_norm_eps: float = 1e-5): + super().__init__(input_size, hidden_size, skip_size, trainable_add, dropout) + self.add_norm = TunableAddNorm(self.hidden_size, skip_size=self.skip_size, trainable_add=trainable_add, + layer_norm_eps=layer_norm_eps) + + +class TunableGatedResidualNetwork(GatedResidualNetwork): + def __init__(self, input_size: int, hidden_size: int, output_size: int, dropout: float = 0.1, + context_size: int = None, residual: bool = False, layer_norm_eps: float = 1e-5): + super().__init__(input_size, hidden_size, output_size, dropout, context_size, residual) + self.gate_norm = TunableGateAddNorm( + input_size=self.hidden_size, + skip_size=self.output_size, + hidden_size=self.output_size, + dropout=self.dropout, + trainable_add=False, + layer_norm_eps=layer_norm_eps 
+ ) + + +class InterpretableMultiAttentionEncoderLayer(nn.Module): + def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropout: float = 0.1, + layer_norm_eps: float = 1e-5, device=None, dtype=None) -> None: + self.multi_attention = InterpretableMultiHeadAttention(n_head=nhead, d_model=d_model, dropout=dropout) + self.post_attn_gate_norm = TunableGateAddNorm(input_size=d_model, + hidden_size=dim_feedforward, + dropout=dropout, + trainable_add=False, + layer_norm_eps=layer_norm_eps + ) + self.pos_wise_ff = TunableGatedResidualNetwork( + self.hparams.hidden_size, self.hparams.hidden_size, self.hparams.hidden_size, dropout=self.hparams.dropout + ) + + # https://github.com/pytorch/examples/blob/master/word_language_model/model.py class PositionalEncoding(nn.Module): r""" diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index 3056c436f..7229312e9 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -22,6 +22,7 @@ def __init__(self, local_layers: Optional[nn.Module], auto_regressive: bool = False ): + super().__init__() self.global_layers = global_layers self.local_layers = local_layers self.auto_regressive = auto_regressive @@ -86,7 +87,8 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT } def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: - X.update({'mlp_has_local_layer': self.config.get('has_local_layer', True)}) + if self.is_last_decoder: + X.update({'mlp_has_local_layer': self.config.get('has_local_layer', True)}) return super().transform(X) @property diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 885911ee3..8ed84c0fe 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -14,7 +14,7 @@ from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter from autoPyTorch.pipeline.components.setup.network_backbone.\ - forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder + forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder, DecoderProperties class NBEATSBLock(nn.Module): @@ -90,12 +90,8 @@ class NBEATSDecoder(BaseForecastingDecoder): fill_kwargs = {} @staticmethod - def decoder_properties(): - decoder_properties = BaseForecastingDecoder.decoder_properties() - decoder_properties.update({ - 'multi_blocks': True - }) - return decoder_properties + def decoder_properties() -> DecoderProperties: + return DecoderProperties(multi_blocks=True) def _build_decoder(self, encoder_output_shape: Tuple[int, ...], diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index 
e1175620f..d76de1071 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -13,7 +13,11 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.\ - forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder, DecoderNetwork + forecasting_backbone.forecasting_decoder.base_forecasting_decoder import ( + BaseForecastingDecoder, + DecoderNetwork, + DecoderProperties +) from autoPyTorch.utils.common import FitRequirement @@ -90,12 +94,10 @@ def fitted_encoder(self): return ['RNNEncoder'] @staticmethod - def decoder_properties(): - decoder_properties = BaseForecastingDecoder.decoder_properties() - decoder_properties.update({'has_hidden_states': True, - 'recurrent': True, - 'lagged_input': True, - }) + def decoder_properties() -> DecoderProperties: + decoder_properties = DecoderProperties(has_hidden_states=True, + recurrent=True, + lagged_input=True) return decoder_properties def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index 5f0910130..71493d16e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -17,9 +17,13 @@ from autoPyTorch.utils.common import add_hyperparameter from autoPyTorch.pipeline.components.setup.network_backbone.\ - forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder, DecoderNetwork + forecasting_backbone.forecasting_decoder.base_forecasting_decoder import ( + BaseForecastingDecoder, + DecoderNetwork, + DecoderProperties +) -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.transformer_util import \ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ PositionalEncoding, build_transformer_layers from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter, FitRequirement @@ -104,12 +108,7 @@ def _required_fit_requirements(self) -> List[FitRequirement]: @staticmethod def decoder_properties(): - decoder_properties = BaseForecastingDecoder.decoder_properties() - decoder_properties.update({'recurrent': True, - 'lagged_input': True, - 'mask_on_future_target': True, - }) - return decoder_properties + return DecoderProperties(recurrent=True, lagged_input=True, mask_on_future_target=True) def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.transformer_encoder_kwargs = X['transformer_encoder_kwargs'] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 602709fb3..11f113a75 100644 --- 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -1,5 +1,6 @@ from abc import abstractmethod, ABC -from typing import Any, Dict, Iterable, Tuple, List, Optional +from typing import Any, Dict, Iterable, Tuple, List, Optional, NamedTuple +from collections import OrderedDict import torch from torch import nn @@ -7,6 +8,24 @@ from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.utils.common import FitRequirement from autoPyTorch.pipeline.components.base_component import BaseEstimator, autoPyTorchComponent +from autoPyTorch.pipeline.components.setup.network_backbone.\ + forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( + EncoderBlockInfo, + NetworkStructure) + + +class DecoderProperties(NamedTuple): + has_hidden_states: bool = False + has_local_layer: bool = True + recurrent: bool = False + lagged_input: bool = False + multi_blocks: bool = False + mask_on_future_target: bool = False + + +class DecoderBlockInfo(NamedTuple): + decoder: nn.Module + decoder_properties: DecoderProperties class DecoderNetwork(nn.Module): @@ -37,39 +56,32 @@ def __init__(self, auto_regressive: bool = False, **kwargs: Dict[str, Any]): super().__init__() + self.block_number = block_number self.add_fit_requirements(self._required_fit_requirements) self.auto_regressive = auto_regressive self.config = kwargs self.decoder: Optional[nn.Module] = None self.n_decoder_output_features = None self.n_prediction_heads = 1 - self.block_number = block_number + self.is_last_decoder = False @property def _required_fit_requirements(self) -> List[FitRequirement]: return [ FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), - FitRequirement('network_encoder', (nn.Module,), user_defined=False, dataset_property=False), - FitRequirement('encoder_properties', (Dict,), user_defined=False, dataset_property=False), FitRequirement('future_feature_shapes', (Tuple,), user_defined=False, dataset_property=True), FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), - FitRequirement('seq_output_shape', (Tuple,), user_defined=False, dataset_property=False) + FitRequirement('network_encoder', (OrderedDict,), user_defined=False, dataset_property=False), + FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False) ] @property - def fitted_encoder(self): + def fitted_encoder(self) -> List[str]: return [] @staticmethod - def decoder_properties(): - decoder_properties = {'has_hidden_states': False, - 'has_local_layer': True, - 'recurrent': False, - 'lagged_input': False, - 'multi_blocks': False, - 'mask_on_future_target': False, - } - return decoder_properties + def decoder_properties() -> DecoderProperties: + return DecoderProperties() def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: """ @@ -85,7 +97,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: output_shape = X['dataset_properties']['output_shape'] static_features_shape = X["dataset_properties"]["static_features_shape"] - encoder_output_shape = X['encoder_output_shape'] + encoder_output_shape = X['network_encoder'][f'block_{self.block_number}'].encoder_output_shape_ auto_regressive = self.auto_regressive @@ -94,9 +106,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> 
BaseEstimator: else: self.n_prediction_heads = output_shape[0] - variable_selection = X.get("variable_selection", False) + network_structure = X['network_structure'] + variable_selection = network_structure.variable_selection future_feature_shapes = X['dataset_properties']['future_feature_shapes'] + if self.block_number == network_structure.num_blocks: + self.is_last_decoder = True + future_in_features = future_feature_shapes[-1] + static_features_shape if variable_selection: # TODO @@ -130,11 +146,19 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: (Dict[str, Any]): the updated 'X' dictionary """ - X.update({'decoder_properties': self.decoder_properties(), - 'network_decoder': self.decoder, - 'n_prediction_heads': self.n_prediction_heads, - 'n_decoder_output_features': self.n_decoder_output_features, - 'auto_regressive': self.auto_regressive}) + # 'auto_regressive' needs to be the same across all the decoders, + # 'n_prediction_heads' and 'n_decoder_output_features' are only applied to the head such that they could be + # overwritten by the following decoders + network_decoder = X.get('network_decoder', OrderedDict()) + network_decoder[f'block_{self.block_number}'] = DecoderBlockInfo(decoder=self.decoder, + decoder_properties=self.decoder_properties()) + if self.is_last_decoder: + X.update({f'network_decoder': network_decoder, + 'n_prediction_heads': self.n_prediction_heads, + 'n_decoder_output_features': self.n_decoder_output_features, + 'auto_regressive': self.auto_regressive}) + else: + X.update({f'network_decoder': network_decoder}) return X diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index 16091477d..0c526d9e9 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -1,6 +1,6 @@ import os from collections import OrderedDict -from typing import Dict, Optional, List, Any +from typing import Dict, Optional, List, Any, Type from abc import abstractmethod from sklearn.pipeline import Pipeline @@ -19,6 +19,7 @@ from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( BaseForecastingEncoder, + ForecastingNetworkStructure ) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import \ decoders, decoder_addons, add_decoder @@ -30,10 +31,6 @@ _addons = ThirdPartyComponents(BaseForecastingEncoder) -def add_encoder(encoder: BaseForecastingEncoder) -> None: - _addons.add_component(encoder) - - class AbstractForecastingEncoderChoice(autoPyTorchChoice): """ A network is composed of an encoder and decoder. 
In most of the case, the choice of decoder is heavily dependent on @@ -49,7 +46,7 @@ def __init__(self, self.decoder_choice = None @abstractmethod - def get_components(self) -> Dict[str, autoPyTorchComponent]: + def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: """Returns the available backbone components Args: @@ -60,10 +57,6 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: as choices for learning rate scheduling """ raise NotImplementedError - components = OrderedDict() - components.update(_encoders) - components.update(_addons.components) - return components def get_decoder_components(self) -> Dict[str, autoPyTorchComponent]: components = OrderedDict() @@ -82,7 +75,7 @@ def get_available_components( include: List[str] = None, exclude: List[str] = None, components: Optional[Dict[str, autoPyTorchComponent]] = None - ) -> Dict[str, autoPyTorchComponent]: + ) -> Dict[str, Type[autoPyTorchComponent]]: """Filters out components based on user provided include/exclude directives, as well as the dataset properties @@ -356,7 +349,10 @@ def set_hyperparameters(self, self.new_params = new_params self.choice = self.get_components()[choice](**new_params) self.decoder_choice = decoder_components[decoder_type](**decoder_params) - self.pipeline = Pipeline([('encoder', self.choice), ('decoder', self.decoder_choice)]) + + self.pipeline = Pipeline([('net_structure', ForecastingNetworkStructure(random_state=self.random_state)), + ('encoder', self.choice), + ('decoder', self.decoder_choice)]) return self @property diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index c36934353..42c5f5a9c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -1,23 +1,92 @@ import numpy as np +from collections import OrderedDict import pandas as pd - from scipy.sparse import csr_matrix import torch import torchvision +from ConfigSpace import ConfigurationSpace from autoPyTorch.utils.common import FitRequirement from torch import nn from abc import abstractmethod -from typing import Any, Dict, Iterable, Optional, Tuple, List +from typing import Any, Dict, Iterable, Optional, Tuple, List, Union, NamedTuple from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.pipeline.components.base_component import ( autoPyTorchComponent, ) +class EncoderProperties(NamedTuple): + has_hidden_states: bool = False + bijective_seq_output: bool = True + fixed_input_seq_length: bool = False + lagged_input: bool = False + + +class NetworkStructure(NamedTuple): + num_blocks: int = 1 + variable_selection: bool = False + skip_connection: bool = False + skip_connection_type: str = "add" + grn_dropout_rate: float = 0.0 + + +class EncoderBlockInfo(NamedTuple): + encoder: nn.Module + encoder_properties: EncoderProperties + encoder_output_shape_: Tuple[int, ...] 
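# Editorial sketch (not part of the patch): how the containers above are meant to fit
# together. Each encoder component contributes one EncoderBlockInfo keyed by its block
# name ('block_1', 'block_2', ...), which the forecasting network later consumes. The
# GRU module and the shapes below are placeholders chosen only for illustration.
def _example_network_encoder() -> Dict[str, EncoderBlockInfo]:
    encoder_info = EncoderBlockInfo(
        encoder=nn.GRU(input_size=8, hidden_size=32, batch_first=True),
        encoder_properties=EncoderProperties(has_hidden_states=True, lagged_input=True),
        encoder_output_shape_=(30, 32),
    )
    return OrderedDict(block_1=encoder_info)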
+ + +class ForecastingNetworkStructure(autoPyTorchComponent): + def __init__(self, random_state: Optional[np.random.RandomState] = None, + num_blocks: int = 1, + variable_selection: bool = False, + skip_connection: bool = False, + skip_connection_type: str = "add", + grn_dropout_rate: float = 0.0, + ) -> None: + super().__init__() + self.network_structure = NetworkStructure(num_blocks=num_blocks, + variable_selection=variable_selection, + skip_connection=skip_connection, + skip_connection_type=skip_connection_type, + grn_dropout_rate=grn_dropout_rate) + + def fit(self, X: Dict[str, Any], y: Any = None) -> "ForecastingNetworkStructure": + self.check_requirements(X, y) + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({ + 'network_structure': self.network_structure, + }) + return X + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + **kwargs: Any + ) -> ConfigurationSpace: + return ConfigurationSpace() + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'EarlyPreprocessing', + 'name': 'Early Preprocessing Node', + } + + def __str__(self) -> str: + """ Allow a nice understanding of what components where used """ + string = self.__class__.__name__ + return string + + class EncoderNetwork(nn.Module): def forward(self, x: torch.Tensor, output_seq: bool = False): """ @@ -56,7 +125,7 @@ def __init__(self, self.config = kwargs self.input_shape: Optional[Iterable] = None self.block_number = block_number - self.encoder_output_shape: Optional[Iterable] = None + self.encoder_output_shape: Optional[Tuple[int, ...]] = None @property def _required_fit_arguments(self) -> List[FitRequirement]: @@ -99,7 +168,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: if variable_selection: # TODO pass - elif self.encoder_properties()["lagged_input"] and hasattr(self, 'lagged_value'): + elif self.encoder_properties().lagged_input and hasattr(self, 'lagged_value'): in_features = len(self.lagged_value) * output_shape[-1] + input_shape[-1] + static_features_shape else: in_features = output_shape[-1] + input_shape[-1] + static_features_shape @@ -114,8 +183,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.input_shape = input_shape - has_hidden_states = self.encoder_properties().get("has_hidden_states", False) - self.encoder_output_shape = get_output_shape(input_shape, has_hidden_states) + has_hidden_states = self.encoder_properties().has_hidden_states + self.encoder_output_shape = get_output_shape(self.encoder, input_shape, has_hidden_states) return self @@ -125,10 +194,12 @@ def allowed_decoders(): def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X['dataset_properties'].update({'input_shape': self.input_shape}) - X.update({'network_encoder': self.encoder, - 'encoder_properties': self.encoder_properties(), - 'encoder_output_shape': self.encoder_output_shape - }) + network_encoder = X.get('network_encoder', OrderedDict()) + network_encoder[f'block_{self.block_number}'] = EncoderBlockInfo(encoder=self.encoder, + encoder_properties=self.encoder_properties(), + encoder_output_shape_=self.encoder_output_shape) + + X.update({f'network_encoder': network_encoder}) return X @abstractmethod @@ -147,7 +218,8 @@ def build_encoder(self, """ raise NotImplementedError() - def encoder_properties(self): + @staticmethod + def 
encoder_properties(self) -> EncoderProperties: """ Encoder properties, this determines how the data flows over the forecasting networks @@ -160,9 +232,5 @@ def encoder_properties(self): implemented in gluonTS: https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/torch/model/deepar/module.py """ - encoder_properties = {'has_hidden_states': False, - 'bijective_seq_output': True, - 'fixed_input_seq_length': False, - 'lagged_input': False, - } + encoder_properties = EncoderProperties() return encoder_properties diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py index ef40104df..b1e29b906 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py @@ -8,7 +8,7 @@ from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( - BaseForecastingEncoder, EncoderNetwork + BaseForecastingEncoder, EncoderNetwork, EncoderProperties ) from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType @@ -61,13 +61,9 @@ class MLPEncoder(BaseForecastingEncoder, MLPBackbone): _fixed_seq_length = True window_size = 1 - def encoder_properties(self): - encoder_properties = super().encoder_properties() - encoder_properties.update({ - 'bijective_seq_output': False, - 'fixed_input_seq_length': True, - }) - return encoder_properties + @staticmethod + def encoder_properties() -> EncoderProperties: + return EncoderProperties(bijective_seq_output=False, fixed_input_seq_length=True) @staticmethod def allowed_decoders(): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py index 64498fda2..c7a438f03 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py @@ -5,7 +5,7 @@ from ConfigSpace import ConfigurationSpace from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( - BaseForecastingEncoder + BaseForecastingEncoder, EncoderProperties ) from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType @@ -22,12 +22,9 @@ class NBEATSEncoder(BaseForecastingEncoder): _fixed_seq_length = True window_size = 1 - def encoder_properties(self): - encoder_properties = super().encoder_properties() - encoder_properties.update({ - 'fixed_input_seq_length': True, - }) - return encoder_properties + @staticmethod + def encoder_properties() -> EncoderProperties: + return EncoderProperties(fixed_input_seq_length=True) @staticmethod def allowed_decoders(): diff --git 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py index f3cc6e981..5f6ec53bb 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py @@ -10,7 +10,7 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( - BaseForecastingEncoder + BaseForecastingEncoder, ) from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py index 917ed0d2f..5a6a9f594 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py @@ -13,7 +13,7 @@ from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( - BaseForecastingEncoder, EncoderNetwork + BaseForecastingEncoder, EncoderNetwork, EncoderProperties ) from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter @@ -92,12 +92,9 @@ def allowed_decoders(): """ return ['MLPDecoder', 'RNNDecoder'] - def encoder_properties(self): - encoder_properties = super().encoder_properties() - encoder_properties.update({'has_hidden_states': True, - 'lagged_input': True, - }) - return encoder_properties + @staticmethod + def encoder_properties() -> EncoderProperties: + return EncoderProperties(has_hidden_states=True, lagged_input=True) def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: if 'lagged_value' in X['dataset_properties']: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index 254f2be28..90652c38c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -13,10 +13,10 @@ from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( - BaseForecastingEncoder, EncoderNetwork + BaseForecastingEncoder, EncoderNetwork, EncoderProperties ) from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter -from 
autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.transformer_util import \ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ PositionalEncoding, build_transformer_layers @@ -97,11 +97,9 @@ def allowed_decoders(): """ return ['MLPDecoder', 'TransformerDecoder'] - def encoder_properties(self): - encoder_properties = super().encoder_properties() - encoder_properties.update({'lagged_input': True, - }) - return encoder_properties + @staticmethod + def encoder_properties(): + return EncoderProperties(lagged_input=True) def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: if 'lagged_value' in X['dataset_properties']: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index 313edb89f..87af5a6bb 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -4,7 +4,11 @@ import numpy as np from sklearn.pipeline import Pipeline -from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformIntegerHyperparameter, + UniformFloatHyperparameter +) from ConfigSpace.configuration_space import ConfigurationSpace, Configuration from ConfigSpace.conditions import ( EqualsCondition, OrConjunction, GreaterThanCondition, NotEqualsCondition, AndConjunction @@ -19,19 +23,13 @@ find_components, ) from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( - BaseForecastingEncoder, -) -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import \ - decoders, decoder_addons, add_decoder from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import \ AbstractForecastingEncoderChoice from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. 
\ - base_forecasting_encoder import BaseForecastingEncoder + base_forecasting_encoder import BaseForecastingEncoder, ForecastingNetworkStructure directory = os.path.split(__file__)[0] _encoders = find_components(__package__, @@ -44,50 +42,6 @@ def add_encoder(encoder: BaseForecastingEncoder) -> None: _addons.add_component(encoder) -class ForecastingNetworkStructure(autoPyTorchComponent): - def __init__(self, random_state: Optional[np.random.RandomState] = None, - num_blocks: int = 1, - variable_selection: bool = False, - skip_connection: bool = False) -> None: - super().__init__() - self.num_blocks = num_blocks - self.random_state = random_state - self.variable_selection = variable_selection - self.skip_connection = skip_connection - - def fit(self, X: Dict[str, Any], y: Any = None) -> "ForecastingNetworkStructure": - self.check_requirements(X, y) - return self - - def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: - X.update({ - 'num_blocks': self.num_blocks, - 'variable_selection': self.variable_selection, - 'skip_connection': self.skip_connection, - }) - return X - - @staticmethod - def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - **kwargs: Any - ) -> ConfigurationSpace: - return ConfigurationSpace() - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'EarlyPreprocessing', - 'name': 'Early Preprocessing Node', - } - - def __str__(self) -> str: - """ Allow a nice understanding of what components where used """ - string = self.__class__.__name__ - return string - - class SeqForecastingEncoderChoice(AbstractForecastingEncoderChoice): deepAR_decoder_name = 'MLPDecoder' deepAR_decoder_prefix = 'block_1' @@ -126,6 +80,17 @@ def get_hyperparameter_search_space( skip_connection: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="skip_connection", value_range=(True, False), default_value=False), + skip_connection_type: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="skip_connection_type", + value_range=("add", "grn"), + default_value="grn", + ), + grn_use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="grn_use_dropout", + value_range=(True, False), + default_value=True), + grn_dropout_rate: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='grn_dropout_rate', + value_range=(0.0, 0.8), + default_value=0.1), default: Optional[str] = None, include: Optional[List[str]] = None, exclude: Optional[List[str]] = None, @@ -134,13 +99,19 @@ def get_hyperparameter_search_space( Args: dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on - num_blocks: HyperparameterSearchSpace: number of encoder-decoder structure blocks - variable_selection: HyperparameterSearchSpace: if variable selection is applied, if True, then the first - block will be attached with a variable selection block while the following will be enriched with static - features. + num_blocks (HyperparameterSearchSpace): number of encoder-decoder structure blocks + variable_selection (HyperparameterSearchSpace): if variable selection is applied, if True, then the first + block will be attached with a variable selection block while the following will be enriched with static + features. 
skip_connection: HyperparameterSearchSpace: if skip connection is applied + skip_connection_type (HyperparameterSearchSpace): skip connection type, it could be directly added or a grn + network ( + Lim et al, Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting: + https://arxiv.org/abs/1912.09363) TODO consider hidden size of grn as a new HP + grn_use_dropout (HyperparameterSearchSpace): if dropout layer is applied to grn + grn_dropout_rate (HyperparameterSearchSpace): dropout rate of grn decoder_auto_regressive: HyperparameterSearchSpace: if decoder is auto_regressive, e.g., if the decoder - receives the output of its input, this only works for auto_regressive decoder models + receives the output as its input, this only works for auto_regressive decoder models default (Optional[str]): Default backbone to use include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive list, and will exclusively use this components. @@ -158,14 +129,33 @@ def get_hyperparameter_search_space( future_feature_shapes = dataset_properties.get("future_feature_shapes", (0,)) cs = ConfigurationSpace() - add_hyperparameter(cs, skip_connection, CategoricalHyperparameter) min_num_blocks, max_num_blocks = num_blocks.value_range variable_selection = get_hyperparameter(variable_selection, CategoricalHyperparameter) decoder_auto_regressive = get_hyperparameter(decoder_auto_regressive, CategoricalHyperparameter) num_blocks = get_hyperparameter(num_blocks, UniformIntegerHyperparameter) - cs.add_hyperparameters([num_blocks, decoder_auto_regressive, variable_selection]) + + skip_connection = get_hyperparameter(skip_connection, CategoricalHyperparameter) + + hp_network_structures = [num_blocks, decoder_auto_regressive, variable_selection, skip_connection] + cond_skip_connections = [] + if True in skip_connection.choices: + skip_connection_type = get_hyperparameter(skip_connection_type, CategoricalHyperparameter) + hp_network_structures.append(skip_connection_type) + cond_skip_connections.append(EqualsCondition(skip_connection_type, skip_connection, True)) + if 'grn' in skip_connection_type.choices: + grn_use_dropout = get_hyperparameter(grn_use_dropout, CategoricalHyperparameter) + hp_network_structures.append(grn_use_dropout) + cond_skip_connections.append(EqualsCondition(grn_use_dropout, skip_connection_type, "grn")) + if True in grn_use_dropout.choices: + grn_dropout_rate = get_hyperparameter(grn_dropout_rate, UniformFloatHyperparameter) + hp_network_structures.append(grn_dropout_rate) + cond_skip_connections.append(EqualsCondition(grn_dropout_rate, grn_use_dropout, True)) + + cs.add_hyperparameters(hp_network_structures) + if cond_skip_connections: + cs.add_conditions(cond_skip_connections) if static_features_shape + future_feature_shapes[-1] == 0: if False in variable_selection.choices and True in decoder_auto_regressive.choices: @@ -268,6 +258,10 @@ def get_hyperparameter_search_space( if i == 1 and decoder_name == self.deepAR_decoder_name: # TODO this is only a temporary solution, a fix on ConfigSpace needs to be implemented updates['can_be_auto_regressive'] = True + if decoder_name == "MLPDecoder" and i < int(max_num_blocks): + updates['has_local_layer'] = HyperparameterSearchSpace('has_local_layer', + value_range=(True,), + default_value=True) config_space = available_decoders[decoder_name].get_hyperparameter_search_space(dataset_properties, # type: ignore **updates) @@ -285,7 +279,7 @@ def get_hyperparameter_search_space( config_space, # 
parent_hyperparameter=parent_hyperparameter ) - if not available_decoders[decoder_name].decoder_properties()["recurrent"]: + if not available_decoders[decoder_name].decoder_properties().recurrent: hp_encoder_choice = cs.get_hyperparameter(block_prefix + '__choice__') for encoder_single in encoder_with_single_decoder: if encoder_single in hp_encoder_choice.choices: @@ -344,7 +338,9 @@ def get_hyperparameter_search_space( forbidden = ForbiddenAndConjunction(ForbiddenEqualsClause(num_blocks, 2), forbidden_ar) cs.add_forbidden_clause(forbidden) + import pdb + pdb.set_trace() return cs def set_hyperparameters(self, @@ -368,20 +364,30 @@ def set_hyperparameters(self, new_params = {} params = configuration.get_dictionary() - num_blocks = params['num_blocks'] - variable_selection = params['variable_selection'] - skip_connection = params['skip_connection'] decoder_auto_regressive = params['decoder_auto_regressive'] + forecasting_structure_kwargs = dict(num_blocks=num_blocks, + variable_selection=params['variable_selection'], + skip_connection=params['skip_connection'], + decoder_auto_regressive=decoder_auto_regressive, ) + del params['num_blocks'] del params['variable_selection'] del params['skip_connection'] del params['decoder_auto_regressive'] - pipeline_steps = [('net_structure', ForecastingNetworkStructure(random_state=self.random_state, - num_blocks=num_blocks, - variable_selection=variable_selection, - skip_connection=skip_connection))] + if 'skip_connection_type' in params: + forecasting_structure_kwargs['skip_connection_type'] = params['skip_connection_type'] + del params['skip_connection_type'] + if 'grn_use_dropout' in params: + del params['grn_use_dropout'] + if 'grn_dropout_rate' in params: + forecasting_structure_kwargs['grn_dropout_rate'] = params['grn_dropout_rate'] + del params['grn_dropout_rate'] + else: + forecasting_structure_kwargs['grn_dropout_rate'] = 0.0 + + pipeline_steps = [('net_structure', ForecastingNetworkStructure(**forecasting_structure_kwargs))] self.encoder_choice = [] self.decoder_choice = [] diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 151c9ad3a..cd6993f02 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -7,6 +7,8 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ + base_forecasting_decoder import DecoderBlockInfo from autoPyTorch.pipeline.components.setup.network_head.base_network_head import NetworkHeadComponent from autoPyTorch.utils.common import FitRequirement from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import \ @@ -36,10 +38,10 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), FitRequirement('auto_regressive', (bool,), user_defined=False, dataset_property=False), - FitRequirement('decoder_properties', (Dict,), user_defined=False, dataset_property=False), FitRequirement('n_decoder_output_features', (int,), 
user_defined=False, dataset_property=False), + FitRequirement('network_decoder', (Dict,), user_defined=False, dataset_property=False), FitRequirement('n_prediction_heads', (int,), user_defined=False, dataset_property=False), - FitRequirement('output_shape', (Iterable, int), user_defined=True, dataset_property=True), + FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), ] def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: @@ -58,10 +60,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.required_net_out_put_type = X['required_net_out_put_type'] - if X['decoder_properties']['multi_blocks']: + if 'block_1' in X['network_decoder'] and X['network_decoder']['block_1'].decoder_properties.multi_blocks: # if the decoder is a stacked block, we directly build head inside the decoder - if X.get('network_decoder', None) is None: - raise ValueError("when decoder has multi_blocks, it must be specified!") if self.required_net_out_put_type != 'regression': raise ValueError("decoder with multi block structure only allow regression loss!") self.output_shape = output_shape @@ -112,7 +112,7 @@ def build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], auto_regressive: bool = False, - decoder_has_local_layer: bool =True, + decoder_has_local_layer: bool = True, dist_cls: Optional[str] = None, n_prediction_heads: int = 1) -> nn.Module: """ From b2b5580e62ce087022c92566f725a2e971cc82cd Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 28 Feb 2022 20:43:44 +0100 Subject: [PATCH 168/347] tft temporal fusion decoder --- autoPyTorch/datasets/time_series_dataset.py | 4 +- .../setup/network/forecasting_architecture.py | 813 ++++++++++++++++++ .../setup/network/forecasting_network.py | 785 +---------------- .../forecasting_backbone/components_util.py | 211 ++++- .../forecasting_decoder/RNNDecoder.py | 3 +- .../base_forecasting_encoder.py | 10 +- .../flat_encoder/MLPEncoder.py | 4 + .../flat_encoder/NBEATSEncoder.py | 4 + .../seq_encoder/InceptionTimeEncoder.py | 4 +- .../seq_encoder/RNNEncoder.py | 3 + .../seq_encoder/TCNEncoder.py | 4 + .../seq_encoder/TransformerEncoder.py | 8 +- .../seq_encoder/__init__.py | 43 +- .../forecasting_head.py | 126 ++- .../training/data_loader/time_series_util.py | 25 +- .../pipeline/time_series_forecasting.py | 10 +- 16 files changed, 1182 insertions(+), 875 deletions(-) create mode 100644 autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index cb154b20e..f53e17622 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -147,7 +147,9 @@ def __getitem__(self, index: int, train: bool = True) \ "past_features": past_features, "future_features": future_features, "static_features": self.static_features, - "mase_coefficient": self.mase_coefficient}, targets_future + "mase_coefficient": self.mase_coefficient, + 'encoder_length': past_target.shape[0], + 'decoder_length': targets_future.shape[0]}, targets_future def __len__(self) -> int: return self.Y.shape[0] if self.only_has_past_targets else self.Y.shape[0] - self.n_prediction_steps diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py new file mode 100644 index 000000000..0e0b215ac --- /dev/null +++ 
b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -0,0 +1,813 @@ +from collections import OrderedDict +from typing import Any, Dict, Optional, Union, Tuple, List + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter +from ConfigSpace.conditions import EqualsCondition + +import numpy as np + +import torch +from torch import nn +import warnings + +from torch.distributions import ( + AffineTransform, + TransformedDistribution, +) + +from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. \ + base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.network_backbone.\ + forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( + EncoderNetwork, + NetworkStructure, + EncoderBlockInfo, + NetworkStructure, + EncoderProperties +) +from autoPyTorch.pipeline.components.setup.network_backbone.\ + forecasting_backbone.forecasting_decoder.base_forecasting_decoder import ( + DecoderBlockInfo, + DecoderProperties +) + +class TransformedDistribution_(TransformedDistribution): + """ + We implement the mean function such that we do not need to enquire base mean every time + """ + + @property + def mean(self): + mean = self.base_dist.mean + for transform in self.transforms: + mean = transform(mean) + return mean + + +def get_lagged_subsequences( + sequence: torch.Tensor, + subsequences_length: int, + lags_seq: Optional[List[int]] = None, + mask: Optional[torch.Tensor] = None +) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """ + Returns lagged subsequences of a given sequence, this allows the model to receive the input from the past targets + outside the sliding windows. This implementation is similar to gluonTS's implementation + the only difference is that we pad the sequence that is not long enough + + Parameters + ---------- + sequence : Tensor + the sequence from which lagged subsequences should be extracted. + Shape: (N, T, C). + subsequences_length : int + length of the subsequences to be extracted. + lags_seq: Optional[List[int]] + lags of the sequence, indicating the sequence that needs to be extracted + lag_mask: Optional[torch.Tensor] + a mask tensor indicating + + Returns + -------- + lagged : Tensor + a tensor of shape (N, S, I * C), where S = subsequences_length and + I = len(indices), containing lagged subsequences. 
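    Example
    -------
    A toy shape illustration (illustrative values only, assuming a single batch
    with twelve time steps and one feature):

        >>> seq = torch.arange(12.).reshape(1, 12, 1)   # (N=1, T=12, C=1)
        >>> lagged, mask = get_lagged_subsequences(seq, subsequences_length=3, lags_seq=[0, 6])
        >>> lagged.shape                                # S=3 steps, I=2 lags, C=1 feature
        torch.Size([1, 3, 2])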
+ """ + batch_size = sequence.shape[0] + num_features = sequence.shape[2] + if mask is None: + if lags_seq is None: + warnings.warn('Neither lag_mask or lags_seq is given, we simply return the input value') + return sequence, None + # generate mask + num_lags = len(lags_seq) + + # build a mask + mask_length = max(lags_seq) + subsequences_length + mask = torch.zeros((num_lags, mask_length), dtype=torch.bool) + for i, lag_index in enumerate(lags_seq): + begin_index = -lag_index - subsequences_length + end_index = -lag_index if lag_index > 0 else None + mask[i, begin_index: end_index] = True + else: + num_lags = mask.shape[0] + mask_length = mask.shape[1] + + mask_extend = mask.clone() + + if mask_length > sequence.shape[1]: + sequence = torch.cat([sequence.new_zeros([batch_size, mask_length - sequence.shape[1], num_features]), + sequence], dim=1) + elif mask_length < sequence.shape[1]: + mask_extend = torch.cat([mask.new_zeros([num_lags, sequence.shape[1] - mask_length]), mask_extend], dim=1) + # (N, 1, T, C) + sequence = sequence.unsqueeze(1) + + # (I, T, 1) + mask_extend = mask_extend.unsqueeze(-1) + + # (N, I, S, C) + lagged_seq = torch.masked_select(sequence, mask_extend).reshape(batch_size, num_lags, subsequences_length, -1) + + lagged_seq = torch.transpose(lagged_seq, 1, 2).reshape(batch_size, subsequences_length, -1) + + return lagged_seq, mask + + +def get_lagged_subsequences_inference( + sequence: torch.Tensor, + subsequences_length: int, + lags_seq: Optional[List[int]] = None, ): + """ + this function works exactly the same as get_lagged_subsequences. However, this implementation is faster when no + cached value is available, thus it more suitable during inference times. + + designed for doing inference for DeepAR, the core idea is to use + """ + sequence_length = sequence.shape[1] + batch_size = sequence.shape[0] + lagged_values = [] + for lag_index in lags_seq: + begin_index = -lag_index - subsequences_length + end_index = -lag_index if lag_index > 0 else None + if end_index is not None and end_index < -sequence_length: + lagged_values.append(torch.zeros([batch_size, subsequences_length, *sequence.shape[2:]])) + continue + if begin_index < -sequence_length: + if end_index is not None: + pad_shape = [batch_size, subsequences_length - sequence_length - end_index, *sequence.shape[2:]] + lagged_values.append(torch.cat([torch.zeros(pad_shape), sequence[:, :end_index, ...]], dim=1)) + else: + pad_shape = [batch_size, subsequences_length - sequence_length, *sequence.shape[2:]] + lagged_values.append(torch.cat([torch.zeros(pad_shape), sequence], dim=1)) + continue + else: + lagged_values.append(sequence[:, begin_index:end_index, ...]) + + lagged_seq = torch.stack(lagged_values, -1).transpose(-1, -2).reshape(batch_size, subsequences_length, -1) + return lagged_seq + + +class ForecastingNet(nn.Module): + future_target_required = False + + def __init__(self, + network_structure: NetworkStructure, + network_embedding: nn.Module, # TODO consider embedding for past, future and static features + network_encoder: Dict[str, EncoderBlockInfo], + network_decoder: Dict[str, DecoderBlockInfo], + network_head: Optional[nn.Module], + window_size: int, + target_scaler: BaseTargetScaler, + dataset_properties: Dict, + output_type: str = 'regression', + forecast_strategy: Optional[str] = 'mean', + num_samples: Optional[int] = 100, + aggregation: Optional[str] = 'mean' + ): + """ + This is a basic forecasting network. 
It is only composed of a embedding net, an encoder and a head (including + MLP decoder and the final head). + + This structure is active when the decoder is a MLP with auto_regressive set as false + + Args: + network_embedding (nn.Module): network embedding + network_encoder (EncoderNetwork): Encoder network, could be selected to return a sequence or a + network_decoder (nn.Module): network decoder + network_head (nn.Module): network head, maps the output of decoder to the final output + dataset_properties (Dict): dataset properties + encoder_properties (Dict): encoder properties + decoder_properties: (Dict): decoder properties + output_type (str): the form that the network outputs. It could be regression, distribution and + (TODO) quantile + forecast_strategy (str): only valid if output_type is distribution or quantile, how the network transforms + its output to predicted values, could be mean or sample + num_samples (int): only valid if output_type is not regression and forecast_strategy is sample. this + indicates the number of the points to sample when doing prediction + aggregation (str): how the samples are aggregated. We could take their mean or median values. + """ + super(ForecastingNet, self).__init__() + self.network_structure = network_structure + self.embedding = network_embedding + self.head = network_head + + encoders = OrderedDict() + decoders = OrderedDict() + + first_decoder = 0 + for i in range(1, network_structure.num_blocks + 1): + block_number = f'block_{i}' + encoders[block_number] = network_encoder[block_number].encoder + if block_number in decoders: + if first_decoder == 0: + first_decoder = block_number + decoders[block_number] = network_decoder[block_number].decoder + + if first_decoder == 0: + raise ValueError("At least one decoder must be specified!") + self.encoder = nn.ModuleDict(encoders) + self.decoder = nn.ModuleDict(decoders) + + self.target_scaler = target_scaler + + self.n_prediction_steps = dataset_properties['n_prediction_steps'] # type: int + self.window_size = window_size + + self.output_type = output_type + self.forecast_strategy = forecast_strategy + self.num_samples = num_samples + self.aggregation = aggregation + + # self.mask_futur_features = decoder_properties['mask_future_features'] + self._device = torch.device('cpu') + + if not network_structure.variable_selection: + self.encoder_lagged_input = network_encoder['block_1'].encoder_properties.lagged_input + self.decoder_lagged_input = network_decoder[f'block_{first_decoder}'].decoder_properties.lagged_input + else: + self.encoder_lagged_input = False + self.decoder_lagged_input = False + + if self.encoder_lagged_input: + self.cached_lag_mask_encoder = None + if self.decoder_lagged_input: + self.cached_lag_mask_decoder = None + + @property + def device(self): + return self._device + + @device.setter + def device(self, device: torch.device): + self.to(device) + self._device = device + + def rescale_output(self, + outputs: Union[torch.distributions.Distribution, torch.Tensor], + loc: Optional[torch.Tensor], + scale: Optional[torch.Tensor], + device: torch.device = torch.device('cpu')): + if loc is not None or scale is not None: + if isinstance(outputs, torch.distributions.Distribution): + transform = AffineTransform(loc=0.0 if loc is None else loc.to(device), + scale=1.0 if scale is None else scale.to(device), + ) + outputs = TransformedDistribution_(outputs, [transform]) + else: + if loc is None: + outputs = outputs * scale.to(device) + elif scale is None: + outputs = outputs + loc.to(device) + 
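                # In the remaining case both loc and scale are available, so tensor outputs are
                # un-scaled as outputs * scale + loc; distribution outputs were already handled
                # above by wrapping them in an AffineTransform, so their mean and samples are
                # un-scaled lazily.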
else: + outputs = outputs * scale.to(device) + loc.to(device) + return outputs + + def scale_value(self, + outputs: Union[torch.distributions.Distribution, torch.Tensor], + loc: Optional[torch.Tensor], + scale: Optional[torch.Tensor], + device: torch.device = torch.device('cpu')): + if loc is not None or scale is not None: + if loc is None: + outputs = outputs / scale.to(device) + elif scale is None: + outputs = outputs - loc.to(device) + else: + outputs = (outputs - loc.to(device)) / scale.to(device) + return outputs + + def forward(self, + past_targets: torch.Tensor, + future_targets: Optional[torch.Tensor] = None, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None, + hidden_states: Optional[Tuple[torch.Tensor]] = None): + if self.encoder_lagged_input: + past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler(past_targets[:, -self.window_size:]) + past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) + x_past, self.cached_lag_mask_encoder = get_lagged_subsequences(past_targets, + self.window_size, + self.encoder.lagged_value, + self.cached_lag_mask_encoder) + else: + if self.window_size < past_targets.shape[1]: + past_targets = past_targets[:, -self.window_size:] + past_targets, _, loc, scale = self.target_scaler(past_targets) + x_past = past_targets + + if past_features is not None: + x_past = torch.cat([past_features, x_past], dim=1) + + x_past = x_past.to(device=self.device) + x_past = self.embedding(x_past) + + if self.encoder_has_hidden_states: + x_past, _ = self.encoder(x_past) + else: + x_past = self.encoder(x_past) + x_past = self.decoder(x_past) + output = self.head(x_past) + return self.rescale_output(output, loc, scale, self.device) + + def pred_from_net_output(self, net_output): + if self.output_type == 'regression': + return net_output + elif self.output_type == 'distribution': + if self.forecast_strategy == 'mean': + if isinstance(net_output, list): + return torch.cat([dist.mean for dist in net_output], dim=-2) + else: + return net_output.mean + elif self.forecast_strategy == 'sample': + if isinstance(net_output, list): + samples = torch.cat([dist.sample((self.num_samples,)) for dist in net_output], dim=-2) + else: + samples = net_output.sample((self.num_samples,)) + if self.aggregation == 'mean': + return torch.mean(samples, dim=0) + elif self.aggregation == 'median': + return torch.median(samples, 0)[0] + else: + raise ValueError(f'Unknown aggregation: {self.aggregation}') + else: + raise ValueError(f'Unknown forecast_strategy: {self.forecast_strategy}') + else: + raise ValueError(f'Unknown output_type: {self.output_type}') + + def predict(self, + past_targets: torch.Tensor, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None + ): + net_output = self(past_targets, past_features) + return self.pred_from_net_output(net_output) + + +class ForecastingSeq2SeqNet(ForecastingNet): + future_target_required = True + """ + Forecasting network with Seq2Seq structure, Encoder/ Decoder need to be the same recurrent models while + + This structure is activate when the decoder is recurrent (RNN or transformer). + We train the network with teacher forcing, thus + future_targets is required for the network. 
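    (With teacher forcing, the ground-truth target of step t, rather than the model's own
    prediction, is fed as the decoder input for step t + 1 during training.)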
To train the network, past targets and past features are fed to the + encoder to obtain the hidden states whereas future targets and future features. + When the output type is distribution and forecast_strategy is sampling, this model is equivalent to a deepAR model + during inference. + """ + + def __init__(self, **kwargs): + super(ForecastingSeq2SeqNet, self).__init__(**kwargs) + self.mask_on_future_target = kwargs['decoder_properties']['mask_on_future_target'] + if self.mask_on_future_target: + self.tgt_mask = nn.Transformer.generate_square_subsequent_mask(self.n_prediction_steps) + + def forward(self, + past_targets: torch.Tensor, + future_targets: Optional[torch.Tensor] = None, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None, + hidden_states: Optional[Tuple[torch.Tensor]] = None): + if self.encoder_lagged_input: + past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler(past_targets[:, -self.window_size:]) + past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) + x_past, self.cached_lag_mask_encoder = get_lagged_subsequences(past_targets, + self.window_size, + self.encoder.lagged_value, + self.cached_lag_mask_encoder) + else: + if self.window_size < past_targets.shape[1]: + past_targets = past_targets[:, -self.window_size:] + past_targets, _, loc, scale = self.target_scaler(past_targets) + x_past = past_targets + + x_past = x_past if past_features is None else torch.cat([past_features, x_past], dim=-1) + + x_past = x_past.to(self.device) + x_past = self.embedding(x_past) + + if self.training: + # we do one step ahead forecasting + if self.decoder_lagged_input: + future_targets = torch.cat([past_targets, future_targets[:, :-1, :]], dim=1) + future_targets, self.cached_lag_mask_decoder = get_lagged_subsequences(future_targets, + self.n_prediction_steps, + self.decoder.lagged_value, + self.cached_lag_mask_decoder) + else: + future_targets = torch.cat([past_targets[:, [-1], :], future_targets[:, :-1, :]], dim=1) + + x_future = future_targets if future_features is None else torch.cat([future_features, future_targets], + dim=-1) + x_future = x_future.to(self.device) + + if self.encoder_has_hidden_states: + # RNN + _, features_latent = self.encoder(x_past, output_seq=True) + x_future, _ = self.decoder(x_future, features_latent) + elif self.mask_on_future_target: + features_latent = self.encoder(x_past, output_seq=True) + x_future = self.decoder(x_future, features_latent, tgt_mask=self.tgt_mask.to(self.device)) + else: + raise NotImplementedError + net_output = self.head(x_future) + + return self.rescale_output(net_output, loc, scale, self.device) + else: + if self.encoder_has_hidden_states: + _, features_latent = self.encoder(x_past, output_seq=True) + else: + features_latent = self.encoder(x_past, output_seq=True) + + if future_features is not None: + future_features = future_features + + if self.forecast_strategy != 'sample': + all_predictions = [] + predicted_target = past_targets[:, [-1]] + past_targets = past_targets[:, :-1] + for idx_pred in range(self.n_prediction_steps): + if self.decoder_lagged_input: + x_future = torch.cat([past_targets, predicted_target.cpu()], dim=1) + if self.decoder_has_hidden_states: + x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder.lagged_value) + else: + x_future = get_lagged_subsequences_inference(x_future, idx_pred + 1, + self.decoder.lagged_value) + else: + if 
self.decoder_has_hidden_states: + x_future = predicted_target[:, [-1]] + else: + x_future = predicted_target + + if self.decoder_has_hidden_states: + x_future = x_future if future_features is None else torch.cat( + [future_features[:, [idx_pred], :], x_future], dim=-1) + else: + x_future = x_future if future_features is None else torch.cat( + [future_features[:, idx_pred + 1, :], x_future], dim=-1) + + x_future = x_future.to(self.device) + if self.decoder_has_hidden_states: + x_future, features_latent = self.decoder(x_future, features_latent=features_latent) + else: + x_future = self.decoder(x_future, features_latent) + + net_output = self.head(x_future[:, -1:, ]) + predicted_target = torch.cat([predicted_target, self.pred_from_net_output(net_output).cpu()], + dim=1) + + all_predictions.append(net_output) + + if self.output_type != 'distribution': + all_predictions = torch.cat(all_predictions, dim=1) + else: + all_predictions = self.pred_from_net_output(all_predictions) + + return self.rescale_output(all_predictions, loc, scale, self.device) + + else: + # we follow the DeepAR implementation: + all_samples = [] + batch_size = past_targets.shape[0] + + if self.encoder_has_hidden_states: + + if isinstance(features_latent, tuple): + repeated_state = [ + s.repeat_interleave(repeats=self.num_samples, dim=1) + for s in features_latent + ] + else: + repeated_state = features_latent.repeat_interleave(repeats=self.num_samples, dim=1) + else: + # Transformer's hidden states is of shape + repeated_state = features_latent.repeat_interleave(repeats=self.num_samples, dim=0) + if self.decoder_lagged_input: + max_lag_seq_length = max(self.decoder.lagged_value) + 1 + else: + max_lag_seq_length = 1 + self.window_size + repeated_past_target = past_targets[:, -max_lag_seq_length:].repeat_interleave(repeats=self.num_samples, + dim=0).squeeze(1) + repeated_predicted_target = repeated_past_target[:, [-1]] + repeated_past_target = repeated_past_target[:, :-1, ] + + repeated_static_feat = static_features.repeat_interleave( + repeats=self.num_samples, dim=0 + ).unsqueeze(dim=1) if static_features is not None else None + + repeated_time_feat = future_features.repeat_interleave( + repeats=self.num_samples, dim=0 + ) if future_features is not None else None + + for idx_pred in range(self.n_prediction_steps): + if self.decoder_lagged_input: + x_future = torch.cat([repeated_past_target, repeated_predicted_target.cpu()], dim=1) + if self.decoder_has_hidden_states: + x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder.lagged_value) + else: + x_future = get_lagged_subsequences_inference(x_future, idx_pred + 1, + self.decoder.lagged_value) + else: + if self.decoder_has_hidden_states: + x_future = repeated_predicted_target[:, [-1]] + else: + x_future = repeated_predicted_target + + if self.decoder_has_hidden_states: + x_future = x_future if repeated_time_feat is None else torch.cat( + [repeated_time_feat[:, [idx_pred], :], x_future], dim=-1) + else: + # decoder uses the entire future targets + x_future = x_future if repeated_time_feat is None else torch.cat( + [repeated_time_feat[:, :idx_pred + 1, :], x_future], dim=-1) + + x_future = x_future.to(self.device) + if self.decoder_has_hidden_states: + x_future, repeated_state = self.decoder(x_future, features_latent=repeated_state) + else: + x_future = self.decoder(x_future, repeated_state) + net_output = self.head(x_future[:, -1:, ]) + samples = self.pred_from_net_output(net_output).cpu() + repeated_predicted_target = torch.cat([repeated_predicted_target, + 
samples], + dim=1) + all_samples.append(samples) + + all_predictions = torch.cat(all_samples, dim=1).unflatten(0, (batch_size, self.num_samples)) + + if self.aggregation == 'mean': + return self.rescale_output(torch.mean(all_predictions, dim=1), loc, scale) + elif self.aggregation == 'median': + return self.rescale_output(torch.median(all_predictions, dim=1)[0], loc, scale) + else: + raise ValueError(f'Unknown aggregation: {self.aggregation}') + + def predict(self, + past_targets: torch.Tensor, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None + ): + net_output = self(past_targets, past_features, future_features) + if self.output_type != 'distribution': + return self.pred_from_net_output(net_output) + else: + return net_output + + +class ForecastingDeepARNet(ForecastingNet): + future_target_required = True + + def __init__(self, + **kwargs): + """ + Forecasting network with DeepAR structure. + + This structure is activate when the decoder is not recurrent (MLP) and its hyperparameter "auto_regressive" is + set as True. We train the network to let it do a one-step prediction. This structure is compatible with any + sorts of encoder (except MLP). + """ + super(ForecastingDeepARNet, self).__init__(**kwargs) + # this determines the training targets + self.encoder_bijective_seq_output = kwargs['network_encoder']['block_1'].encoder_properties.bijective_seq_output + + self.cached_lag_mask_encoder_test = None + self.only_generate_future_dist = False + + def train(self, mode: bool = True) -> nn.Module: + self.only_generate_future_dist = False + return super().train(mode=mode) + + def forward(self, + past_targets: torch.Tensor, + future_targets: Optional[torch.Tensor] = None, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None, + hidden_states: Optional[Tuple[torch.Tensor]] = None): + if self.training: + if self.encoder_lagged_input: + past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( + past_targets[:, -self.window_size:]) + past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) + future_targets = self.scale_value(future_targets, loc, scale) + + targets_all = torch.cat([past_targets, future_targets[:, :-1]], dim=1) + seq_length = self.window_size + self.n_prediction_steps + targets_all, self.cached_lag_mask_encoder = get_lagged_subsequences(targets_all, + seq_length - 1, + self.encoder.lagged_value, + self.cached_lag_mask_encoder) + else: + if self.window_size < past_targets.shape[1]: + past_targets = past_targets[:, -self.window_size:] + past_targets, _, loc, scale = self.target_scaler(past_targets) + future_targets = self.scale_value(future_targets, loc, scale) + targets_all = torch.cat([past_targets, future_targets[:, :-1]], dim=1) + + x_input = targets_all + if past_features is not None: + features_all = torch.cat([past_features[:, 1:], future_features], dim=1) + x_input = torch.cat([features_all, x_input], dim=-1) + x_input = x_input.to(self.device) + + x_input = self.embedding(x_input) + + if self.encoder_has_hidden_states: + x_input, _ = self.encoder(x_input, output_seq=True) + else: + x_input = self.encoder(x_input, output_seq=True) + if self.only_generate_future_dist: + x_input = x_input[:, -self.n_prediction_steps:] + net_output = self.head(self.decoder(x_input)) + return self.rescale_output(net_output, loc, scale, self.device) + 
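        # Inference branch: DeepAR-style ancestral sampling. The encoder state is repeated
        # `num_samples` times, the network is unrolled one prediction step at a time, a sample
        # is drawn from the predicted distribution at each step and fed back as the next input,
        # and the sampled trajectories are finally aggregated (mean or median) into the forecast.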
else: + if self.encoder_lagged_input: + past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( + past_targets[:, -self.window_size:]) + past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) + x_past, self.cached_lag_mask_encoder_test = get_lagged_subsequences(past_targets, + self.window_size, + self.encoder.lagged_value, + self.cached_lag_mask_encoder_test) + else: + if self.window_size < past_targets.shape[1]: + past_targets = past_targets[:, -self.window_size:] + + past_targets, _, loc, scale = self.target_scaler(past_targets) + x_past = past_targets + if past_features is not None: + # features is one step ahead of target + if self.window_size > 1: + features_all = torch.cat([past_features[:, -self.window_size + 1:, ], + future_features], + dim=1) + else: + features_all = future_features + else: + features_all = None + x_past = x_past if features_all is None else torch.cat([features_all[:, :self.window_size], x_past], + dim=-1) + + x_past = x_past.to(self.device) + # TODO consider static features + x_past = self.embedding(x_past) + + all_samples = [] + batch_size = past_targets.shape[0] + + if self.encoder_has_hidden_states: + # For RNN, we only feed the hidden state and generated future input to the netwrok + encoder_output, hidden_states = self.encoder(x_past) + if isinstance(hidden_states, tuple): + repeated_state = [ + s.repeat_interleave(repeats=self.num_samples, dim=1) + for s in hidden_states + ] + else: + repeated_state = hidden_states.repeat_interleave(repeats=self.num_samples, dim=1) + else: + # For other models, the full past targets are passed to the network. + encoder_output = self.encoder(x_past) + if self.encoder_lagged_input: + max_lag_seq_length = max(max(self.encoder.lagged_value), self.window_size) + else: + max_lag_seq_length = self.window_size + # TODO considering padding targets here instead of inside get_lagged function + repeated_past_target = past_targets[:, -max_lag_seq_length:, ].repeat_interleave( + repeats=self.num_samples, + dim=0).squeeze(1) + + repeated_static_feat = static_features.repeat_interleave( + repeats=self.num_samples, dim=0 + ).unsqueeze(dim=1) if static_features is not None else None + + if features_all is not None: + if not self.encoder_has_hidden_states: + # both feature_past and feature_future must exist or not, otherwise deepAR is disabled due to + # data properties!!! 
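                    # Non-recurrent encoders re-consume the whole (past + future) feature sequence
                    # at every sampling step, so past and future features are kept together here;
                    # the recurrent branch below only forwards the not-yet-observed future features.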
+ time_feature = features_all + else: + time_feature = future_features[:, 1:] if self.n_prediction_steps > 1 else None + else: + time_feature = None + + repeated_time_feat = time_feature.repeat_interleave( + repeats=self.num_samples, dim=0 + ) if future_features is not None else None + + net_output = self.head(self.decoder(encoder_output)) + + next_sample = net_output.sample(sample_shape=(self.num_samples,)) + + next_sample = next_sample.transpose(0, 1).reshape( + (next_sample.shape[0] * next_sample.shape[1], 1, -1) + ).cpu() + + all_samples.append(next_sample) + + for k in range(1, self.n_prediction_steps): + if self.encoder_has_hidden_states: + if self.encoder_lagged_input: + x_next = torch.cat([repeated_past_target, *all_samples], dim=1) + x_next = get_lagged_subsequences_inference(x_next, 1, self.encoder.lagged_value) + else: + x_next = next_sample + x_next = x_next if repeated_time_feat is None else torch.cat([repeated_time_feat[:, k - 1:k], + x_next], dim=-1) + x_next = x_next.to(self.device) + encoder_output, repeated_state = self.encoder(x_next, hx=repeated_state) + else: + if self.encoder_lagged_input: + x_next = torch.cat([repeated_past_target, *all_samples], dim=1) + x_next = get_lagged_subsequences_inference(x_next, + self.window_size + k, + self.encoder.lagged_value) + else: + x_next = torch.cat([repeated_past_target[:, -self.window_size:], *all_samples], dim=1) + if repeated_time_feat is None: + x_next = x_next + else: + x_next = torch.cat([repeated_time_feat[:, :self.window_size + k], + x_next], dim=-1) + x_next = x_next.to(self.device) + encoder_output = self.encoder(x_next) + + # During training, the encoder output a sequence. Thus for prediction, the network should have the same + # output format + encoder_output = torch.unsqueeze(encoder_output, 1) + + net_output = self.head(self.decoder(encoder_output)) + + next_sample = net_output.sample().cpu() + all_samples.append(next_sample) + + all_predictions = torch.cat(all_samples, dim=1).unflatten(0, (batch_size, self.num_samples)) + + if not self.output_type == 'distribution' and self.forecast_strategy == 'sample': + raise ValueError( + f"A DeepAR network must have output type as Distribution and forecast_strategy as sample," + f"but this network has {self.output_type} and {self.forecast_strategy}") + if self.aggregation == 'mean': + return self.rescale_output(torch.mean(all_predictions, dim=1), loc, scale) + elif self.aggregation == 'median': + return self.rescale_output(torch.median(all_predictions, dim=1)[0], loc, scale) + else: + raise ValueError(f'Unknown aggregation: {self.aggregation}') + + def predict(self, + past_targets: torch.Tensor, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None + ): + net_output = self(past_targets, past_features, future_features) + return net_output + + +class NBEATSNet(ForecastingNet): + future_target_required = False + + def forward(self, + past_targets: torch.Tensor, + future_targets: Optional[torch.Tensor] = None, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None, + hidden_states: Optional[Tuple[torch.Tensor]] = None): + if self.window_size < past_targets.shape[1]: + past_targets = past_targets[:, -self.window_size:] + past_targets, _, loc, scale = self.target_scaler(past_targets) + past_targets = past_targets.to(self.device) + + batch_size = past_targets.shape[0] + output_shape = 
past_targets.shape[2:] + forcast_shape = [batch_size, self.n_prediction_steps, *output_shape] + + forecast = torch.zeros(forcast_shape).to(self.device).flatten(1) + backcast = self.encoder(past_targets) + for block in self.decoder: + backcast_block, forecast_block = block(backcast) + + backcast = backcast - backcast_block + forecast = forecast + forecast_block + backcast = backcast.reshape(past_targets.shape) + forecast = forecast.reshape(forcast_shape) + + forecast = self.rescale_output(forecast, loc, scale, self.device) + if self.training: + backcast = self.rescale_output(backcast, loc, scale, self.device) + return backcast, forecast + else: + return forecast + + def pred_from_net_output(self, net_output: torch.Tensor): + return net_output \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index d88240dd9..f29f880a7 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -38,785 +38,12 @@ from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent - - -class TransformedDistribution_(TransformedDistribution): - """ - We implement the mean function such that we do not need to enquire base mean every time - """ - - @property - def mean(self): - mean = self.base_dist.mean - for transform in self.transforms: - mean = transform(mean) - return mean - - -def get_lagged_subsequences( - sequence: torch.Tensor, - subsequences_length: int, - lags_seq: Optional[List[int]] = None, - mask: Optional[torch.Tensor] = None -) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - """ - Returns lagged subsequences of a given sequence, this allows the model to receive the input from the past targets - outside the sliding windows. This implementation is similar to gluonTS's implementation - the only difference is that we pad the sequence that is not long enough - - Parameters - ---------- - sequence : Tensor - the sequence from which lagged subsequences should be extracted. - Shape: (N, T, C). - subsequences_length : int - length of the subsequences to be extracted. - lags_seq: Optional[List[int]] - lags of the sequence, indicating the sequence that needs to be extracted - lag_mask: Optional[torch.Tensor] - a mask tensor indicating - - Returns - -------- - lagged : Tensor - a tensor of shape (N, S, I * C), where S = subsequences_length and - I = len(indices), containing lagged subsequences. 
- """ - batch_size = sequence.shape[0] - num_features = sequence.shape[2] - if mask is None: - if lags_seq is None: - warnings.warn('Neither lag_mask or lags_seq is given, we simply return the input value') - return sequence, None - # generate mask - num_lags = len(lags_seq) - - # build a mask - mask_length = max(lags_seq) + subsequences_length - mask = torch.zeros((num_lags, mask_length), dtype=torch.bool) - for i, lag_index in enumerate(lags_seq): - begin_index = -lag_index - subsequences_length - end_index = -lag_index if lag_index > 0 else None - mask[i, begin_index: end_index] = True - else: - num_lags = mask.shape[0] - mask_length = mask.shape[1] - - mask_extend = mask.clone() - - if mask_length > sequence.shape[1]: - sequence = torch.cat([sequence.new_zeros([batch_size, mask_length - sequence.shape[1], num_features]), - sequence], dim=1) - elif mask_length < sequence.shape[1]: - mask_extend = torch.cat([mask.new_zeros([num_lags, sequence.shape[1] - mask_length]), mask_extend], dim=1) - # (N, 1, T, C) - sequence = sequence.unsqueeze(1) - - # (I, T, 1) - mask_extend = mask_extend.unsqueeze(-1) - - # (N, I, S, C) - lagged_seq = torch.masked_select(sequence, mask_extend).reshape(batch_size, num_lags, subsequences_length, -1) - - lagged_seq = torch.transpose(lagged_seq, 1, 2).reshape(batch_size, subsequences_length, -1) - - return lagged_seq, mask - - -def get_lagged_subsequences_inference( - sequence: torch.Tensor, - subsequences_length: int, - lags_seq: Optional[List[int]] = None, ): - """ - this function works exactly the same as get_lagged_subsequences. However, this implementation is faster when no - cached value is available, thus it more suitable during inference times. - - designed for doing inference for DeepAR, the core idea is to use - """ - sequence_length = sequence.shape[1] - batch_size = sequence.shape[0] - lagged_values = [] - for lag_index in lags_seq: - begin_index = -lag_index - subsequences_length - end_index = -lag_index if lag_index > 0 else None - if end_index is not None and end_index < -sequence_length: - lagged_values.append(torch.zeros([batch_size, subsequences_length, *sequence.shape[2:]])) - continue - if begin_index < -sequence_length: - if end_index is not None: - pad_shape = [batch_size, subsequences_length - sequence_length - end_index, *sequence.shape[2:]] - lagged_values.append(torch.cat([torch.zeros(pad_shape), sequence[:, :end_index, ...]], dim=1)) - else: - pad_shape = [batch_size, subsequences_length - sequence_length, *sequence.shape[2:]] - lagged_values.append(torch.cat([torch.zeros(pad_shape), sequence], dim=1)) - continue - else: - lagged_values.append(sequence[:, begin_index:end_index, ...]) - - lagged_seq = torch.stack(lagged_values, -1).transpose(-1, -2).reshape(batch_size, subsequences_length, -1) - return lagged_seq - - -class ForecastingNet(nn.Module): - future_target_required = False - - def __init__(self, - network_structure: NetworkStructure, - network_embedding: nn.Module, # TODO consider embedding for past, future and static features - network_encoder: Dict[str, EncoderBlockInfo], - network_decoder: Dict[str, DecoderBlockInfo], - network_head: Optional[nn.Module], - window_size: int, - target_scaler: BaseTargetScaler, - dataset_properties: Dict, - output_type: str = 'regression', - forecast_strategy: Optional[str] = 'mean', - num_samples: Optional[int] = 100, - aggregation: Optional[str] = 'mean' - ): - """ - This is a basic forecasting network. 
It is only composed of a embedding net, an encoder and a head (including - MLP decoder and the final head). - - This structure is active when the decoder is a MLP with auto_regressive set as false - - Args: - network_embedding (nn.Module): network embedding - network_encoder (EncoderNetwork): Encoder network, could be selected to return a sequence or a - network_decoder (nn.Module): network decoder - network_head (nn.Module): network head, maps the output of decoder to the final output - dataset_properties (Dict): dataset properties - encoder_properties (Dict): encoder properties - decoder_properties: (Dict): decoder properties - output_type (str): the form that the network outputs. It could be regression, distribution and - (TODO) quantile - forecast_strategy (str): only valid if output_type is distribution or quantile, how the network transforms - its output to predicted values, could be mean or sample - num_samples (int): only valid if output_type is not regression and forecast_strategy is sample. this - indicates the number of the points to sample when doing prediction - aggregation (str): how the samples are aggregated. We could take their mean or median values. - """ - super(ForecastingNet, self).__init__() - self.network_structure = network_structure - self.embedding = network_embedding - self.head = network_head - - encoders = OrderedDict() - decoders = OrderedDict() - - first_decoder = 0 - for i in range(1, network_structure.num_blocks + 1): - block_number = f'block_{i}' - encoders[block_number] = network_encoder[block_number].encoder - if block_number in decoders: - if first_decoder == 0: - first_decoder = block_number - decoders[block_number] = network_decoder[block_number].decoder - - if first_decoder == 0: - raise ValueError("At least one decoder must be specified!") - self.encoder = nn.ModuleDict(encoders) - self.decoder = nn.ModuleDict(decoders) - - self.target_scaler = target_scaler - - self.n_prediction_steps = dataset_properties['n_prediction_steps'] # type: int - self.window_size = window_size - - self.output_type = output_type - self.forecast_strategy = forecast_strategy - self.num_samples = num_samples - self.aggregation = aggregation - - # self.mask_futur_features = decoder_properties['mask_future_features'] - self._device = torch.device('cpu') - - if not network_structure.variable_selection: - self.encoder_lagged_input = network_encoder['block_1'].encoder_properties.lagged_input - self.decoder_lagged_input = network_decoder[f'block_{first_decoder}'].decoder_properties.lagged_input - else: - self.encoder_lagged_input = False - self.decoder_lagged_input = False - - if self.encoder_lagged_input: - self.cached_lag_mask_encoder = None - if self.decoder_lagged_input: - self.cached_lag_mask_decoder = None - - @property - def device(self): - return self._device - - @device.setter - def device(self, device: torch.device): - self.to(device) - self._device = device - - def rescale_output(self, - outputs: Union[torch.distributions.Distribution, torch.Tensor], - loc: Optional[torch.Tensor], - scale: Optional[torch.Tensor], - device: torch.device = torch.device('cpu')): - if loc is not None or scale is not None: - if isinstance(outputs, torch.distributions.Distribution): - transform = AffineTransform(loc=0.0 if loc is None else loc.to(device), - scale=1.0 if scale is None else scale.to(device), - ) - outputs = TransformedDistribution_(outputs, [transform]) - else: - if loc is None: - outputs = outputs * scale.to(device) - elif scale is None: - outputs = outputs + loc.to(device) - 
else: - outputs = outputs * scale.to(device) + loc.to(device) - return outputs - - def scale_value(self, - outputs: Union[torch.distributions.Distribution, torch.Tensor], - loc: Optional[torch.Tensor], - scale: Optional[torch.Tensor], - device: torch.device = torch.device('cpu')): - if loc is not None or scale is not None: - if loc is None: - outputs = outputs / scale.to(device) - elif scale is None: - outputs = outputs - loc.to(device) - else: - outputs = (outputs - loc.to(device)) / scale.to(device) - return outputs - - def forward(self, - past_targets: torch.Tensor, - future_targets: Optional[torch.Tensor] = None, - past_features: Optional[torch.Tensor] = None, - future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None, - hidden_states: Optional[Tuple[torch.Tensor]] = None): - if self.encoder_lagged_input: - past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler(past_targets[:, -self.window_size:]) - past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) - x_past, self.cached_lag_mask_encoder = get_lagged_subsequences(past_targets, - self.window_size, - self.encoder.lagged_value, - self.cached_lag_mask_encoder) - else: - if self.window_size < past_targets.shape[1]: - past_targets = past_targets[:, -self.window_size:] - past_targets, _, loc, scale = self.target_scaler(past_targets) - x_past = past_targets - - if past_features is not None: - x_past = torch.cat([past_features, x_past], dim=1) - - x_past = x_past.to(device=self.device) - x_past = self.embedding(x_past) - - if self.encoder_has_hidden_states: - x_past, _ = self.encoder(x_past) - else: - x_past = self.encoder(x_past) - x_past = self.decoder(x_past) - output = self.head(x_past) - return self.rescale_output(output, loc, scale, self.device) - - def pred_from_net_output(self, net_output): - if self.output_type == 'regression': - return net_output - elif self.output_type == 'distribution': - if self.forecast_strategy == 'mean': - if isinstance(net_output, list): - return torch.cat([dist.mean for dist in net_output], dim=-2) - else: - return net_output.mean - elif self.forecast_strategy == 'sample': - if isinstance(net_output, list): - samples = torch.cat([dist.sample((self.num_samples,)) for dist in net_output], dim=-2) - else: - samples = net_output.sample((self.num_samples,)) - if self.aggregation == 'mean': - return torch.mean(samples, dim=0) - elif self.aggregation == 'median': - return torch.median(samples, 0)[0] - else: - raise ValueError(f'Unknown aggregation: {self.aggregation}') - else: - raise ValueError(f'Unknown forecast_strategy: {self.forecast_strategy}') - else: - raise ValueError(f'Unknown output_type: {self.output_type}') - - def predict(self, - past_targets: torch.Tensor, - past_features: Optional[torch.Tensor] = None, - future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None - ): - net_output = self(past_targets, past_features) - return self.pred_from_net_output(net_output) - - -class ForecastingSeq2SeqNet(ForecastingNet): - future_target_required = True - """ - Forecasting network with Seq2Seq structure, Encoder/ Decoder need to be the same recurrent models while - - This structure is activate when the decoder is recurrent (RNN or transformer). - We train the network with teacher forcing, thus - future_targets is required for the network. 
To train the network, past targets and past features are fed to the - encoder to obtain the hidden states whereas future targets and future features. - When the output type is distribution and forecast_strategy is sampling, this model is equivalent to a deepAR model - during inference. - """ - - def __init__(self, **kwargs): - super(ForecastingSeq2SeqNet, self).__init__(**kwargs) - self.mask_on_future_target = kwargs['decoder_properties']['mask_on_future_target'] - if self.mask_on_future_target: - self.tgt_mask = nn.Transformer.generate_square_subsequent_mask(self.n_prediction_steps) - - def forward(self, - past_targets: torch.Tensor, - future_targets: Optional[torch.Tensor] = None, - past_features: Optional[torch.Tensor] = None, - future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None, - hidden_states: Optional[Tuple[torch.Tensor]] = None): - if self.encoder_lagged_input: - past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler(past_targets[:, -self.window_size:]) - past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) - x_past, self.cached_lag_mask_encoder = get_lagged_subsequences(past_targets, - self.window_size, - self.encoder.lagged_value, - self.cached_lag_mask_encoder) - else: - if self.window_size < past_targets.shape[1]: - past_targets = past_targets[:, -self.window_size:] - past_targets, _, loc, scale = self.target_scaler(past_targets) - x_past = past_targets - - x_past = x_past if past_features is None else torch.cat([past_features, x_past], dim=-1) - - x_past = x_past.to(self.device) - x_past = self.embedding(x_past) - - if self.training: - # we do one step ahead forecasting - if self.decoder_lagged_input: - future_targets = torch.cat([past_targets, future_targets[:, :-1, :]], dim=1) - future_targets, self.cached_lag_mask_decoder = get_lagged_subsequences(future_targets, - self.n_prediction_steps, - self.decoder.lagged_value, - self.cached_lag_mask_decoder) - else: - future_targets = torch.cat([past_targets[:, [-1], :], future_targets[:, :-1, :]], dim=1) - - x_future = future_targets if future_features is None else torch.cat([future_features, future_targets], - dim=-1) - x_future = x_future.to(self.device) - - if self.encoder_has_hidden_states: - # RNN - _, features_latent = self.encoder(x_past, output_seq=True) - x_future, _ = self.decoder(x_future, features_latent) - elif self.mask_on_future_target: - features_latent = self.encoder(x_past, output_seq=True) - x_future = self.decoder(x_future, features_latent, tgt_mask=self.tgt_mask.to(self.device)) - else: - raise NotImplementedError - net_output = self.head(x_future) - - return self.rescale_output(net_output, loc, scale, self.device) - else: - if self.encoder_has_hidden_states: - _, features_latent = self.encoder(x_past, output_seq=True) - else: - features_latent = self.encoder(x_past, output_seq=True) - - if future_features is not None: - future_features = future_features - - if self.forecast_strategy != 'sample': - all_predictions = [] - predicted_target = past_targets[:, [-1]] - past_targets = past_targets[:, :-1] - for idx_pred in range(self.n_prediction_steps): - if self.decoder_lagged_input: - x_future = torch.cat([past_targets, predicted_target.cpu()], dim=1) - if self.decoder_has_hidden_states: - x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder.lagged_value) - else: - x_future = get_lagged_subsequences_inference(x_future, idx_pred + 1, - self.decoder.lagged_value) - else: - if 
self.decoder_has_hidden_states: - x_future = predicted_target[:, [-1]] - else: - x_future = predicted_target - - if self.decoder_has_hidden_states: - x_future = x_future if future_features is None else torch.cat( - [future_features[:, [idx_pred], :], x_future], dim=-1) - else: - x_future = x_future if future_features is None else torch.cat( - [future_features[:, idx_pred + 1, :], x_future], dim=-1) - - x_future = x_future.to(self.device) - if self.decoder_has_hidden_states: - x_future, features_latent = self.decoder(x_future, features_latent=features_latent) - else: - x_future = self.decoder(x_future, features_latent) - - net_output = self.head(x_future[:, -1:, ]) - predicted_target = torch.cat([predicted_target, self.pred_from_net_output(net_output).cpu()], - dim=1) - - all_predictions.append(net_output) - - if self.output_type != 'distribution': - all_predictions = torch.cat(all_predictions, dim=1) - else: - all_predictions = self.pred_from_net_output(all_predictions) - - return self.rescale_output(all_predictions, loc, scale, self.device) - - else: - # we follow the DeepAR implementation: - all_samples = [] - batch_size = past_targets.shape[0] - - if self.encoder_has_hidden_states: - - if isinstance(features_latent, tuple): - repeated_state = [ - s.repeat_interleave(repeats=self.num_samples, dim=1) - for s in features_latent - ] - else: - repeated_state = features_latent.repeat_interleave(repeats=self.num_samples, dim=1) - else: - # Transformer's hidden states is of shape - repeated_state = features_latent.repeat_interleave(repeats=self.num_samples, dim=0) - if self.decoder_lagged_input: - max_lag_seq_length = max(self.decoder.lagged_value) + 1 - else: - max_lag_seq_length = 1 + self.window_size - repeated_past_target = past_targets[:, -max_lag_seq_length:].repeat_interleave(repeats=self.num_samples, - dim=0).squeeze(1) - repeated_predicted_target = repeated_past_target[:, [-1]] - repeated_past_target = repeated_past_target[:, :-1, ] - - repeated_static_feat = static_features.repeat_interleave( - repeats=self.num_samples, dim=0 - ).unsqueeze(dim=1) if static_features is not None else None - - repeated_time_feat = future_features.repeat_interleave( - repeats=self.num_samples, dim=0 - ) if future_features is not None else None - - for idx_pred in range(self.n_prediction_steps): - if self.decoder_lagged_input: - x_future = torch.cat([repeated_past_target, repeated_predicted_target.cpu()], dim=1) - if self.decoder_has_hidden_states: - x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder.lagged_value) - else: - x_future = get_lagged_subsequences_inference(x_future, idx_pred + 1, - self.decoder.lagged_value) - else: - if self.decoder_has_hidden_states: - x_future = repeated_predicted_target[:, [-1]] - else: - x_future = repeated_predicted_target - - if self.decoder_has_hidden_states: - x_future = x_future if repeated_time_feat is None else torch.cat( - [repeated_time_feat[:, [idx_pred], :], x_future], dim=-1) - else: - # decoder uses the entire future targets - x_future = x_future if repeated_time_feat is None else torch.cat( - [repeated_time_feat[:, :idx_pred + 1, :], x_future], dim=-1) - - x_future = x_future.to(self.device) - if self.decoder_has_hidden_states: - x_future, repeated_state = self.decoder(x_future, features_latent=repeated_state) - else: - x_future = self.decoder(x_future, repeated_state) - net_output = self.head(x_future[:, -1:, ]) - samples = self.pred_from_net_output(net_output).cpu() - repeated_predicted_target = torch.cat([repeated_predicted_target, - 
samples], - dim=1) - all_samples.append(samples) - - all_predictions = torch.cat(all_samples, dim=1).unflatten(0, (batch_size, self.num_samples)) - - if self.aggregation == 'mean': - return self.rescale_output(torch.mean(all_predictions, dim=1), loc, scale) - elif self.aggregation == 'median': - return self.rescale_output(torch.median(all_predictions, dim=1)[0], loc, scale) - else: - raise ValueError(f'Unknown aggregation: {self.aggregation}') - - def predict(self, - past_targets: torch.Tensor, - past_features: Optional[torch.Tensor] = None, - future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None - ): - net_output = self(past_targets, past_features, future_features) - if self.output_type != 'distribution': - return self.pred_from_net_output(net_output) - else: - return net_output - - -class ForecastingDeepARNet(ForecastingNet): - future_target_required = True - - def __init__(self, - **kwargs): - """ - Forecasting network with DeepAR structure. - - This structure is activate when the decoder is not recurrent (MLP) and its hyperparameter "auto_regressive" is - set as True. We train the network to let it do a one-step prediction. This structure is compatible with any - sorts of encoder (except MLP). - """ - super(ForecastingDeepARNet, self).__init__(**kwargs) - # this determines the training targets - self.encoder_bijective_seq_output = kwargs['network_encoder']['block_1'].encoder_properties.bijective_seq_output - - self.cached_lag_mask_encoder_test = None - self.only_generate_future_dist = False - - def train(self, mode: bool = True) -> nn.Module: - self.only_generate_future_dist = False - return super().train(mode=mode) - - def forward(self, - past_targets: torch.Tensor, - future_targets: Optional[torch.Tensor] = None, - past_features: Optional[torch.Tensor] = None, - future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None, - hidden_states: Optional[Tuple[torch.Tensor]] = None): - if self.training: - if self.encoder_lagged_input: - past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( - past_targets[:, -self.window_size:]) - past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) - future_targets = self.scale_value(future_targets, loc, scale) - - targets_all = torch.cat([past_targets, future_targets[:, :-1]], dim=1) - seq_length = self.window_size + self.n_prediction_steps - targets_all, self.cached_lag_mask_encoder = get_lagged_subsequences(targets_all, - seq_length - 1, - self.encoder.lagged_value, - self.cached_lag_mask_encoder) - else: - if self.window_size < past_targets.shape[1]: - past_targets = past_targets[:, -self.window_size:] - past_targets, _, loc, scale = self.target_scaler(past_targets) - future_targets = self.scale_value(future_targets, loc, scale) - targets_all = torch.cat([past_targets, future_targets[:, :-1]], dim=1) - - x_input = targets_all - if past_features is not None: - features_all = torch.cat([past_features[:, 1:], future_features], dim=1) - x_input = torch.cat([features_all, x_input], dim=-1) - x_input = x_input.to(self.device) - - x_input = self.embedding(x_input) - - if self.encoder_has_hidden_states: - x_input, _ = self.encoder(x_input, output_seq=True) - else: - x_input = self.encoder(x_input, output_seq=True) - if self.only_generate_future_dist: - x_input = x_input[:, -self.n_prediction_steps:] - net_output = self.head(self.decoder(x_input)) - return self.rescale_output(net_output, loc, scale, self.device) - 
else: - if self.encoder_lagged_input: - past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( - past_targets[:, -self.window_size:]) - past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) - x_past, self.cached_lag_mask_encoder_test = get_lagged_subsequences(past_targets, - self.window_size, - self.encoder.lagged_value, - self.cached_lag_mask_encoder_test) - else: - if self.window_size < past_targets.shape[1]: - past_targets = past_targets[:, -self.window_size:] - - past_targets, _, loc, scale = self.target_scaler(past_targets) - x_past = past_targets - if past_features is not None: - # features is one step ahead of target - if self.window_size > 1: - features_all = torch.cat([past_features[:, -self.window_size + 1:, ], - future_features], - dim=1) - else: - features_all = future_features - else: - features_all = None - x_past = x_past if features_all is None else torch.cat([features_all[:, :self.window_size], x_past], - dim=-1) - - x_past = x_past.to(self.device) - # TODO consider static features - x_past = self.embedding(x_past) - - all_samples = [] - batch_size = past_targets.shape[0] - - if self.encoder_has_hidden_states: - # For RNN, we only feed the hidden state and generated future input to the netwrok - encoder_output, hidden_states = self.encoder(x_past) - if isinstance(hidden_states, tuple): - repeated_state = [ - s.repeat_interleave(repeats=self.num_samples, dim=1) - for s in hidden_states - ] - else: - repeated_state = hidden_states.repeat_interleave(repeats=self.num_samples, dim=1) - else: - # For other models, the full past targets are passed to the network. - encoder_output = self.encoder(x_past) - if self.encoder_lagged_input: - max_lag_seq_length = max(max(self.encoder.lagged_value), self.window_size) - else: - max_lag_seq_length = self.window_size - # TODO considering padding targets here instead of inside get_lagged function - repeated_past_target = past_targets[:, -max_lag_seq_length:, ].repeat_interleave( - repeats=self.num_samples, - dim=0).squeeze(1) - - repeated_static_feat = static_features.repeat_interleave( - repeats=self.num_samples, dim=0 - ).unsqueeze(dim=1) if static_features is not None else None - - if features_all is not None: - if not self.encoder_has_hidden_states: - # both feature_past and feature_future must exist or not, otherwise deepAR is disabled due to - # data properties!!! 
- time_feature = features_all - else: - time_feature = future_features[:, 1:] if self.n_prediction_steps > 1 else None - else: - time_feature = None - - repeated_time_feat = time_feature.repeat_interleave( - repeats=self.num_samples, dim=0 - ) if future_features is not None else None - - net_output = self.head(self.decoder(encoder_output)) - - next_sample = net_output.sample(sample_shape=(self.num_samples,)) - - next_sample = next_sample.transpose(0, 1).reshape( - (next_sample.shape[0] * next_sample.shape[1], 1, -1) - ).cpu() - - all_samples.append(next_sample) - - for k in range(1, self.n_prediction_steps): - if self.encoder_has_hidden_states: - if self.encoder_lagged_input: - x_next = torch.cat([repeated_past_target, *all_samples], dim=1) - x_next = get_lagged_subsequences_inference(x_next, 1, self.encoder.lagged_value) - else: - x_next = next_sample - x_next = x_next if repeated_time_feat is None else torch.cat([repeated_time_feat[:, k - 1:k], - x_next], dim=-1) - x_next = x_next.to(self.device) - encoder_output, repeated_state = self.encoder(x_next, hx=repeated_state) - else: - if self.encoder_lagged_input: - x_next = torch.cat([repeated_past_target, *all_samples], dim=1) - x_next = get_lagged_subsequences_inference(x_next, - self.window_size + k, - self.encoder.lagged_value) - else: - x_next = torch.cat([repeated_past_target[:, -self.window_size:], *all_samples], dim=1) - if repeated_time_feat is None: - x_next = x_next - else: - x_next = torch.cat([repeated_time_feat[:, :self.window_size + k], - x_next], dim=-1) - x_next = x_next.to(self.device) - encoder_output = self.encoder(x_next) - - # During training, the encoder output a sequence. Thus for prediction, the network should have the same - # output format - encoder_output = torch.unsqueeze(encoder_output, 1) - - net_output = self.head(self.decoder(encoder_output)) - - next_sample = net_output.sample().cpu() - all_samples.append(next_sample) - - all_predictions = torch.cat(all_samples, dim=1).unflatten(0, (batch_size, self.num_samples)) - - if not self.output_type == 'distribution' and self.forecast_strategy == 'sample': - raise ValueError( - f"A DeepAR network must have output type as Distribution and forecast_strategy as sample," - f"but this network has {self.output_type} and {self.forecast_strategy}") - if self.aggregation == 'mean': - return self.rescale_output(torch.mean(all_predictions, dim=1), loc, scale) - elif self.aggregation == 'median': - return self.rescale_output(torch.median(all_predictions, dim=1)[0], loc, scale) - else: - raise ValueError(f'Unknown aggregation: {self.aggregation}') - - def predict(self, - past_targets: torch.Tensor, - past_features: Optional[torch.Tensor] = None, - future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None - ): - net_output = self(past_targets, past_features, future_features) - return net_output - - -class NBEATSNet(ForecastingNet): - future_target_required = False - - def forward(self, - past_targets: torch.Tensor, - future_targets: Optional[torch.Tensor] = None, - past_features: Optional[torch.Tensor] = None, - future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None, - hidden_states: Optional[Tuple[torch.Tensor]] = None): - if self.window_size < past_targets.shape[1]: - past_targets = past_targets[:, -self.window_size:] - past_targets, _, loc, scale = self.target_scaler(past_targets) - past_targets = past_targets.to(self.device) - - batch_size = past_targets.shape[0] - output_shape = 
past_targets.shape[2:] - forcast_shape = [batch_size, self.n_prediction_steps, *output_shape] - - forecast = torch.zeros(forcast_shape).to(self.device).flatten(1) - backcast = self.encoder(past_targets) - for block in self.decoder: - backcast_block, forecast_block = block(backcast) - - backcast = backcast - backcast_block - forecast = forecast + forecast_block - backcast = backcast.reshape(past_targets.shape) - forecast = forecast.reshape(forcast_shape) - - forecast = self.rescale_output(forecast, loc, scale, self.device) - if self.training: - backcast = self.rescale_output(backcast, loc, scale, self.device) - return backcast, forecast - else: - return forecast - - def pred_from_net_output(self, net_output: torch.Tensor): - return net_output +from autoPyTorch.pipeline.components.setup.network.forecasting_architecture import ( + ForecastingNet, + ForecastingSeq2SeqNet, + ForecastingDeepARNet, + NBEATSNet, +) class ForecastingNetworkComponent(NetworkComponent): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py index 34f8e8ad3..e470f77f5 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py @@ -4,10 +4,175 @@ from functools import partial import torch.nn.functional as F import math + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone. \ + forecasting_encoder.base_forecasting_encoder import ( + NetworkStructure, + EncoderBlockInfo +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone. \ + forecasting_decoder.base_forecasting_decoder import DecoderBlockInfo + from pytorch_forecasting.models.temporal_fusion_transformer.sub_modules import ( TimeDistributed, TimeDistributedInterpolation, GatedLinearUnit, ResampleNorm, AddNorm, GateAddNorm, - GatedResidualNetwork, VariableSelectionNetwork, InterpretableMultiHeadAttention + GatedResidualNetwork, VariableSelectionNetwork, InterpretableMultiHeadAttention, ) +from pytorch_forecasting.utils import create_mask + + +class AddLayer(nn.Module): + def __init__(self, input_size: int, skip_size: int): + super().__init__() + if input_size == skip_size: + self.fc = nn.Linear(skip_size, input_size) + + def forward(self, input: torch.Tensor, skip: torch.Tensor): + if hasattr(self, 'fc'): + return input + self.fc(skip) + else: + return input + + +class TemporalFusionLayer(nn.Module): + """ + (Lim et al. 
+    Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting,
+    https://arxiv.org/abs/1912.09363)
+    We follow the implementation from pytorch-forecasting:
+    https://github.com/jdb78/pytorch-forecasting/blob/master/pytorch_forecasting/models/temporal_fusion_transformer/__init__.py
+    """
+
+    def __init__(self,
+                 window_size: int,
+                 n_prediction_steps: int,
+                 network_structure: NetworkStructure,
+                 network_encoder: Dict[str, EncoderBlockInfo],
+                 n_decoder_output_features: int,
+                 d_model: int,
+                 n_head: int,
+                 dropout: Optional[float] = None):
+        super().__init__()
+        num_blocks = network_structure.num_blocks
+        last_block = f'block_{num_blocks}'
+        n_encoder_output = network_encoder[last_block].encoder_output_shape_[-1]
+        self.window_size = window_size
+        self.n_prediction_steps = n_prediction_steps
+        self.timesteps = window_size + n_prediction_steps
+
+        if n_decoder_output_features != n_encoder_output:
+            self.decoder_proj_layer = nn.Linear(n_decoder_output_features, n_encoder_output, bias=False)
+        else:
+            self.decoder_proj_layer = None
+        if network_structure.variable_selection:
+            if network_structure.skip_connection:
+                # static feature selector needs to generate the same number of features as the output of the encoder
+                n_encoder_output_first = network_encoder['block_1'].encoder_output_shape_[-1]
+                self.static_context_enrichment = GatedResidualNetwork(
+                    n_encoder_output_first, n_encoder_output_first, n_encoder_output_first, dropout
+                )
+                self.enrichment = GatedResidualNetwork(
+                    input_size=n_encoder_output,
+                    hidden_size=n_encoder_output,
+                    output_size=d_model,
+                    dropout=dropout,
+                    context_size=n_encoder_output_first,
+                    residual=True,
+                )
+                self.enrich_with_static = True
+        if not hasattr(self, 'enrichment'):
+            self.enrichment = GatedResidualNetwork(
+                input_size=n_encoder_output,
+                hidden_size=n_encoder_output,
+                output_size=d_model,
+                dropout=dropout,
+                residual=True,
+            )
+            self.enrich_with_static = False
+
+        self.attention_fusion = InterpretableMultiHeadAttention(
+            d_model=d_model,
+            n_head=n_head,
+            dropout=dropout
+        )
+        self.post_attn_gate_norm = GateAddNorm(d_model, dropout=dropout, trainable_add=False)
+        self.pos_wise_ff = GatedResidualNetwork(input_size=d_model, hidden_size=d_model,
+                                                output_size=d_model, dropout=dropout)
+
+        self.network_structure = network_structure
+        if network_structure.skip_connection:
+            if network_structure.skip_connection_type == 'add':
+                self.residual_connection = AddLayer(d_model, n_encoder_output)
+            elif network_structure.skip_connection_type == 'gate_add_norm':
+                self.residual_connection = GateAddNorm(d_model, skip_size=n_encoder_output,
+                                                       dropout=None, trainable_add=False)
+
+    def forward(self, encoder_output: torch.Tensor, decoder_output: torch.Tensor, encoder_lengths: torch.LongTensor,
+                static_embedding: Optional[torch.Tensor] = None):
+        """
+        Args:
+            encoder_output: the output of the last layer of the encoder network
+            decoder_output: the output of the last layer of the decoder network
+            encoder_lengths: lengths of the sequences fed to the encoder
+            static_embedding: output of the static variable selection network (if applicable)
+        """
+        if self.decoder_proj_layer is not None:
+            decoder_output = self.decoder_proj_layer(decoder_output)
+        network_output = torch.cat([encoder_output, decoder_output], dim=1)
+
+        if self.enrich_with_static:
+            static_context_enrichment = self.static_context_enrichment(static_embedding)
+            attn_input = self.enrichment(
+                network_output, static_context_enrichment[:, None].expand(-1,
self.timesteps, -1) + ) + else: + attn_input = self.enrichment(network_output) + + # Attention + attn_output, attn_output_weights = self.attention_fusion( + q=attn_input[:, self.window_size:], # query only for predictions + k=attn_input, + v=attn_input, + mask=self.get_attention_mask( + encoder_lengths=encoder_lengths, decoder_length=self.n_prediction_steps + ), + ) + # skip connection over attention + attn_output = self.post_attn_gate_norm(attn_output, attn_input[:, self.window_size:]) + output = self.pos_wise_ff(attn_output) + + if self.network_structure.skip_connection: + return self.residual_connection(output, decoder_output) + else: + return output + + def get_attention_mask(self, encoder_lengths: torch.LongTensor, decoder_length: int): + """ + https://github.com/jdb78/pytorch-forecasting/blob/master/pytorch_forecasting/models/ + temporal_fusion_transformer/__init__.py + """ + # indices to which is attended + attend_step = torch.arange(decoder_length, device=self.device) + # indices for which is predicted + predict_step = torch.arange(0, decoder_length, device=self.device)[:, None] + # do not attend to steps to self or after prediction + # todo: there is potential value in attending to future forecasts if they are made with knowledge currently + # available + # one possibility is here to use a second attention layer for future attention (assuming different effects + # matter in the future than the past) + # or alternatively using the same layer but allowing forward attention - i.e. only masking out non-available + # data and self + decoder_mask = attend_step >= predict_step + # do not attend to steps where data is padded + encoder_mask = create_mask(encoder_lengths.max(), encoder_lengths) + # combine masks along attended time - first encoder and then decoder + mask = torch.cat( + ( + encoder_mask.unsqueeze(1).expand(-1, decoder_length, -1), + decoder_mask.unsqueeze(0).expand(encoder_lengths.size(0), -1, -1), + ), + dim=2, + ) + return mask def build_transformer_layers(d_model: int, config: Dict[str, Any], layer_type='encoder'): @@ -28,50 +193,6 @@ def build_transformer_layers(d_model: int, config: Dict[str, Any], layer_type='e raise ValueError('layer_type must be encoder or decoder!') -class TunableAddNorm(AddNorm): - def __init__(self, input_size: int, skip_size: int = None, trainable_add: bool = True, - layer_norm_eps: float = 1e-5): - super(TunableAddNorm, self).__init__(input_size, skip_size, trainable_add) - self.norm = nn.LayerNorm(self.input_size, eps=layer_norm_eps) - - -class TunableGateAddNorm(GateAddNorm): - def __init__(self, input_size: int, hidden_size: int = None, skip_size: int = None, trainable_add: bool = False, - dropout: Optional[float] = None, layer_norm_eps: float = 1e-5): - super().__init__(input_size, hidden_size, skip_size, trainable_add, dropout) - self.add_norm = TunableAddNorm(self.hidden_size, skip_size=self.skip_size, trainable_add=trainable_add, - layer_norm_eps=layer_norm_eps) - - -class TunableGatedResidualNetwork(GatedResidualNetwork): - def __init__(self, input_size: int, hidden_size: int, output_size: int, dropout: float = 0.1, - context_size: int = None, residual: bool = False, layer_norm_eps: float = 1e-5): - super().__init__(input_size, hidden_size, output_size, dropout, context_size, residual) - self.gate_norm = TunableGateAddNorm( - input_size=self.hidden_size, - skip_size=self.output_size, - hidden_size=self.output_size, - dropout=self.dropout, - trainable_add=False, - layer_norm_eps=layer_norm_eps - ) - - -class 
InterpretableMultiAttentionEncoderLayer(nn.Module): - def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropout: float = 0.1, - layer_norm_eps: float = 1e-5, device=None, dtype=None) -> None: - self.multi_attention = InterpretableMultiHeadAttention(n_head=nhead, d_model=d_model, dropout=dropout) - self.post_attn_gate_norm = TunableGateAddNorm(input_size=d_model, - hidden_size=dim_feedforward, - dropout=dropout, - trainable_add=False, - layer_norm_eps=layer_norm_eps - ) - self.pos_wise_ff = TunableGatedResidualNetwork( - self.hparams.hidden_size, self.hparams.hidden_size, self.hparams.hidden_size, dropout=self.hparams.dropout - ) - - # https://github.com/pytorch/examples/blob/master/word_language_model/model.py class PositionalEncoding(nn.Module): r""" diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index d76de1071..7c8d57073 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -76,8 +76,7 @@ def _build_decoder(self, dataset_properties: Dict) -> Tuple[nn.Module, int]: # RNN decoder only allows RNN encoder, these parameters need to exists. hidden_size = self.rnn_kwargs['hidden_size'] - num_layers = 2 * self.rnn_kwargs['num_layers'] if self.rnn_kwargs['bidirectional'] else self.rnn_kwargs[ - 'num_layers'] + num_layers = self.rnn_kwargs['num_layers'] cell_type = self.rnn_kwargs['cell_type'] dropout = self.rnn_kwargs['dropout'] decoder = RNN_Module(in_features=future_variable_input[-1], diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index 42c5f5a9c..7ef597d1d 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -31,7 +31,7 @@ class NetworkStructure(NamedTuple): num_blocks: int = 1 variable_selection: bool = False skip_connection: bool = False - skip_connection_type: str = "add" + skip_connection_type: str = "add" # could be 'add' or 'gate_add_norm' grn_dropout_rate: float = 0.0 @@ -166,8 +166,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: variable_selection = X.get("variable_selection", False) if variable_selection: - # TODO - pass + in_features = self.n_encoder_output_feature() elif self.encoder_properties().lagged_input and hasattr(self, 'lagged_value'): in_features = len(self.lagged_value) * output_shape[-1] + input_shape[-1] + static_features_shape else: @@ -185,13 +184,16 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: has_hidden_states = self.encoder_properties().has_hidden_states self.encoder_output_shape = get_output_shape(self.encoder, input_shape, has_hidden_states) - return self @staticmethod def allowed_decoders(): raise NotImplementedError + @abstractmethod + def n_encoder_output_feature(self) -> int: + raise NotImplementedError + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: 
X['dataset_properties'].update({'input_shape': self.input_shape}) network_encoder = X.get('network_encoder', OrderedDict()) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py index b1e29b906..f9b64c861 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py @@ -88,6 +88,10 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: feature_preprocessor = TimeSeriesMLPrecpocessor(window_size=self.window_size) return nn.Sequential(feature_preprocessor, *self._build_backbone(in_features * self.window_size)) + def n_encoder_output_feature(self) -> int: + # This function should never be called!! + return self.config["num_units_%d" % (self.config['num_groups'])] + def _add_layer(self, layers: List[nn.Module], in_features: int, out_features: int, layer_id: int) -> None: """ diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py index c7a438f03..e61bc40ff 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py @@ -43,6 +43,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.window_size = X["window_size"] return super().fit(X, y) + def n_encoder_output_feature(self): + # THIS function should never be called!!! + raise NotImplementedError + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: preprocessor = TimeSeriesMLPrecpocessor(window_size=self.window_size) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py index 5f6ec53bb..92bd892ab 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py @@ -144,6 +144,9 @@ def build_encoder(self, input_shape: Tuple[int, ...] 
= (0,)) -> nn.Module: self._receptive_field = encoder.receptive_field return encoder + def n_encoder_output_feature(self) -> int: + return self.config['num_filters'] + @staticmethod def allowed_decoders(): """ @@ -195,5 +198,4 @@ def get_hyperparameter_search_space( add_hyperparameter(cs, num_filters, UniformIntegerHyperparameter) add_hyperparameter(cs, kernel_size, UniformIntegerHyperparameter) add_hyperparameter(cs, bottleneck_size, UniformIntegerHyperparameter) - return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py index 5a6a9f594..a741de250 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py @@ -85,6 +85,9 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: ) return encoder + def n_encoder_output_feature(self) -> int: + return 2 * self.config['hidden_size'] if self.config['bidirectional'] else self.config['hidden_size'] + @staticmethod def allowed_decoders(): """ diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py index fef6db303..d2f3ef072 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py @@ -134,6 +134,10 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> Tuple[nn.Module, int]: self._receptive_field = encoder.receptive_field return encoder, in_features + def n_encoder_output_feature(self) -> int: + num_blocks = self.config["num_blocks"] + return self.config[f"num_filters_{num_blocks}"] + @staticmethod def allowed_decoders(): """ diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index 90652c38c..539169f83 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -36,7 +36,10 @@ def __init__(self, self.lagged_value = [0] else: self.lagged_value = lagged_value - self.input_layer = [nn.Linear(in_features, d_model, bias=False)] + if in_features != d_model: + self.input_layer = [nn.Linear(in_features, d_model, bias=False)] + else: + self.input_layer = [] if use_positional_encoder: self.input_layer.append(PositionalEncoding(d_model, dropout_pe)) self.input_layer = nn.Sequential(*self.input_layer) @@ -90,6 +93,9 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: lagged_value=self.lagged_value) return encoder + def n_encoder_output_feature(self) -> int: + return 2 ** self.config['d_model_log'] + @staticmethod def allowed_decoders(): """ diff --git 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index 87af5a6bb..e7569be00 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -65,7 +65,7 @@ def get_hyperparameter_search_space( self, dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", - value_range=(1, 2), + value_range=(1, 1), default_value=1), variable_selection: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="variable_selection", @@ -82,8 +82,8 @@ def get_hyperparameter_search_space( default_value=False), skip_connection_type: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="skip_connection_type", - value_range=("add", "grn"), - default_value="grn", + value_range=("add", "gate_add_norm"), + default_value="gate_add_norm", ), grn_use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="grn_use_dropout", value_range=(True, False), @@ -329,18 +329,28 @@ def get_hyperparameter_search_space( deep_ar_hp = cs.get_hyperparameter(deep_ar_hp) forbidden_ar = ForbiddenEqualsClause(deep_ar_hp, True) if min_num_blocks == 1: - if max_num_blocks - min_num_blocks > 1: - forbidden = ForbiddenAndConjunction( - ForbiddenInClause(num_blocks, list(range(1, max_num_blocks))), - forbidden_ar - ) - else: - forbidden = ForbiddenAndConjunction(ForbiddenEqualsClause(num_blocks, 2), forbidden_ar) - cs.add_forbidden_clause(forbidden) - - import pdb - - pdb.set_trace() + if max_num_blocks > 1: + if max_num_blocks - min_num_blocks > 1: + forbidden = ForbiddenAndConjunction( + ForbiddenInClause(num_blocks, list(range(1, max_num_blocks))), + forbidden_ar + ) + else: + forbidden = ForbiddenAndConjunction(ForbiddenEqualsClause(num_blocks, 2), forbidden_ar) + cs.add_forbidden_clause(forbidden) + if 'RNNEncoder' in available_encoders: + for i in range(min_num_blocks, max_num_blocks + 1): + rnn_bidirectional_hp = ':'.join([f'block_{min_num_blocks}', + 'RNNEncoder', + 'bidirectional']) + if rnn_bidirectional_hp in cs: + rnn_bidirectional_hp = cs.get_hyperparameter(rnn_bidirectional_hp) + if 'True' in rnn_bidirectional_hp.choices: + forbidden = ForbiddenAndConjunction( + ForbiddenEqualsClause(rnn_bidirectional_hp, True), + deep_ar_hp + ) + cs.add_forbidden_clause(forbidden) return cs def set_hyperparameters(self, @@ -368,8 +378,7 @@ def set_hyperparameters(self, decoder_auto_regressive = params['decoder_auto_regressive'] forecasting_structure_kwargs = dict(num_blocks=num_blocks, variable_selection=params['variable_selection'], - skip_connection=params['skip_connection'], - decoder_auto_regressive=decoder_auto_regressive, ) + skip_connection=params['skip_connection']) del params['num_blocks'] del params['variable_selection'] diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index cd6993f02..863f33541 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ 
b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -1,19 +1,29 @@ -from typing import Any, Dict, Iterable, Tuple, List, Optional +from typing import Any, Dict, Iterable, Tuple, List, Optional, Union import numpy as np import torch from torch import nn from ConfigSpace import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformFloatHyperparameter, \ + UniformIntegerHyperparameter +from ConfigSpace.conditions import EqualsCondition from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. \ + base_forecasting_encoder import NetworkStructure, EncoderBlockInfo +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ base_forecasting_decoder import DecoderBlockInfo from autoPyTorch.pipeline.components.setup.network_head.base_network_head import NetworkHeadComponent from autoPyTorch.utils.common import FitRequirement from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import \ ALL_DISTRIBUTIONS from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.NBEATS_head import build_NBEATS_network +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( + TemporalFusionLayer +) class ForecastingHead(NetworkHeadComponent): @@ -24,13 +34,24 @@ class ForecastingHead(NetworkHeadComponent): _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"] def __init__(self, - random_state: Optional[np.random.RandomState] = None): + random_state: Optional[np.random.RandomState] = None, + use_temporal_fusion: bool = False, + attention_n_head_log: int = 2, + attention_d_model_log: int = 4, + use_dropout: bool = False, + dropout_rate: Optional[float] = None, + ): super(NetworkHeadComponent, self).__init__(random_state=random_state) self.add_fit_requirements(self._required_fit_requirements) self.head: Optional[nn.Module] = None self.required_net_out_put_type: Optional[str] = None self.output_shape = None + self.use_temporal_fusion = use_temporal_fusion + self.attention_n_head_log = attention_n_head_log + self.attention_d_model_log = attention_d_model_log + self.use_dropout = use_dropout + self.dropout_rate = dropout_rate @property def _required_fit_requirements(self) -> List[FitRequirement]: @@ -39,9 +60,11 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), FitRequirement('auto_regressive', (bool,), user_defined=False, dataset_property=False), FitRequirement('n_decoder_output_features', (int,), user_defined=False, dataset_property=False), - FitRequirement('network_decoder', (Dict,), user_defined=False, dataset_property=False), + FitRequirement('network_encoder', (Dict,), user_defined=False, dataset_property=False), + FitRequirement('network_decoder', (Dict,), user_defined=False, dataset_property=False), FitRequirement('n_prediction_heads', (int,), user_defined=False, dataset_property=False), FitRequirement('output_shape', (Iterable,), 
user_defined=True, dataset_property=True), + FitRequirement('network_structure', (EncoderBlockInfo,), user_defined=False, dataset_property=False), ] def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: @@ -54,8 +77,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: Returns: Self """ - self.check_requirements(X, y) + output_shape = X['dataset_properties']['output_shape'] self.required_net_out_put_type = X['required_net_out_put_type'] @@ -78,9 +101,23 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: head_input_shape = X["n_decoder_output_features"] n_prediction_heads = X["n_prediction_heads"] - decoder_has_local_layer = X.get('mlp_has_local_layer', True) + network_structure = X['network_structure'] # type: NetworkStructure + head_components = {} + + if self.use_temporal_fusion: + temporal_fusion = TemporalFusionLayer(window_size=X['window_size'], + n_prediction_steps=X['dataset_properties']['n_prediction_steps'], + network_structure=network_structure, + network_encoder=X['network_encoder'], + n_decoder_output_features=X['n_decoder_output_features'], + d_model=2 ** self.attention_d_model_log, + n_head=2 ** self.attention_n_head_log + ) + head_components['temporal_fusion'] = temporal_fusion + head_input_shape = 2 ** self.attention_d_model_log - self.head = self.build_head( + decoder_has_local_layer = X.get('mlp_has_local_layer', True) + head_components['head'] = self.build_head( input_shape=head_input_shape, output_shape=output_shape, auto_regressive=auto_regressive, @@ -88,6 +125,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: decoder_has_local_layer=decoder_has_local_layer, n_prediction_heads=n_prediction_heads, ) + self.head = nn.ModuleDict(head_components) return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @@ -108,6 +146,27 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: 'network_decoder': decoder}) return X + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + """Get the properties of the underlying algorithm. 
+ + Args: + dataset_properties (Optional[Dict[str, Union[str, int]]): + Describes the dataset to work on + + Returns: + Dict[str, Any]: + Properties of the algorithm + """ + return { + 'shortname': 'ForecastingHead', + 'name': 'ForecastingHead', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + def build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...], @@ -197,17 +256,66 @@ def build_proj_layer(input_shape: Tuple[int, ...], @staticmethod def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + use_temporal_fusion: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='use_temporal_fusion', + value_range=(True, False), + default_value=False), + attention_n_head_log: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='attention_n_head_log', + value_range=(1, 3), + default_value=2, + ), + attention_d_model_log: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='attention_d_model_log', + value_range=(4, 8), + default_value=4, + ), + use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='use_dropout', + value_range=(True, False), + default_value=True, + ), + dropout_rate: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='dropout_rate', + value_range=(0.0, 0.8), + default_value=0.1, + ) ) -> ConfigurationSpace: """Return the configuration space of this classification algorithm. Args: dataset_properties (Optional[Dict[str, Union[str, int]]): Describes the dataset to work on - + use_temporal_fusion (HyperparameterSearchSpace): + if attention fusion layer is applied (Lim et al. + Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting, + https://arxiv.org/abs/1912.09363) + attention_n_head_log (HyperparameterSearchSpace): + log value of number of heads for interpretable + attention_d_model_log (HyperparameterSearchSpace): + log value of input of attention model + use_dropout (HyperparameterSearchSpace): + if dropout is applied to temporal fusion layer + dropout_rate (HyperparameterSearchSpace): + dropout rate of the temporal fusion layer Returns: ConfigurationSpace: The configuration space of this algorithm. 
""" cs = ConfigurationSpace() + + use_temporal_fusion = get_hyperparameter(use_temporal_fusion, CategoricalHyperparameter) + attention_n_head_log = get_hyperparameter(attention_n_head_log, UniformIntegerHyperparameter) + attention_d_model_log = get_hyperparameter(attention_d_model_log, UniformIntegerHyperparameter) + use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) + dropout_rate = get_hyperparameter(dropout_rate, UniformFloatHyperparameter) + + cs.add_hyperparameters([use_temporal_fusion, attention_n_head_log, attention_d_model_log, use_dropout, + dropout_rate]) + cond_attention_n_head_log = EqualsCondition(attention_n_head_log, use_temporal_fusion, True) + cond_attention_d_model_log = EqualsCondition(attention_d_model_log, use_temporal_fusion, True) + cond_use_dropout = EqualsCondition(use_dropout, use_temporal_fusion, True) + cond_dropout_rate = EqualsCondition(dropout_rate, use_dropout, True) + cs.add_conditions([cond_attention_n_head_log, cond_attention_d_model_log, cond_use_dropout, cond_dropout_rate]) return cs diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py index 64669a0af..2f20608c7 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py @@ -10,6 +10,7 @@ from autoPyTorch.datasets.base_dataset import TransformSubset from autoPyTorch.datasets.time_series_dataset import TimeSeriesSequence +from torch.nn.utils.rnn import pad_sequence class TestSequenceDataset(TransformSubset): @@ -24,15 +25,14 @@ def __getitem__(self, idx: int) -> np.ndarray: return seq.__getitem__(len(seq) - 1, self.train) -def pad_sequence_from_start(sequences: List[torch.Tensor], - seq_minimal_length: int, - seq_max_length: int = np.inf, - batch_first=True, - padding_value=0.0) -> torch.Tensor: +def pad_sequence_with_minimal_length(sequences: List[torch.Tensor], + seq_minimal_length: int, + seq_max_length: int = np.inf, + batch_first=True, + padding_value=0.0) -> torch.Tensor: r""" - This function is quite similar to torch.nn.utils.rnn.pad_sequence except that we pad new values from the start of - the sequence. i.e., instead of extending [1,2,3] to [1,2,3,0,0], we extend it as [0,0,1,2,3]. Additionally, the - generated sequnece needs to have a length of at least seq_minimal_length + This function is quite similar to torch.nn.utils.rnn.pad_sequence except that we constraint the sequence to be + at least seq_minimal_length and at most seq_max_length """ # assuming trailing dimensions and type of all the Tensors @@ -52,9 +52,9 @@ def pad_sequence_from_start(sequences: List[torch.Tensor], length = min(tensor.size(0), seq_max_length) # use index notation to prevent duplicate references to the tensor if batch_first: - out_tensor[i, -length:, ...] = tensor[-length:] + out_tensor[i, :length, ...] = tensor[-length:] else: - out_tensor[-length:, i, ...] = tensor[-length:] + out_tensor[length:, i, ...] 
= tensor[-length:] return out_tensor @@ -67,7 +67,8 @@ class PadSequenceCollector: """ - def __init__(self, window_size: int, sample_interval, target_padding_value: float = 0.0, seq_max_length: int = np.inf): + def __init__(self, window_size: int, sample_interval, target_padding_value: float = 0.0, + seq_max_length: int = np.inf): self.window_size = window_size self.sample_interval = sample_interval self.target_padding_value = target_padding_value @@ -77,7 +78,7 @@ def __call__(self, batch, sample_interval=1, padding_value=0.0): elem = batch[0] elem_type = type(elem) if isinstance(elem, torch.Tensor): - seq = pad_sequence_from_start(batch, + seq = pad_sequence_with_minimal_length(batch, seq_minimal_length=self.window_size, seq_max_length=self.seq_max_length, batch_first=True, padding_value=padding_value) # type: torch.Tensor diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index f211af861..576640c12 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -201,6 +201,8 @@ def _get_hyperparameter_search_space(self, if 'loss' in self.named_steps.keys() and 'network_backbone' in self.named_steps.keys(): hp_loss = cs.get_hyperparameter('loss:__choice__') + ar_forbidden = True + hp_auto_regressive = [] for hp_name in cs.get_hyperparameter_names(): if hp_name.startswith('network_backbone:'): @@ -212,7 +214,7 @@ def _get_hyperparameter_search_space(self, if 'RegressionLoss' in hp_loss.choices: forbidden_hp_regression_loss = ForbiddenEqualsClause(hp_loss, 'RegressionLoss') for hp_ar in hp_auto_regressive: - forbidden_hp_dist = ForbiddenEqualsClause(hp_ar, True) + forbidden_hp_dist = ForbiddenEqualsClause(hp_ar, ar_forbidden) forbidden_hp_dist = ForbiddenAndConjunction(forbidden_hp_dist, forbidden_hp_regression_loss) forbidden_losses_all.append(forbidden_hp_dist) @@ -244,7 +246,7 @@ def _get_hyperparameter_search_space(self, forbidden = ['MLPEncoder'] forbidden_deepAREncoder = [forbid for forbid in forbidden if forbid in network_encoder_hp.choices] for hp_ar in hp_auto_regressive: - forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, True) + forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) forbidden_hp_mlpencoder = ForbiddenInClause(network_encoder_hp, forbidden_deepAREncoder) forbidden_hp_ar_mlp = ForbiddenAndConjunction(forbidden_hp_ar, forbidden_hp_mlpencoder) forbidden_losses_all.append(forbidden_hp_ar_mlp) @@ -253,7 +255,7 @@ def _get_hyperparameter_search_space(self, forbidden = ['MLPEncoder'] forbidden_deepAREncoder = [forbid for forbid in forbidden if forbid in network_encoder_hp.choices] for hp_ar in hp_auto_regressive: - forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, True) + forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) forbidden_hp_mlpencoder = ForbiddenInClause(network_encoder_hp, forbidden_deepAREncoder) forbidden_hp_ar_mlp = ForbiddenAndConjunction(forbidden_hp_ar, forbidden_hp_mlpencoder) forbidden_losses_all.append(forbidden_hp_ar_mlp) @@ -261,7 +263,7 @@ def _get_hyperparameter_search_space(self, forecast_strategy = cs.get_hyperparameter('network:forecast_strategy') if 'mean' in forecast_strategy.choices: for hp_ar in hp_auto_regressive: - forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, True) + forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) forbidden_hp_forecast_strategy = ForbiddenEqualsClause(forecast_strategy, 'mean') forbidden_hp_ar_forecast_strategy = ForbiddenAndConjunction(forbidden_hp_ar, 
forbidden_hp_forecast_strategy) From 57461b903d7bd44b481094d813b8a9980dfca140 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 2 Mar 2022 09:24:28 +0100 Subject: [PATCH 169/347] construct network --- .../setup/network/forecasting_architecture.py | 216 +++++++++++++++++- .../setup/network/forecasting_network.py | 5 +- .../forecasting_backbone/components_util.py | 5 +- .../base_forecasting_decoder.py | 12 +- .../base_forecasting_encoder.py | 18 ++ .../seq_encoder/InceptionTimeEncoder.py | 5 +- .../seq_encoder/RNNEncoder.py | 37 +-- .../seq_encoder/TCNEncoder.py | 5 +- .../seq_encoder/TransformerEncoder.py | 9 +- .../seq_encoder/__init__.py | 63 ++++- .../forecasting_head.py | 2 +- 11 files changed, 333 insertions(+), 44 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 0e0b215ac..83e23ab47 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -1,11 +1,8 @@ from collections import OrderedDict from typing import Any, Dict, Optional, Union, Tuple, List +from enum import Enum -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter -from ConfigSpace.conditions import EqualsCondition - -import numpy as np +from abc import abstractmethod import torch from torch import nn @@ -16,8 +13,6 @@ TransformedDistribution, ) -from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. 
\ base_target_scaler import BaseTargetScaler from autoPyTorch.pipeline.components.setup.network_backbone.\ @@ -28,12 +23,30 @@ NetworkStructure, EncoderProperties ) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder.\ + RNNEncoder import _RNN + from autoPyTorch.pipeline.components.setup.network_backbone.\ forecasting_backbone.forecasting_decoder.base_forecasting_decoder import ( DecoderBlockInfo, DecoderProperties ) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import AddLayer +from pytorch_forecasting.models.temporal_fusion_transformer.sub_modules import ( + TimeDistributed, TimeDistributedInterpolation, GatedLinearUnit, ResampleNorm, AddNorm, GateAddNorm, + GatedResidualNetwork, VariableSelectionNetwork, InterpretableMultiHeadAttention +) + + +class EncoderOutputForm(Enum): + NoOutput = 0 + HiddenStates = 1 # RNN -> RNN + Sequence = 2 # RNN/TCN/Transformer -> MLP + SequenceLast = 3 # Transformer -> Transformer + + + class TransformedDistribution_(TransformedDistribution): """ We implement the mean function such that we do not need to enquire base mean every time @@ -151,7 +164,62 @@ def get_lagged_subsequences_inference( return lagged_seq -class ForecastingNet(nn.Module): +class StackedEncoder(nn.Module): + def __init__(self, + network_structure: NetworkStructure, + has_temporal_fusion: bool, + network_encoder: Dict[str, EncoderBlockInfo], + network_decoder: Dict[str, DecoderBlockInfo], + ): + self.num_blocks = network_structure.num_blocks + self.skip_connection = network_structure.skip_connection + self.has_temporal_fusion = has_temporal_fusion + + self.encoder_output_type = {} + self.encoder_has_hidden_states = {} + encoder = nn.ModuleDict() + for i in range(1, self.num_blocks + 1): + block_id = f'block_{i}' + encoder[block_id] = network_encoder[block_id].encoder + if self.skip_connection: + input_size = network_encoder[block_id].encoder_output_shape_[-1] + skip_size = network_encoder[block_id].encoder_input_shape[-1] + if network_structure.skip_connection_type == 'add': + encoder[f'skip_connection_{i}'] = AddLayer(input_size, skip_size) + elif network_structure.skip_connection_type == 'gate_add_norm': + encoder[f'skip_connection_{i}'] = GateAddNorm(input_size, + hidden_size=input_size, + skip_size=skip_size, + dropout=network_structure.grn_dropout_rate) + if block_id in network_decoder: + if network_decoder[block_id].decoder_properties.recurrent: + if network_decoder[block_id].decoder_properties.has_hidden_states: + # RNN + self.encoder_has_decoder[i] = EncoderOutputForm.HiddenStates + else: + # Transformer + self.encoder_has_decoder[i] = EncoderOutputForm.Sequence + else: + self.encoder_has_decoder[i] = EncoderOutputForm.SequenceLast + else: + self.encoder_has_decoder[i] = EncoderOutputForm.NoOutput + if network_decoder[block_id].decoder_properties.has_hidden_states: + self.encoder_has_hidden_states[i] = True + else: + self.encoder_has_hidden_states[i] = False + self.encoder = encoder + + def forward(self, encoder_input: torch.Tensor, additional_input: List[Optional[torch.Tensor]], output_seq: bool): + output_for_decoder = [] + for i in range(1, self.num_blocks + 1): + if self.encoder_has_hidden_states[i]: + x, hx = self.encoder[f'block_{i}'](encoder_input, ) + + + + + +class AbstractForecastingNet(nn.Module): future_target_required = False def __init__(self, @@ -163,6 +231,7 @@ def __init__(self, window_size: int, target_scaler: BaseTargetScaler, dataset_properties: 
Dict, + auto_regressive: bool, output_type: str = 'regression', forecast_strategy: Optional[str] = 'mean', num_samples: Optional[int] = 100, @@ -180,6 +249,7 @@ def __init__(self, network_decoder (nn.Module): network decoder network_head (nn.Module): network head, maps the output of decoder to the final output dataset_properties (Dict): dataset properties + auto_regressive (bool): if the overall model is auto-regressive model encoder_properties (Dict): encoder properties decoder_properties: (Dict): decoder properties output_type (str): the form that the network outputs. It could be regression, distribution and @@ -237,6 +307,103 @@ def __init__(self, if self.decoder_lagged_input: self.cached_lag_mask_decoder = None + if network_structure.variable_selection: + # TODO rewrite forecasting dataset to allow mutli-variant models!!! + first_encoder = network_encoder['block_1'] + first_encoder_output_shape = network_encoder['block_1'].encoder_output_shape_[-1] + static_input_sizes = dataset_properties['static_features_shape'] + variable_selector = nn.ModuleDict() + if static_input_sizes > 0: + variable_selector['static_variable_selection'] = VariableSelectionNetwork( + input_sizes=static_input_sizes, + hidden_size=first_encoder_output_shape, + input_embedding_flags={}, + dropout=network_structure.grn_dropout_rate, + ) + if dataset_properties['uni_variant']: + # variable selection for encoder and decoder + encoder_input_sizes = { + 'past_targets': dataset_properties['input_shape'][-1], + 'past_features': 0 + } + decoder_input_sizes = { + 'future_features': 0 + } + if auto_regressive: + decoder_input_sizes.update({'future_prediction': dataset_properties['output_shape'][-1]}) + else: + # TODO + pass + + # create single variable grns that are shared across decoder and encoder + if network_structure.share_single_variable_networks: + variable_selector['shared_single_variable_grns'] = nn.ModuleDict() + for name, input_size in encoder_input_sizes.items(): + variable_selector['shared_single_variable_grns'][name] = GatedResidualNetwork( + input_size, + min(input_size, first_encoder_output_shape), + first_encoder_output_shape, + network_structure.grn_dropout_rate, + ) + for name, input_size in decoder_input_sizes.items(): + if name not in self.shared_single_variable_grns: + variable_selector['shared_single_variable_grns'][name] = GatedResidualNetwork( + input_size, + min(input_size, first_encoder_output_shape), + first_encoder_output_shape, + network_structure.grn_dropout_rate, + ) + + variable_selector['encoder_variable_selection'] = VariableSelectionNetwork( + input_sizes=encoder_input_sizes, + hidden_size=first_encoder_output_shape, + input_embedding_flags={}, + dropout=network_structure.grn_dropout_rate, + context_size=first_encoder_output_shape, + single_variable_grns={} + if not network_structure.share_single_variable_networks + else variable_selector['shared_single_variable_grns'], + ) + + variable_selector['encoder_variable_selection'] = VariableSelectionNetwork( + input_sizes=decoder_input_sizes, + hidden_size=self.hparams.hidden_size, + input_embedding_flags={}, + dropout=network_structure.grn_dropout_rate, + context_size=first_encoder_output_shape, + single_variable_grns={} + if not network_structure.share_single_variable_networks + else variable_selector['shared_single_variable_grns'], + ) + + variable_selector['static_context_variable_selection'] = GatedResidualNetwork( + input_size=first_encoder_output_shape, + hidden_size=first_encoder_output_shape, + output_size=first_encoder_output_shape, + 
dropout=network_structure.grn_dropout_rate, + ) + + if first_encoder.encoder_properties.has_hidden_states: + if isinstance(first_encoder.encoder, _RNN): + # for hidden state of the rnn + variable_selector['static_context_initial_hidden_lstm'] = GatedResidualNetwork( + input_size=first_encoder_output_shape, + hidden_size=first_encoder_output_shape, + output_size=first_encoder_output_shape, + dropout=network_structure.grn_dropout_rate, + ) + if first_encoder.encoder.cell_type == 'lstm': + # for cell state of the lstm + variable_selector['static_context_initial_cell_lstm'] = GatedResidualNetwork( + input_size=first_encoder_output_shape, + hidden_size=first_encoder_output_shape, + output_size=first_encoder_output_shape, + dropout=network_structure.grn_dropout_rate, + ) + else: + raise NotImplementedError + + @property def device(self): return self._device @@ -280,13 +447,44 @@ def scale_value(self, outputs = (outputs - loc.to(device)) / scale.to(device) return outputs + @abstractmethod def forward(self, past_targets: torch.Tensor, future_targets: Optional[torch.Tensor] = None, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - hidden_states: Optional[Tuple[torch.Tensor]] = None): + encoder_length: Optional[torch.Tensor] = None, + decoder_observed_values: Optional[torch.Tensor] = None, + hidden_states: Optional[Tuple[torch.Tensor]] = None, + ): + raise NotImplementedError + + @abstractmethod + def pred_from_net_output(self, net_output): + raise NotImplementedError + + @abstractmethod + def predict(self, + past_targets: torch.Tensor, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None + ): + raise NotImplementedError + + +class ForecastingNet(AbstractForecastingNet): + def forward(self, + past_targets: torch.Tensor, + future_targets: Optional[torch.Tensor] = None, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None, + encoder_length: Optional[torch.Tensor] = None, + decoder_observed_values: Optional[torch.Tensor] = None, + hidden_states: Optional[Tuple[torch.Tensor]] = None, + ): if self.encoder_lagged_input: past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler(past_targets[:, -self.window_size:]) past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index f29f880a7..7d894d137 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -75,9 +75,9 @@ def _required_fit_requirements(self): FitRequirement("network_decoder", (Dict[str, DecoderBlockInfo]), user_defined=False, dataset_property=False), FitRequirement("network_head", (Optional[torch.nn.Module],), user_defined=False, dataset_property=False), + FitRequirement("auto_regressive", (bool,), user_defined=False, dataset_property=False), FitRequirement("target_scaler", (BaseTargetScaler,), user_defined=False, dataset_property=False), FitRequirement("required_net_out_put_type", (str,), user_defined=False, dataset_property=False), - FitRequirement("encoder_properties_1", (Dict,), user_defined=False, dataset_property=False), ] def fit(self, X: Dict[str, 
Any], y: Any = None) -> autoPyTorchTrainingComponent:
@@ -99,6 +99,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent:
             network_encoder=network_encoder,
             network_decoder=network_decoder,
             network_head=X['network_head'],
+            auto_regressive=X['auto_regressive'],
             window_size=X['window_size'],
             dataset_properties=X['dataset_properties'],
             target_scaler=X['target_scaler'],
@@ -106,6 +107,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent:
             forecast_strategy=self.forecast_strategy,
             num_samples=self.num_samples,
             aggregation=self.aggregation,
         )
         if X['decoder_properties']['recurrent']:
             # decoder is RNN or Transformer
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py
index e470f77f5..7e06edb6b 100644
--- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py
+++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py
@@ -25,12 +25,13 @@ def __init__(self, input_size: int, skip_size: int):
         super().__init__()
         if input_size == skip_size:
             self.fc = nn.Linear(skip_size, input_size)
+        self.norm = nn.LayerNorm(input_size)
 
     def forward(self, input: torch.Tensor, skip: torch.Tensor):
         if hasattr(self, 'fc'):
-            return input + self.fc(skip)
+            return self.norm(input + self.fc(skip))
         else:
-            return input
+            return self.norm(input)
 
 
 class TemporalFusionLayer(nn.Module):
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py
index 11f113a75..5784a2833 100644
--- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py
+++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py
@@ -26,6 +26,8 @@ class DecoderProperties(NamedTuple):
 class DecoderBlockInfo(NamedTuple):
     decoder: nn.Module
     decoder_properties: DecoderProperties
+    decoder_output_shape: Tuple[int, ...]
+    decoder_input_shape: Tuple[int, ...]
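# The lines below are an illustrative, self-contained sketch (all names invented, not part of
# this patch) of why the block-info tuples record their input and output shapes: a downstream
# component can compare the stored widths and only insert a projection layer when they differ,
# which is how the temporal fusion layer defined earlier aligns decoder features with encoder
# features.
import torch
from torch import nn

def maybe_projection(n_decoder_features: int, n_encoder_features: int) -> nn.Module:
    # project only when the widths disagree; otherwise pass the features through unchanged
    if n_decoder_features != n_encoder_features:
        return nn.Linear(n_decoder_features, n_encoder_features, bias=False)
    return nn.Identity()

proj = maybe_projection(n_decoder_features=12, n_encoder_features=16)
decoder_out = torch.randn(2, 3, 12)   # (batch, n_prediction_steps, features)
aligned = proj(decoder_out)           # (2, 3, 16), ready to concatenate with the encoder output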
class DecoderNetwork(nn.Module): @@ -62,6 +64,7 @@ def __init__(self, self.config = kwargs self.decoder: Optional[nn.Module] = None self.n_decoder_output_features = None + self.decoder_input_shape = None self.n_prediction_heads = 1 self.is_last_decoder = False @@ -133,6 +136,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: n_prediction_heads=self.n_prediction_heads, dataset_properties=X['dataset_properties'] ) + self.decoder_input_shape = encoder_output_shape X['n_decoder_output_features'] = self.n_decoder_output_features return self @@ -150,8 +154,12 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: # 'n_prediction_heads' and 'n_decoder_output_features' are only applied to the head such that they could be # overwritten by the following decoders network_decoder = X.get('network_decoder', OrderedDict()) - network_decoder[f'block_{self.block_number}'] = DecoderBlockInfo(decoder=self.decoder, - decoder_properties=self.decoder_properties()) + network_decoder[f'block_{self.block_number}'] = DecoderBlockInfo( + decoder=self.decoder, + decoder_properties=self.decoder_properties(), + decoder_input_shape=self.decoder_input_shape, + decoder_output_shape=(self.n_prediction_heads, self.n_decoder_output_features) + ) if self.is_last_decoder: X.update({f'network_decoder': network_decoder, 'n_prediction_heads': self.n_prediction_heads, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index 7ef597d1d..29bb21f7c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -25,11 +25,13 @@ class EncoderProperties(NamedTuple): bijective_seq_output: bool = True fixed_input_seq_length: bool = False lagged_input: bool = False + causality: bool = True # this value indicates if the output of the model only depends on the past targets class NetworkStructure(NamedTuple): num_blocks: int = 1 variable_selection: bool = False + share_single_variable_networks: bool = False skip_connection: bool = False skip_connection_type: str = "add" # could be 'add' or 'gate_add_norm' grn_dropout_rate: float = 0.0 @@ -38,6 +40,7 @@ class NetworkStructure(NamedTuple): class EncoderBlockInfo(NamedTuple): encoder: nn.Module encoder_properties: EncoderProperties + encoder_input_shape: Tuple[int, ...] encoder_output_shape_: Tuple[int, ...] 
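# A minimal, self-contained sketch (illustrative names only, not the classes added in this patch)
# of the two skip-connection variants named by NetworkStructure.skip_connection_type above:
# 'add' sums the block output with the input routed around the block, while 'gate_add_norm'
# gates the block output before adding the skip and applying layer normalisation.
import torch
from torch import nn

class GatedAddNormSketch(nn.Module):
    def __init__(self, size: int):
        super().__init__()
        self.gate = nn.Sequential(nn.Linear(size, size), nn.Sigmoid())
        self.norm = nn.LayerNorm(size)

    def forward(self, x: torch.Tensor, skip: torch.Tensor) -> torch.Tensor:
        # gate the block output, add the skip input, then normalise
        return self.norm(self.gate(x) * x + skip)

block_out = torch.randn(2, 5, 8)   # (batch, time, features) produced by an encoder block
block_in = torch.randn(2, 5, 8)    # the input fed around the block
added = block_out + block_in                         # skip_connection_type == 'add'
gated = GatedAddNormSketch(8)(block_out, block_in)   # skip_connection_type == 'gate_add_norm'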
@@ -45,6 +48,7 @@ class ForecastingNetworkStructure(autoPyTorchComponent): def __init__(self, random_state: Optional[np.random.RandomState] = None, num_blocks: int = 1, variable_selection: bool = False, + share_single_variable_networks: bool = False, skip_connection: bool = False, skip_connection_type: str = "add", grn_dropout_rate: float = 0.0, @@ -52,6 +56,7 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None, super().__init__() self.network_structure = NetworkStructure(num_blocks=num_blocks, variable_selection=variable_selection, + share_single_variable_networks=share_single_variable_networks, skip_connection=skip_connection, skip_connection_type=skip_connection_type, grn_dropout_rate=grn_dropout_rate) @@ -107,6 +112,18 @@ def forward(self, x: torch.Tensor, output_seq: bool = False): """ raise NotImplementedError + def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: + """ + get the last value of the sequential output + Args: + x: torch.Tensor(B, L, N): a sequential value output by the network, usually this value needs to be fed + to the decoder + Returns: + output: torch.Tensor(B, M): last element of the sequential value + + """ + raise NotImplementedError + class BaseForecastingEncoder(autoPyTorchComponent): """ @@ -199,6 +216,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: network_encoder = X.get('network_encoder', OrderedDict()) network_encoder[f'block_{self.block_number}'] = EncoderBlockInfo(encoder=self.encoder, encoder_properties=self.encoder_properties(), + encoder_input_shape=self.input_shape, encoder_output_shape_=self.encoder_output_shape) X.update({f'network_encoder': network_encoder}) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py index 92bd892ab..fcfcdbb32 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py @@ -128,7 +128,10 @@ def forward(self, x: torch.Tensor, output_seq=False) -> torch.Tensor: if output_seq: return x else: - return x[:, -1, :] + return self.get_last_seq_value(x) + + def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: + return x[:, -1, :] class InceptionTimeEncoder(BaseForecastingEncoder): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py index a741de250..8b93b4371 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py @@ -40,6 +40,7 @@ def __init__(self, dropout=config.get("dropout", 0.0), bidirectional=config["bidirectional"], batch_first=True) + self.cell_type = config['cell_type'] def forward(self, x: torch.Tensor, @@ -52,19 +53,22 @@ def forward(self, if output_seq: return outputs, hidden_state else: - if not self.config["bidirectional"]: - return outputs[:, -1, :], hidden_state - else: - # concatenate last forward hidden state 
with first backward hidden state - outputs_by_direction = outputs.view(B, - T, - 2, - self.config["hidden_size"]) - out = torch.cat([ - outputs_by_direction[:, -1, 0, :], - outputs_by_direction[:, 0, 1, :] - ], dim=-1) - return out, hidden_state + return self.get_last_seq_value(x), hidden_state + + def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: + B, T, _ = x.shape + if not self.config["bidirectional"]: + return x[:, -1, :] + else: + x_by_direction = x.view(B, + T, + 2, + self.config["hidden_size"]) + x = torch.cat([ + x_by_direction[:, -1, 0, :], + x_by_direction[:, 0, 1, :] + ], dim=-1) + return x class RNNEncoder(BaseForecastingEncoder): @@ -143,15 +147,16 @@ def get_hyperparameter_search_space( value_range=(0., 0.5), default_value=0.1), bidirectional: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bidirectional', - value_range=(True, False), - default_value=True), + value_range=(False,), + default_value=False), decoder_type: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='decoder_type', value_range=('MLPDecoder', 'RNNDecoder'), default_value='MLPDecoder') ) -> ConfigurationSpace: """ - get hyperparameter search space + get hyperparameter search space, bidirectional is not casual so I do not allow it to be set as True, + However, it might be further implemented to NLP tasks """ cs = CS.ConfigurationSpace() diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py index d2f3ef072..5cb45d74c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py @@ -109,7 +109,10 @@ def forward(self, x: torch.Tensor, output_seq=False) -> torch.Tensor: if output_seq: return x else: - return x[:, -1, :] + return self.get_last_seq_value(x) + + def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: + return x[:, -1, :] class TCNEncoder(BaseForecastingEncoder): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index 539169f83..2cabd81a5 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -20,6 +20,7 @@ PositionalEncoding, build_transformer_layers + class _TransformerEncoder(EncoderNetwork): def __init__(self, in_features: int, @@ -63,7 +64,10 @@ def forward(self, if output_seq: return x else: - return x[:, -1, :] + return self.get_last_seq_value(x) + + def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: + return x[:, -1, :] class TransformerEncoder(BaseForecastingEncoder): @@ -105,7 +109,8 @@ def allowed_decoders(): @staticmethod def encoder_properties(): - return EncoderProperties(lagged_input=True) + return EncoderProperties(lagged_input=True, + causality=False) def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: if 'lagged_value' in X['dataset_properties']: diff --git 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index e7569be00..54904cfec 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -72,6 +72,11 @@ def get_hyperparameter_search_space( value_range=(True, False), default_value=False ), + share_single_variable_networks: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="share_single_variable_networks", + value_range=(True, False), + default_value=False, + ), decoder_auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="decoder_auto_regressive", value_range=(True, False), @@ -103,13 +108,17 @@ def get_hyperparameter_search_space( variable_selection (HyperparameterSearchSpace): if variable selection is applied, if True, then the first block will be attached with a variable selection block while the following will be enriched with static features. + share_single_variable_networks( HyperparameterSearchSpace): if single variable networks are shared between + encoder and decoder skip_connection: HyperparameterSearchSpace: if skip connection is applied skip_connection_type (HyperparameterSearchSpace): skip connection type, it could be directly added or a grn network ( Lim et al, Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting: https://arxiv.org/abs/1912.09363) TODO consider hidden size of grn as a new HP - grn_use_dropout (HyperparameterSearchSpace): if dropout layer is applied to grn - grn_dropout_rate (HyperparameterSearchSpace): dropout rate of grn + grn_use_dropout (HyperparameterSearchSpace): if dropout layer is applied to GRN, since variable selection + network also contains GRN, this parameter also influence variable selection network + grn_dropout_rate (HyperparameterSearchSpace): dropout rate of GRN, same as above, this variable also + influence variable selection network decoder_auto_regressive: HyperparameterSearchSpace: if decoder is auto_regressive, e.g., if the decoder receives the output as its input, this only works for auto_regressive decoder models default (Optional[str]): Default backbone to use @@ -133,6 +142,7 @@ def get_hyperparameter_search_space( min_num_blocks, max_num_blocks = num_blocks.value_range variable_selection = get_hyperparameter(variable_selection, CategoricalHyperparameter) + share_single_variable_networks = get_hyperparameter(share_single_variable_networks, CategoricalHyperparameter) decoder_auto_regressive = get_hyperparameter(decoder_auto_regressive, CategoricalHyperparameter) num_blocks = get_hyperparameter(num_blocks, UniformIntegerHyperparameter) @@ -147,11 +157,19 @@ def get_hyperparameter_search_space( if 'grn' in skip_connection_type.choices: grn_use_dropout = get_hyperparameter(grn_use_dropout, CategoricalHyperparameter) hp_network_structures.append(grn_use_dropout) - cond_skip_connections.append(EqualsCondition(grn_use_dropout, skip_connection_type, "grn")) + if True in variable_selection.choices: + cond_skip_connections.append( + OrConjunction(EqualsCondition(grn_use_dropout, skip_connection_type, "grn"), + EqualsCondition(grn_dropout_rate, variable_selection, True)) + ) + else: + 
cond_skip_connections.append(EqualsCondition(grn_use_dropout, skip_connection_type, "grn"))
             if True in grn_use_dropout.choices:
                 grn_dropout_rate = get_hyperparameter(grn_dropout_rate, UniformFloatHyperparameter)
                 hp_network_structures.append(grn_dropout_rate)
                 cond_skip_connections.append(EqualsCondition(grn_dropout_rate, grn_use_dropout, True))
+            elif True in variable_selection.choices:
+                cond_skip_connections.append(EqualsCondition(grn_dropout_rate, variable_selection, True))
 
         cs.add_hyperparameters(hp_network_structures)
         if cond_skip_connections:
@@ -166,6 +184,9 @@ def get_hyperparameter_search_space(
                 ForbiddenEqualsClause(variable_selection, False),
                 ForbiddenEqualsClause(decoder_auto_regressive, True)
             ))
+        if True in variable_selection.choices:
+            cs.add_hyperparameter(share_single_variable_networks)
+            cs.add_condition(EqualsCondition(share_single_variable_networks, variable_selection, True))
 
         # Compile a list of legal preprocessors for this problem
         available_encoders = self.get_available_components(
@@ -283,16 +304,18 @@ def get_hyperparameter_search_space(
             hp_encoder_choice = cs.get_hyperparameter(block_prefix + '__choice__')
             for encoder_single in encoder_with_single_decoder:
                 if encoder_single in hp_encoder_choice.choices:
+                    if forbidden_decoder_ar is not None:
+                        forbiddens_decoder_auto_regressive.append(ForbiddenAndConjunction(
+                            forbidden_decoder_ar,
+                            ForbiddenEqualsClause(hp_encoder_choice, encoder_single)
+                        ))
+            for encode_multi in encoders_with_multi_decoder:
+                hp_decoder_type = cs.get_hyperparameter(f"{block_prefix}{encode_multi}:decoder_type")
+                if forbidden_decoder_ar is not None:
                     forbiddens_decoder_auto_regressive.append(ForbiddenAndConjunction(
                         forbidden_decoder_ar,
                         ForbiddenEqualsClause(hp_encoder_choice, encoder_single)
                     ))
-            for encode_multi in encoders_with_multi_decoder:
-                hp_decoder_type = cs.get_hyperparameter(f"{block_prefix}{encode_multi}:decoder_type")
-                forbiddens_decoder_auto_regressive.append(ForbiddenAndConjunction(
-                    forbidden_decoder_ar,
-                    ForbiddenEqualsClause(hp_decoder_type, decoder_name)
-                ))
 
         hps = cs.get_hyperparameters()  # type: List[CSH.Hyperparameter]
         conditions_to_add = []
@@ -322,7 +345,25 @@ def get_hyperparameter_search_space(
 
         cs.add_conditions(conditions_to_add)
 
+        for encoder_name, encoder in available_encoders.items():
+            encoder_is_causal = encoder.encoder_properties().causality
+            if not encoder_is_causal:
+                # we do not allow a non-causal encoder to appear in the lower layers of the network, 
e.g. if we have a
+                # network with 3 blocks, a non-causal encoder is only allowed to appear in the third (last) block
+                for i in range(max(min_num_blocks, 2), max_num_blocks + 1):
+                    for j in range(1, i):
+                        choice_hp = cs.get_hyperparameter(f"block_{j}:__choice__")
+                        if encoder_name in choice_hp.choices:
+                            forbidden_encoder_uncausal = [ForbiddenEqualsClause(num_blocks, i),
+                                                          ForbiddenEqualsClause(choice_hp, encoder_name)]
+                            if forbidden_decoder_ar is not None:
+                                forbidden_encoder_uncausal.append(forbidden_decoder_ar)
+                            forbiddens_decoder_auto_regressive.append(
+                                ForbiddenAndConjunction(*forbidden_encoder_uncausal)
+                            )
+
         cs.add_forbidden_clauses(forbiddens_decoder_auto_regressive)
+
         if self.deepAR_decoder_name in available_decoders:
             deep_ar_hp = ':'.join([self.deepAR_decoder_prefix, self.deepAR_decoder_name, 'auto_regressive'])
             if deep_ar_hp in cs:
@@ -379,12 +420,16 @@ def set_hyperparameters(self,
             forecasting_structure_kwargs = dict(num_blocks=num_blocks,
                                                 variable_selection=params['variable_selection'],
                                                 skip_connection=params['skip_connection'])
+            if 'share_single_variable_networks' in params:
+                forecasting_structure_kwargs['share_single_variable_networks'] = params['share_single_variable_networks']
+                del params['share_single_variable_networks']
 
             del params['num_blocks']
             del params['variable_selection']
             del params['skip_connection']
             del params['decoder_auto_regressive']
+
             if 'skip_connection_type' in params:
                 forecasting_structure_kwargs['skip_connection_type'] = params['skip_connection_type']
                 del params['skip_connection_type']
diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py
index 863f33541..cc2b2fff2 100644
--- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py
+++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py
@@ -64,7 +64,7 @@ def _required_fit_requirements(self) -> List[FitRequirement]:
             FitRequirement('network_decoder', (Dict,), user_defined=False, dataset_property=False),
             FitRequirement('n_prediction_heads', (int,), user_defined=False, dataset_property=False),
             FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True),
-            FitRequirement('network_structure', (EncoderBlockInfo,), user_defined=False, dataset_property=False),
+            FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False),
         ]
 
     def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:

From 20eb852fce48900631dd26546512210b3f54e68a Mon Sep 17 00:00:00 2001
From: dengdifan
Date: Wed, 2 Mar 2022 20:52:23 +0100
Subject: [PATCH 170/347] cells for networks

---
 .../setup/network/forecasting_architecture.py | 178 +-----
 .../setup/network/forecasting_network.py      |  19 +-
 .../forecasting_backbone/cells.py             | 508 ++++++++++++++++++
 .../forecasting_backbone/components_util.py   | 202 ++-----
 .../forecasting_decoder/MLPDecoder.py         |   3 +-
 .../base_forecasting_decoder.py               |  81 +--
 .../forecasting_decoder/components.py         |  36 ++
 .../forecasting_encoder/__init__.py           |   5 +-
 .../base_forecasting_encoder.py               | 118 +---
 .../forecasting_encoder/components.py         |  68 +++
 .../flat_encoder/MLPEncoder.py                |  12 +-
 .../flat_encoder/NBEATSEncoder.py             |   4 +-
 .../seq_encoder/RNNEncoder.py                 |  13 +-
 13 files changed, 725 insertions(+), 522 deletions(-)
 create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py
 create 
mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 83e23ab47..85129aa5a 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -15,21 +15,13 @@ from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. \ base_target_scaler import BaseTargetScaler -from autoPyTorch.pipeline.components.setup.network_backbone.\ - forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( EncoderNetwork, - NetworkStructure, EncoderBlockInfo, - NetworkStructure, - EncoderProperties ) -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder.\ - RNNEncoder import _RNN - -from autoPyTorch.pipeline.components.setup.network_backbone.\ - forecasting_backbone.forecasting_decoder.base_forecasting_decoder import ( - DecoderBlockInfo, - DecoderProperties +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( + DecoderBlockInfo ) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import AddLayer @@ -39,14 +31,6 @@ ) -class EncoderOutputForm(Enum): - NoOutput = 0 - HiddenStates = 1 # RNN -> RNN - Sequence = 2 # RNN/TCN/Transformer -> MLP - SequenceLast = 3 # Transformer -> Transformer - - - class TransformedDistribution_(TransformedDistribution): """ We implement the mean function such that we do not need to enquire base mean every time @@ -164,61 +148,6 @@ def get_lagged_subsequences_inference( return lagged_seq -class StackedEncoder(nn.Module): - def __init__(self, - network_structure: NetworkStructure, - has_temporal_fusion: bool, - network_encoder: Dict[str, EncoderBlockInfo], - network_decoder: Dict[str, DecoderBlockInfo], - ): - self.num_blocks = network_structure.num_blocks - self.skip_connection = network_structure.skip_connection - self.has_temporal_fusion = has_temporal_fusion - - self.encoder_output_type = {} - self.encoder_has_hidden_states = {} - encoder = nn.ModuleDict() - for i in range(1, self.num_blocks + 1): - block_id = f'block_{i}' - encoder[block_id] = network_encoder[block_id].encoder - if self.skip_connection: - input_size = network_encoder[block_id].encoder_output_shape_[-1] - skip_size = network_encoder[block_id].encoder_input_shape[-1] - if network_structure.skip_connection_type == 'add': - encoder[f'skip_connection_{i}'] = AddLayer(input_size, skip_size) - elif network_structure.skip_connection_type == 'gate_add_norm': - encoder[f'skip_connection_{i}'] = GateAddNorm(input_size, - hidden_size=input_size, - skip_size=skip_size, - dropout=network_structure.grn_dropout_rate) - if block_id in network_decoder: - if network_decoder[block_id].decoder_properties.recurrent: - if network_decoder[block_id].decoder_properties.has_hidden_states: - # RNN - 
self.encoder_has_decoder[i] = EncoderOutputForm.HiddenStates - else: - # Transformer - self.encoder_has_decoder[i] = EncoderOutputForm.Sequence - else: - self.encoder_has_decoder[i] = EncoderOutputForm.SequenceLast - else: - self.encoder_has_decoder[i] = EncoderOutputForm.NoOutput - if network_decoder[block_id].decoder_properties.has_hidden_states: - self.encoder_has_hidden_states[i] = True - else: - self.encoder_has_hidden_states[i] = False - self.encoder = encoder - - def forward(self, encoder_input: torch.Tensor, additional_input: List[Optional[torch.Tensor]], output_seq: bool): - output_for_decoder = [] - for i in range(1, self.num_blocks + 1): - if self.encoder_has_hidden_states[i]: - x, hx = self.encoder[f'block_{i}'](encoder_input, ) - - - - - class AbstractForecastingNet(nn.Module): future_target_required = False @@ -307,103 +236,6 @@ def __init__(self, if self.decoder_lagged_input: self.cached_lag_mask_decoder = None - if network_structure.variable_selection: - # TODO rewrite forecasting dataset to allow mutli-variant models!!! - first_encoder = network_encoder['block_1'] - first_encoder_output_shape = network_encoder['block_1'].encoder_output_shape_[-1] - static_input_sizes = dataset_properties['static_features_shape'] - variable_selector = nn.ModuleDict() - if static_input_sizes > 0: - variable_selector['static_variable_selection'] = VariableSelectionNetwork( - input_sizes=static_input_sizes, - hidden_size=first_encoder_output_shape, - input_embedding_flags={}, - dropout=network_structure.grn_dropout_rate, - ) - if dataset_properties['uni_variant']: - # variable selection for encoder and decoder - encoder_input_sizes = { - 'past_targets': dataset_properties['input_shape'][-1], - 'past_features': 0 - } - decoder_input_sizes = { - 'future_features': 0 - } - if auto_regressive: - decoder_input_sizes.update({'future_prediction': dataset_properties['output_shape'][-1]}) - else: - # TODO - pass - - # create single variable grns that are shared across decoder and encoder - if network_structure.share_single_variable_networks: - variable_selector['shared_single_variable_grns'] = nn.ModuleDict() - for name, input_size in encoder_input_sizes.items(): - variable_selector['shared_single_variable_grns'][name] = GatedResidualNetwork( - input_size, - min(input_size, first_encoder_output_shape), - first_encoder_output_shape, - network_structure.grn_dropout_rate, - ) - for name, input_size in decoder_input_sizes.items(): - if name not in self.shared_single_variable_grns: - variable_selector['shared_single_variable_grns'][name] = GatedResidualNetwork( - input_size, - min(input_size, first_encoder_output_shape), - first_encoder_output_shape, - network_structure.grn_dropout_rate, - ) - - variable_selector['encoder_variable_selection'] = VariableSelectionNetwork( - input_sizes=encoder_input_sizes, - hidden_size=first_encoder_output_shape, - input_embedding_flags={}, - dropout=network_structure.grn_dropout_rate, - context_size=first_encoder_output_shape, - single_variable_grns={} - if not network_structure.share_single_variable_networks - else variable_selector['shared_single_variable_grns'], - ) - - variable_selector['encoder_variable_selection'] = VariableSelectionNetwork( - input_sizes=decoder_input_sizes, - hidden_size=self.hparams.hidden_size, - input_embedding_flags={}, - dropout=network_structure.grn_dropout_rate, - context_size=first_encoder_output_shape, - single_variable_grns={} - if not network_structure.share_single_variable_networks - else variable_selector['shared_single_variable_grns'], 
- ) - - variable_selector['static_context_variable_selection'] = GatedResidualNetwork( - input_size=first_encoder_output_shape, - hidden_size=first_encoder_output_shape, - output_size=first_encoder_output_shape, - dropout=network_structure.grn_dropout_rate, - ) - - if first_encoder.encoder_properties.has_hidden_states: - if isinstance(first_encoder.encoder, _RNN): - # for hidden state of the rnn - variable_selector['static_context_initial_hidden_lstm'] = GatedResidualNetwork( - input_size=first_encoder_output_shape, - hidden_size=first_encoder_output_shape, - output_size=first_encoder_output_shape, - dropout=network_structure.grn_dropout_rate, - ) - if first_encoder.encoder.cell_type == 'lstm': - # for cell state of the lstm - variable_selector['static_context_initial_cell_lstm'] = GatedResidualNetwork( - input_size=first_encoder_output_shape, - hidden_size=first_encoder_output_shape, - output_size=first_encoder_output_shape, - dropout=network_structure.grn_dropout_rate, - ) - else: - raise NotImplementedError - - @property def device(self): return self._device @@ -1008,4 +840,4 @@ def forward(self, return forecast def pred_from_net_output(self, net_output: torch.Tensor): - return net_output \ No newline at end of file + return net_output diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 7d894d137..0b4b8a86f 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -9,29 +9,16 @@ import torch from torch import nn -import warnings - -from torch.distributions import ( - AffineTransform, - TransformedDistribution, -) from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. 
\ base_target_scaler import BaseTargetScaler -from autoPyTorch.pipeline.components.setup.network_backbone.\ - forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( - EncoderNetwork, - NetworkStructure, +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( EncoderBlockInfo, - NetworkStructure, - EncoderProperties ) -from autoPyTorch.pipeline.components.setup.network_backbone.\ - forecasting_backbone.forecasting_decoder.base_forecasting_decoder import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( DecoderBlockInfo, - DecoderProperties ) from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary @@ -107,8 +94,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: forecast_strategy=self.forecast_strategy, num_samples=self.num_samples, aggregation=self.aggregation, ) - import pdb - pdb.set_trace() if X['decoder_properties']['recurrent']: # decoder is RNN or Transformer diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py new file mode 100644 index 000000000..e82224bee --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -0,0 +1,508 @@ +from pytorch_forecasting.utils import create_mask + +from typing import Any, Dict, Optional, List, Tuple + +import torch +from torch import nn + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( + EncoderBlockInfo, EncoderOutputForm +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( + DecoderBlockInfo +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( + NetworkStructure, + AddLayer +) + +from pytorch_forecasting.models.temporal_fusion_transformer.sub_modules import ( + GateAddNorm, GatedResidualNetwork, VariableSelectionNetwork, InterpretableMultiHeadAttention +) + + +class TemporalFusionLayer(nn.Module): + """ + (Lim et al. 
+ Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting, + https://arxiv.org/abs/1912.09363) + we follow the implementation from pytorch forecasting: + https://github.com/jdb78/pytorch-forecasting/blob/master/pytorch_forecasting/models/temporal_fusion_transformer/__init__.py + """ + + def __init__(self, + window_size: int, + n_prediction_steps: int, + network_structure: NetworkStructure, + network_encoder: Dict[str, EncoderBlockInfo], + n_decoder_output_features: int, + d_model: int, + n_head: int, + dropout: Optional[float] = None): + super().__init__() + num_blocks = network_structure.num_blocks + last_block = f'block_{num_blocks}' + n_encoder_output = network_encoder[last_block].encoder_output_shape[-1] + self.window_size = window_size + self.n_prediction_steps = n_prediction_steps + self.timestep = window_size + n_prediction_steps + + if n_decoder_output_features != n_encoder_output: + self.decoder_proj_layer = nn.Linear(n_decoder_output_features, n_encoder_output, bias=False) + else: + self.decoder_proj_layer = None + if network_structure.variable_selection: + if network_structure.skip_connection: + # static feature selector needs to generate the same number of features as the output of the encoder + n_encoder_output_first = network_encoder['block_1'].encoder_output_shape[-1] + self.static_context_enrichment = GatedResidualNetwork( + n_encoder_output_first, n_encoder_output_first, n_encoder_output_first, dropout + ) + self.enrichment = GatedResidualNetwork( + input_size=n_encoder_output, + hidden_size=n_encoder_output, + output_size=d_model, + dropout=dropout, + context_size=n_encoder_output_first, + residual=True, + ) + self.enrich_with_static = True + if not hasattr(self, 'enrichment'): + self.enrichment = GatedResidualNetwork( + input_size=n_encoder_output, + hidden_size=n_encoder_output, + output_size=d_model, + dropout=self.dropout_rate if self.use_dropout else None, + residual=True, + ) + self.enrich_with_static = False + + self.attention_fusion = InterpretableMultiHeadAttention( + d_model=d_model, + n_head=n_head, + dropout=dropout + ) + self.post_attn_gate_norm = GateAddNorm(d_model, dropout=dropout, trainable_add=False) + self.pos_wise_ff = GatedResidualNetwork(input_size=d_model, hidden_size=d_model, + output_size=d_model, dropout=self.hparams.dropout) + + self.network_structure = network_structure + if network_structure.skip_connection: + if network_structure.skip_connection_type == 'add': + self.residual_connection = AddLayer(d_model, n_encoder_output) + elif network_structure.skip_connection_type == 'gate_add_norm': + self.residual_connection = GateAddNorm(d_model, skip_size=n_encoder_output, + dropout=None, trainable_add=False) + + def forward(self, encoder_output: torch.Tensor, decoder_output: torch.Tensor, encoder_lengths: torch.LongTensor, + static_embedding: Optional[torch.Tensor] = None): + """ + Args: + encoder_output: the output of the last layer of encoder network + decoder_output: the output of the last layer of decoder network + encoder_lengths: length of encoder network + static_embedding: output of static variable selection network (if applible) + """ + if self.decoder_proj_layer is not None: + decoder_output = self.decoder_proj_layer(decoder_output) + network_output = torch.cat([encoder_output, decoder_output], dim=1) + + if self.enrich_with_static: + static_context_enrichment = self.static_context_enrichment(static_embedding) + attn_input = self.enrichment( + network_output, static_context_enrichment[:, None].expand(-1, 
self.timesteps, -1) + ) + else: + attn_input = self.enrichment(network_output) + + # Attention + attn_output, attn_output_weights = self.attention_fusion( + q=attn_input[:, self.window_size:], # query only for predictions + k=attn_input, + v=attn_input, + mask=self.get_attention_mask( + encoder_lengths=encoder_lengths, decoder_length=self.n_prediction_steps + ), + ) + # skip connection over attention + attn_output = self.post_attn_gate_norm(attn_output, attn_input[:, self.window_size:]) + output = self.pos_wise_ff(attn_output) + + if self.network_structure.skip_connection: + return self.residual_connection(output, decoder_output) + else: + return output + + def get_attention_mask(self, encoder_lengths: torch.LongTensor, decoder_length: int): + """ + https://github.com/jdb78/pytorch-forecasting/blob/master/pytorch_forecasting/models/ + temporal_fusion_transformer/__init__.py + """ + # indices to which is attended + attend_step = torch.arange(decoder_length, device=self.device) + # indices for which is predicted + predict_step = torch.arange(0, decoder_length, device=self.device)[:, None] + # do not attend to steps to self or after prediction + # todo: there is potential value in attending to future forecasts if they are made with knowledge currently + # available + # one possibility is here to use a second attention layer for future attention (assuming different effects + # matter in the future than the past) + # or alternatively using the same layer but allowing forward attention - i.e. only masking out non-available + # data and self + decoder_mask = attend_step >= predict_step + # do not attend to steps where data is padded + encoder_mask = create_mask(encoder_lengths.max(), encoder_lengths) + # combine masks along attended time - first encoder and then decoder + mask = torch.cat( + ( + encoder_mask.unsqueeze(1).expand(-1, decoder_length, -1), + decoder_mask.unsqueeze(0).expand(encoder_lengths.size(0), -1, -1), + ), + dim=2, + ) + return mask + + +class VariableSelector(nn.Module): + def __init__(self, + network_structure: NetworkStructure, + dataset_properties: Dict, + network_encoder: Dict[str, EncoderBlockInfo], + auto_regressive: bool = False + ): + super().__init__() + first_encoder_output_shape = network_encoder['block_1'].encoder_output_shape[-1] + static_input_sizes = dataset_properties['static_features_shape'] + self.hidden_size = first_encoder_output_shape + variable_selector = nn.ModuleDict() + self.static_variable_selection = VariableSelectionNetwork( + input_sizes=static_input_sizes, + hidden_size=self.hidden_size, + input_embedding_flags={}, + dropout=network_structure.grn_dropout_rate, + ) + self.static_input_sizes = static_input_sizes + if dataset_properties['uni_variant']: + # variable selection for encoder and decoder + encoder_input_sizes = { + 'past_targets': dataset_properties['input_shape'][-1], + 'past_features': 0 + } + decoder_input_sizes = { + 'future_features': 0 + } + if auto_regressive: + decoder_input_sizes.update({'future_prediction': dataset_properties['output_shape'][-1]}) + else: + # TODO + pass + + self.auto_regressive = auto_regressive + + # create single variable grns that are shared across decoder and encoder + if network_structure.share_single_variable_networks: + self.shared_single_variable_grns = nn.ModuleDict() + for name, input_size in encoder_input_sizes.items(): + self.shared_single_variable_grns[name] = GatedResidualNetwork( + input_size, + min(input_size, self.hidden_size), + self.hidden_size, + network_structure.grn_dropout_rate, + ) + 
for name, input_size in decoder_input_sizes.items(): + if name not in self.shared_single_variable_grns: + self.shared_single_variable_grns[name] = GatedResidualNetwork( + input_size, + min(input_size, self.hidden_size), + self.hidden_size, + network_structure.grn_dropout_rate, + ) + + self.encoder_variable_selection = VariableSelectionNetwork( + input_sizes=encoder_input_sizes, + hidden_size=self.hidden_size, + input_embedding_flags={}, + dropout=network_structure.grn_dropout_rate, + context_size=self.hidden_size, + single_variable_grns={} + if not network_structure.share_single_variable_networks + else variable_selector['shared_single_variable_grns'], + ) + + self.decoder_variable_selection = VariableSelectionNetwork( + input_sizes=decoder_input_sizes, + hidden_size=self.hidden_size, + input_embedding_flags={}, + dropout=network_structure.grn_dropout_rate, + context_size=self.hidden_size, + single_variable_grns={} + if not network_structure.share_single_variable_networks + else variable_selector['shared_single_variable_grns'], + ) + + self.static_context_variable_selection = GatedResidualNetwork( + input_size=self.hidden_size, + hidden_size=self.hidden_size, + output_size=self.hidden_size, + dropout=network_structure.grn_dropout_rate, + ) + + n_hidden_states = 0 + if network_encoder['block_1'].encoder_properties.has_hidden_states: + n_hidden_states = network_encoder['block_1'].n_hidden_states + + static_context_initial_hidden = [GatedResidualNetwork(input_size=self.hidden_size, + hidden_size=self.hidden_size, + output_size=self.hidden_size, + dropout=network_structure.grn_dropout_rate, + ) for _ in range(n_hidden_states)] + + self.static_context_initial_hidden = nn.ModuleList(static_context_initial_hidden) + self.cached_static_contex = None + self.cached_static_embedding = None + + def forward(self, + x_past: Optional[Dict[torch.Tensor]], + x_future: Optional[Dict[torch.Tensor]], + x_static: Optional[Dict[torch.Tensor]] = None, + length_past: int = 0, + length_future: int = 0, + batch_size: int = 0, + cache_static_contex: bool = False, + use_cached_static_contex: bool = False, + ): + if x_past is None and x_future is None: + raise ValueError('Either past input or future inputs need to be given!') + if length_past == 0 and length_future == 0: + raise ValueError("Either length_past or length_future must be given!") + timesteps = length_past + length_future + if not use_cached_static_contex: + if self.static_input_sizes > 0: + static_embedding, _ = self.static_variable_selection(x_static) + else: + static_embedding = torch.zeros( + (batch_size, self.hidden_size), dtype=self.dtype, device=self.device + ) + static_variable_selection = torch.zeros((batch_size, 0), dtype=self.dtype, device=self.device) + + static_context_variable_selection = self.static_context_variable_selection(static_embedding)[:, None] + static_context_initial_hidden = (init_hidden(static_embedding) for init_hidden in + self.static_context_initial_hidden) + if cache_static_contex: + self.cached_static_contex = static_context_variable_selection + self.cached_static_embedding = static_embedding + else: + static_context_variable_selection = self.cached_static_contex + static_context_variable_selection = static_context_variable_selection[:, None].expand(-1, timesteps, -1) + if x_past is not None: + embeddings_varying_encoder, _ = self.encoder_variable_selection( + x_past, + static_context_variable_selection[:, :length_past], + ) + else: + embeddings_varying_encoder = None + if x_future is not None: + embeddings_varying_decoder, 
_ = self.decoder_variable_selection( + x_future, + static_context_variable_selection[:, length_past:], + ) + else: + embeddings_varying_decoder = None + return embeddings_varying_encoder, embeddings_varying_decoder, static_embedding, static_context_initial_hidden + + +class StackedEncoder(nn.Module): + def __init__(self, + network_structure: NetworkStructure, + has_temporal_fusion: bool, + encoder_info: Dict[str, EncoderBlockInfo], + decoder_info: Dict[str, DecoderBlockInfo], + ): + self.num_blocks = network_structure.num_blocks + self.skip_connection = network_structure.skip_connection + self.has_temporal_fusion = has_temporal_fusion + + self.encoder_output_type = [EncoderOutputForm.NoOutput] * self.num_blocks + self.encoder_has_hidden_states = [False] * self.num_blocks + self.cached_intermediate_state = {} + self.encoder_num_hidden_states = [] + encoder = nn.ModuleDict() + for i, block_idx in enumerate(range(1, self.num_blocks + 1)): + block_id = f'block_{block_idx}' + encoder[block_id] = encoder_info[block_id].encoder + if self.skip_connection: + input_size = encoder_info[block_id].encoder_output_shape[-1] + skip_size = encoder_info[block_id].encoder_input_shape[-1] + if network_structure.skip_connection_type == 'add': + encoder[f'skip_connection_{block_idx}'] = AddLayer(input_size, skip_size) + elif network_structure.skip_connection_type == 'gate_add_norm': + encoder[f'skip_connection_{block_idx}'] = GateAddNorm(input_size, + hidden_size=input_size, + skip_size=skip_size, + dropout=network_structure.grn_dropout_rate) + if block_id in decoder_info: + if decoder_info[block_id].decoder_properties.recurrent: + if decoder_info[block_id].decoder_properties.has_hidden_states: + # RNN + self.encoder_output_type[i] = EncoderOutputForm.HiddenStates + else: + # Transformer + self.encoder_output_type[i] = EncoderOutputForm.Sequence + else: + self.encoder_output_type[i] = EncoderOutputForm.SequenceLast + if encoder_info[block_id].encoder_properties.has_hidden_states: + self.encoder_has_hidden_states[i] = True + self.encoder_num_hidden_states[i] = encoder_info[block_id].n_hidden_states + else: + self.encoder_has_hidden_states[i] = False + self.encoder = encoder + + def forward(self, + encoder_input: torch.Tensor, + additional_input: List[Optional[torch.Tensor]], + output_seq: bool = False, + cache_intermediate_state: bool = False, + incremental_update: bool = False) -> Tuple[List[torch.Tensor], Optional[torch.Tensor]]: + """ + A forward pass through the encoder + Args: + encoder_input (torch.Tensor): encoder input + additional_input (List[Optional[torch.Tensor]]) additional input to the encoder, e.g., inital hidden states + output_seq (bool) if a sequence output is generated + incremental_update (bool) if an incremental update is applied, this is normally applied for auto-regressive + model, however, ony deepAR requires encoder to do incremental update, thus the decoder only need to + receive the last output of the encoder + """ + encoder2decoder = [] + x = encoder_input + for i, block_id in enumerate(range(1, self.num_blocks + 1)): + output_seq_i = (output_seq or self.has_temporal_fusion or block_id < self.num_blocks) + encoder_i = self.encoder[f'block_{block_id}'] # type: EncoderNetwork + if self.encoder_has_hidden_states[i]: + if incremental_update: + hx = self.cached_intermediate_state[i] + fx, hx = encoder_i(x, output_seq=False, hx=hx) + else: + rnn_num_layers = encoder_i.config['num_layers'] + hx = additional_input[i] + if rnn_num_layers == 1 or hx is None: + fx, hx = encoder_i(x, 
output_seq=output_seq_i, hx=hx) + else: + if self.encoder_num_hidden_states[i] == 1: + fx, hx = encoder_i(x, output_seq=output_seq_i, hx=hx.expand((rnn_num_layers, -1, -1))) + else: + hx = (hx_i.expand(rnn_num_layers, -1, -1) for hx_i in hx) + fx, hx = encoder_i(x, output_seq=output_seq_i, hx=hx) + else: + if incremental_update: + x_all = torch.cat([self.cached_intermediate_state[i], x], dim=1) + fx = encoder_i(x_all, output_seq=False) + else: + fx = encoder_i(x, output_seq=output_seq_i) + if self.skip_connection: + fx = self.encoder[f'skip_connection_{block_id}'](fx, x) + + if self.encoder_output_type == EncoderOutputForm.HiddenStates: + encoder2decoder.append(hx) + elif self.encoder_output_type[i] == EncoderOutputForm.Sequence: + encoder2decoder.append(fx) + elif self.encoder_output_type[i] == EncoderOutputForm.SequenceLast: + if output_seq or incremental_update: + encoder2decoder.append(fx) + else: + encoder2decoder.append(encoder_i.get_last_seq_value(fx)) + if cache_intermediate_state: + if self.encoder_has_hidden_states[i]: + self.cached_intermediate_state[i] = hx + else: + if incremental_update: + self.cached_intermediate_state[i] = x_all + else: + self.cached_intermediate_state[i] = x + # otherwise the decoder does not exist for this layer + x = fx + if self.has_temporal_fusion: + if incremental_update: + self.cached_intermediate_state[i + 1] = torch.cat([self.cached_intermediate_state[i+1], x], dim=1) + else: + self.cached_intermediate_state[i + 1] = x + return encoder2decoder, None + else: + return encoder2decoder, x + + +class StackedDecoder(nn.Module): + def __init__(self, + network_structure: NetworkStructure, + encoder: nn.ModuleDict, + encoder_info: Dict[str, EncoderBlockInfo], + decoder_info: Dict[str, DecoderBlockInfo], + ): + self.num_blocks = network_structure.num_blocks + self.first_block = None + self.skip_connection = network_structure.skip_connection + + self.decoder_has_hidden_states = [] + decoder = nn.ModuleDict() + for i in range(1, self.num_blocks + 1): + block_id = f'block_{i}' + if block_id in decoder_info: + self.first_block = i if self.first_block is None else self.first_block + decoder[block_id] = decoder_info[block_id].decoder + if decoder_info[block_id].decoder_properties.has_hidden_states: + self.decoder_has_hidden_states.append(True) + else: + self.decoder_has_hidden_states.append(False) + if self.skip_connection: + input_size_encoder = encoder_info[block_id].encoder_output_shape[-1] + skip_size_encoder = encoder_info[block_id].encoder_input_shape[-1] + + input_size_decoder = decoder_info[block_id].decoder_output_shape[-1] + skip_size_decoder = decoder_info[block_id].decoder_input_shape[-1] + if input_size_encoder == input_size_decoder and skip_size_encoder == skip_size_decoder: + decoder[f'skip_connection_{i}'] = encoder[f'skip_connection_{i}'] + else: + if network_structure.skip_connection_type == 'add': + decoder[f'skip_connection_{i}'] = AddLayer(input_size_decoder, skip_size_decoder) + elif network_structure.skip_connection_type == 'gate_add_norm': + decoder[f'skip_connection_{i}'] = GateAddNorm(input_size_decoder, + hidden_size=input_size_decoder, + skip_size=skip_size_decoder, + dropout=network_structure.grn_dropout_rate) + self.cached_intermediate_state = {} + + self.decoder = decoder + + def forward(self, + x_future: torch.Tensor, + encoder_output: List[torch.Tensor], + cache_intermediate_state: bool = False, + incremental_update: bool = False + ) -> torch.Tensor: + x = x_future + for i, block_id in enumerate(range(self.first_block, 
self.num_blocks + 1)): + decoder_i = self.decoder[f'block_{block_id}'] # type: DecoderNetwork + if self.decoder_has_hidden_states[i]: + if incremental_update: + hx = self.cached_intermediate_state[i] + fx, hx = decoder_i(x_future=x, encoder_output=hx) + else: + fx, hx = decoder_i(x_future=x, encoder_output=encoder_output[i]) + else: + if incremental_update: + x_all = torch.cat([self.cached_intermediate_state[i], x], dim=1) + fx = decoder_i(x_all, encoder_output=encoder_output[i]) + else: + fx = decoder_i(x, encoder_output=encoder_output[i]) + if self.skip_connection: + fx = self.decoder[f'skip_connection_{block_id}'](fx, x) + if cache_intermediate_state: + if self.encoder_has_hidden_states[i]: + self.cached_intermediate_state[i] = hx + else: + if incremental_update: + self.cached_intermediate_state[i] = x_all + else: + self.cached_intermediate_state[i] = x + x = fx + return x \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py index 7e06edb6b..f4e663e45 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py @@ -1,23 +1,51 @@ -from typing import Dict, Any, Optional +import math +from sklearn.base import BaseEstimator + +from typing import Any, Dict, NamedTuple + import torch from torch import nn -from functools import partial -import torch.nn.functional as F -import math -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone. \ - forecasting_encoder.base_forecasting_encoder import ( - NetworkStructure, - EncoderBlockInfo -) -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone. 
\ - forecasting_decoder.base_forecasting_decoder import DecoderBlockInfo -from pytorch_forecasting.models.temporal_fusion_transformer.sub_modules import ( - TimeDistributed, TimeDistributedInterpolation, GatedLinearUnit, ResampleNorm, AddNorm, GateAddNorm, - GatedResidualNetwork, VariableSelectionNetwork, InterpretableMultiHeadAttention, -) -from pytorch_forecasting.utils import create_mask +class NetworkStructure(NamedTuple): + num_blocks: int = 1 + variable_selection: bool = False + share_single_variable_networks: bool = False + skip_connection: bool = False + skip_connection_type: str = "add" # could be 'add' or 'gate_add_norm' + grn_dropout_rate: float = 0.0 + + +class ForecastingNetworkStructure(BaseEstimator): + def __init__(self, + num_blocks: int = 1, + variable_selection: bool = False, + share_single_variable_networks: bool = False, + skip_connection: bool = False, + skip_connection_type: str = "add", + grn_dropout_rate: float = 0.0, + ) -> None: + super().__init__() + self.network_structure = NetworkStructure(num_blocks=num_blocks, + variable_selection=variable_selection, + share_single_variable_networks=share_single_variable_networks, + skip_connection=skip_connection, + skip_connection_type=skip_connection_type, + grn_dropout_rate=grn_dropout_rate) + + def fit(self, X: Dict[str, Any], y: Any = None) -> "ForecastingNetworkStructure": + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({ + 'network_structure': self.network_structure, + }) + return X + + def __str__(self) -> str: + """ Allow a nice understanding of what components where used """ + string = self.__class__.__name__ + return string class AddLayer(nn.Module): @@ -34,148 +62,6 @@ def forward(self, input: torch.Tensor, skip: torch.Tensor): return self.norm(input) -class TemporalFusionLayer(nn.Module): - """ - (Lim et al. 
- Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting, - https://arxiv.org/abs/1912.09363) - we follow the implementation from pytorch forecasting: - https://github.com/jdb78/pytorch-forecasting/blob/master/pytorch_forecasting/models/temporal_fusion_transformer/__init__.py - """ - - def __init__(self, - window_size: int, - n_prediction_steps: int, - network_structure: NetworkStructure, - network_encoder: Dict[str, EncoderBlockInfo], - n_decoder_output_features: int, - d_model: int, - n_head: int, - dropout: Optional[float] = None): - super().__init__() - num_blocks = network_structure.num_blocks - last_block = f'block_{num_blocks}' - n_encoder_output = network_encoder[last_block].encoder_output_shape_[-1] - self.window_size = window_size - self.n_prediction_steps = n_prediction_steps - self.timestep = window_size + n_prediction_steps - - if n_decoder_output_features != n_encoder_output: - self.decoder_proj_layer = nn.Linear(n_decoder_output_features, n_encoder_output, bias=False) - else: - self.decoder_proj_layer = None - if network_structure.variable_selection: - if network_structure.skip_connection: - # static feature selector needs to generate the same number of features as the output of the encoder - n_encoder_output_first = network_encoder['block_1'].encoder_output_shape_[-1] - self.static_context_enrichment = GatedResidualNetwork( - n_encoder_output_first, n_encoder_output_first, n_encoder_output_first, dropout - ) - self.enrichment = GatedResidualNetwork( - input_size=n_encoder_output, - hidden_size=n_encoder_output, - output_size=d_model, - dropout=dropout, - context_size=n_encoder_output_first, - residual=True, - ) - self.enrich_with_static = True - if not hasattr(self, 'enrichment'): - self.enrichment = GatedResidualNetwork( - input_size=n_encoder_output, - hidden_size=n_encoder_output, - output_size=d_model, - dropout=self.dropout_rate if self.use_dropout else None, - residual=True, - ) - self.enrich_with_static = False - - self.attention_fusion = InterpretableMultiHeadAttention( - d_model=d_model, - n_head=n_head, - dropout=dropout - ) - self.post_attn_gate_norm = GateAddNorm(d_model, dropout=dropout, trainable_add=False) - self.pos_wise_ff = GatedResidualNetwork(input_size=d_model, hidden_size=d_model, - output_size=d_model, dropout=self.hparams.dropout) - - self.network_structure = network_structure - if network_structure.skip_connection: - if network_structure.skip_connection_type == 'add': - self.residual_connection = AddLayer(d_model, n_encoder_output) - elif network_structure.skip_connection_type == 'gate_add_norm': - self.residual_connection = GateAddNorm(d_model, skip_size=n_encoder_output, - dropout=None, trainable_add=False) - - def forward(self, encoder_output: torch.Tensor, decoder_output: torch.Tensor, encoder_lengths: torch.LongTensor, - static_embedding: Optional[torch.Tensor] = None): - """ - Args: - encoder_output: the output of the last layer of encoder network - decoder_output: the output of the last layer of decoder network - encoder_lengths: length of encoder network - static_embedding: output of static variable selection network (if applible) - """ - if self.decoder_proj_layer is not None: - decoder_output = self.decoder_proj_layer(decoder_output) - network_output = torch.cat([encoder_output, decoder_output], dim=1) - - if self.enrich_with_static: - static_context_enrichment = self.static_context_enrichment(static_embedding) - attn_input = self.enrichment( - network_output, static_context_enrichment[:, None].expand(-1, 
self.timesteps, -1) - ) - else: - attn_input = self.enrichment(network_output) - - # Attention - attn_output, attn_output_weights = self.attention_fusion( - q=attn_input[:, self.window_size:], # query only for predictions - k=attn_input, - v=attn_input, - mask=self.get_attention_mask( - encoder_lengths=encoder_lengths, decoder_length=self.n_prediction_steps - ), - ) - # skip connection over attention - attn_output = self.post_attn_gate_norm(attn_output, attn_input[:, self.window_size:]) - output = self.pos_wise_ff(attn_output) - - if self.network_structure.skip_connection: - return self.residual_connection(output, decoder_output) - else: - return output - - def get_attention_mask(self, encoder_lengths: torch.LongTensor, decoder_length: int): - """ - https://github.com/jdb78/pytorch-forecasting/blob/master/pytorch_forecasting/models/ - temporal_fusion_transformer/__init__.py - """ - # indices to which is attended - attend_step = torch.arange(decoder_length, device=self.device) - # indices for which is predicted - predict_step = torch.arange(0, decoder_length, device=self.device)[:, None] - # do not attend to steps to self or after prediction - # todo: there is potential value in attending to future forecasts if they are made with knowledge currently - # available - # one possibility is here to use a second attention layer for future attention (assuming different effects - # matter in the future than the past) - # or alternatively using the same layer but allowing forward attention - i.e. only masking out non-available - # data and self - decoder_mask = attend_step >= predict_step - # do not attend to steps where data is padded - encoder_mask = create_mask(encoder_lengths.max(), encoder_lengths) - # combine masks along attended time - first encoder and then decoder - mask = torch.cat( - ( - encoder_mask.unsqueeze(1).expand(-1, decoder_length, -1), - decoder_mask.unsqueeze(0).expand(encoder_lengths.size(0), -1, -1), - ), - dim=2, - ) - return mask - - def build_transformer_layers(d_model: int, config: Dict[str, Any], layer_type='encoder'): nhead = 2 ** config['n_head_log'] dim_feedforward = 2 ** config['d_feed_forward_log'] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index 7229312e9..b378a6af3 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -28,7 +28,7 @@ def __init__(self, self.auto_regressive = auto_regressive def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor): - if x_future is not None or self.auto_regressive: + if x_future is None or self.auto_regressive: # for auto-regressive model, x_future is fed to the encoders x = self.global_layers(encoder_output) if self.local_layers is None: @@ -75,6 +75,7 @@ def _build_decoder(self, local_layers=nn.Sequential(*local_layers) if local_layers is not None else None, auto_regressive=self.auto_regressive), num_decoder_output_features + @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None ) -> Dict[str, Union[str, bool]]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 5784a2833..ddaa358ea 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -5,45 +5,12 @@ import torch from torch import nn -from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.utils.common import FitRequirement from autoPyTorch.pipeline.components.base_component import BaseEstimator, autoPyTorchComponent -from autoPyTorch.pipeline.components.setup.network_backbone.\ - forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( - EncoderBlockInfo, - NetworkStructure) - - -class DecoderProperties(NamedTuple): - has_hidden_states: bool = False - has_local_layer: bool = True - recurrent: bool = False - lagged_input: bool = False - multi_blocks: bool = False - mask_on_future_target: bool = False - - -class DecoderBlockInfo(NamedTuple): - decoder: nn.Module - decoder_properties: DecoderProperties - decoder_output_shape: Tuple[int, ...] - decoder_input_shape: Tuple[int, ...] - - -class DecoderNetwork(nn.Module): - def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor): - """ - Base forecasting Decoder Network, its output needs to be a 3-d Tensor: - - - Args: - x_future: torch.Tensor(B, L_future, N_out), the future features - encoder_output: torch.Tensor(B, L_encoder, N), output of the encoder network, or the hidden states - Returns: - net_output: torch.Tensor with shape either (B, L_future, N) - - """ - raise NotImplementedError +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( + DecoderBlockInfo, DecoderProperties +) class BaseForecastingDecoder(autoPyTorchComponent): @@ -100,7 +67,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: output_shape = X['dataset_properties']['output_shape'] static_features_shape = X["dataset_properties"]["static_features_shape"] - encoder_output_shape = X['network_encoder'][f'block_{self.block_number}'].encoder_output_shape_ + encoder_output_shape = X['network_encoder'][f'block_{self.block_number}'].encoder_output_shape auto_regressive = self.auto_regressive @@ -111,22 +78,24 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: network_structure = X['network_structure'] variable_selection = network_structure.variable_selection - future_feature_shapes = X['dataset_properties']['future_feature_shapes'] - - if self.block_number == network_structure.num_blocks: - self.is_last_decoder = True - - future_in_features = future_feature_shapes[-1] + static_features_shape - if variable_selection: - # TODO - pass + if 'n_decoder_output_features' not in X: + future_feature_shapes = X['dataset_properties']['future_feature_shapes'] + + if self.block_number == network_structure.num_blocks: + self.is_last_decoder = True + + future_in_features = future_feature_shapes[-1] + static_features_shape + if variable_selection: + future_in_features = X['network_encoder']['block_1'].encoder_output_shape[-1] + else: + if auto_regressive: + if self.decoder_properties()["lagged_input"] and hasattr(self, 'lagged_value'): + future_in_features += 
len(self.lagged_value) * output_shape[-1] + else: + future_in_features += output_shape[-1] + future_variable_input = (self.n_prediction_heads, future_in_features) else: - if auto_regressive: - if self.decoder_properties()["lagged_input"] and hasattr(self, 'lagged_value'): - future_in_features += len(self.lagged_value) * output_shape[-1] - else: - future_in_features += output_shape[-1] - future_variable_input = (self.n_prediction_heads, future_in_features) + future_variable_input = (self.n_prediction_heads, X['n_decoder_output_features']) # TODO consider decoder auto regressive and fill in decoder part @@ -138,9 +107,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: ) self.decoder_input_shape = encoder_output_shape - X['n_decoder_output_features'] = self.n_decoder_output_features return self + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ Adds the network head into the fit dictionary 'X' and returns it. @@ -166,7 +135,9 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: 'n_decoder_output_features': self.n_decoder_output_features, 'auto_regressive': self.auto_regressive}) else: - X.update({f'network_decoder': network_decoder}) + X.update({f'network_decoder': network_decoder, + f'n_decoder_output_features': self.n_decoder_output_features, + }) return X diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py new file mode 100644 index 000000000..08f5a4505 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py @@ -0,0 +1,36 @@ +from typing import Any, Dict, Optional, Union, Tuple, List, NamedTuple + +import torch +from torch import nn + + +class DecoderProperties(NamedTuple): + has_hidden_states: bool = False + has_local_layer: bool = True + recurrent: bool = False + lagged_input: bool = False + multi_blocks: bool = False + mask_on_future_target: bool = False + + +class DecoderBlockInfo(NamedTuple): + decoder: nn.Module + decoder_properties: DecoderProperties + decoder_output_shape: Tuple[int, ...] + decoder_input_shape: Tuple[int, ...] 
+ + +class DecoderNetwork(nn.Module): + def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor): + """ + Base forecasting Decoder Network, its output needs to be a 3-d Tensor: + + + Args: + x_future: torch.Tensor(B, L_future, N_out), the future features + encoder_output: torch.Tensor(B, L_encoder, N), output of the encoder network, or the hidden states + Returns: + net_output: torch.Tensor with shape either (B, L_future, N) + + """ + raise NotImplementedError diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index 0c526d9e9..74671807f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -17,8 +17,9 @@ ) from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( - BaseForecastingEncoder, +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.\ + forecasting_encoder.base_forecasting_encoder import BaseForecastingEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( ForecastingNetworkStructure ) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import \ diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index 29bb21f7c..0a51bc8dd 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -15,116 +15,12 @@ from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape -from autoPyTorch.pipeline.components.base_component import ( - autoPyTorchComponent, +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( + EncoderProperties, EncoderBlockInfo, EncoderNetwork ) -class EncoderProperties(NamedTuple): - has_hidden_states: bool = False - bijective_seq_output: bool = True - fixed_input_seq_length: bool = False - lagged_input: bool = False - causality: bool = True # this value indicates if the output of the model only depends on the past targets - - -class NetworkStructure(NamedTuple): - num_blocks: int = 1 - variable_selection: bool = False - share_single_variable_networks: bool = False - skip_connection: bool = False - skip_connection_type: str = "add" # could be 'add' or 'gate_add_norm' - grn_dropout_rate: float = 0.0 - - -class EncoderBlockInfo(NamedTuple): - encoder: nn.Module - encoder_properties: EncoderProperties - 
encoder_input_shape: Tuple[int, ...] - encoder_output_shape_: Tuple[int, ...] - - -class ForecastingNetworkStructure(autoPyTorchComponent): - def __init__(self, random_state: Optional[np.random.RandomState] = None, - num_blocks: int = 1, - variable_selection: bool = False, - share_single_variable_networks: bool = False, - skip_connection: bool = False, - skip_connection_type: str = "add", - grn_dropout_rate: float = 0.0, - ) -> None: - super().__init__() - self.network_structure = NetworkStructure(num_blocks=num_blocks, - variable_selection=variable_selection, - share_single_variable_networks=share_single_variable_networks, - skip_connection=skip_connection, - skip_connection_type=skip_connection_type, - grn_dropout_rate=grn_dropout_rate) - - def fit(self, X: Dict[str, Any], y: Any = None) -> "ForecastingNetworkStructure": - self.check_requirements(X, y) - return self - - def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: - X.update({ - 'network_structure': self.network_structure, - }) - return X - - @staticmethod - def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - **kwargs: Any - ) -> ConfigurationSpace: - return ConfigurationSpace() - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'EarlyPreprocessing', - 'name': 'Early Preprocessing Node', - } - - def __str__(self) -> str: - """ Allow a nice understanding of what components where used """ - string = self.__class__.__name__ - return string - - -class EncoderNetwork(nn.Module): - def forward(self, x: torch.Tensor, output_seq: bool = False): - """ - Base forecasting network, its output needs to be a 2-d or 3-d Tensor: - When the decoder is an auto-regressive model, then it needs to output a 3-d Tensor, in which case, output_seq - needs to be set as True - When the decoder is a seq2seq model, the network needs to output a 2-d Tensor (B, N), in which case, - output_seq needs to be set as False - - Args: - x: torch.Tensor(B, L_in, N) - output_seq (bool), if the network outputs a sequence tensor. If it is set True, - output will be a 3-d Tensor (B, L_out, N). L_out = L_in if encoder_properties['recurrent'] is True. - If this value is set as False, the network only returns the last item of the sequence. - Returns: - net_output: torch.Tensor with shape either (B, N) or (B, L_out, N) - - """ - raise NotImplementedError - - def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: - """ - get the last value of the sequential output - Args: - x: torch.Tensor(B, L, N): a sequential value output by the network, usually this value needs to be fed - to the decoder - Returns: - output: torch.Tensor(B, M): last element of the sequential value - - """ - raise NotImplementedError - - class BaseForecastingEncoder(autoPyTorchComponent): """ Base class for network backbones. Holds the backbone module and the config which was used to create it. 
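The EncoderNetwork contract removed above (and re-introduced unchanged in forecasting_encoder/components.py further down) is the central interface in this refactoring: forward must return the full sequence when output_seq is True and only its last element otherwise. A minimal sketch of a subclass honouring that contract, using a toy GRU and made-up sizes rather than any class from this patch:

import torch
from torch import nn

class ToyRNNEncoder(nn.Module):  # stands in for EncoderNetwork, purely illustrative
    def __init__(self, in_features: int, hidden_size: int):
        super().__init__()
        self.rnn = nn.GRU(in_features, hidden_size, batch_first=True)

    def forward(self, x: torch.Tensor, output_seq: bool = False) -> torch.Tensor:
        seq, _ = self.rnn(x)  # (B, L_in, hidden_size)
        return seq if output_seq else self.get_last_seq_value(seq)

    def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor:
        return x[:, -1]  # (B, hidden_size)

x = torch.randn(8, 20, 5)  # (B, L_in, N)
assert ToyRNNEncoder(5, 16)(x, output_seq=True).shape == (8, 20, 16)
assert ToyRNNEncoder(5, 16)(x, output_seq=False).shape == (8, 16)

As the docstring states, the 3-d path (output_seq=True) serves auto-regressive decoders, while the 2-d path via get_last_seq_value serves decoders that only consume the encoder's final representation.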
@@ -140,7 +36,7 @@ def __init__(self, ) self.encoder: nn.Module = None self.config = kwargs - self.input_shape: Optional[Iterable] = None + self.input_shape: Optional[Tuple[int, ...]] = None self.block_number = block_number self.encoder_output_shape: Optional[Tuple[int, ...]] = None @@ -211,13 +107,17 @@ def allowed_decoders(): def n_encoder_output_feature(self) -> int: raise NotImplementedError + def n_hidden_states(self) -> int: + return 0 + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X['dataset_properties'].update({'input_shape': self.input_shape}) network_encoder = X.get('network_encoder', OrderedDict()) network_encoder[f'block_{self.block_number}'] = EncoderBlockInfo(encoder=self.encoder, encoder_properties=self.encoder_properties(), encoder_input_shape=self.input_shape, - encoder_output_shape_=self.encoder_output_shape) + encoder_output_shape=self.encoder_output_shape, + n_hidden_states=self.n_hidden_states()) X.update({f'network_encoder': network_encoder}) return X diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py new file mode 100644 index 000000000..428422dbb --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py @@ -0,0 +1,68 @@ +from enum import Enum + +import torch +from torch import nn +from typing import Dict, Optional, Tuple, List, NamedTuple + + +class EncoderProperties(NamedTuple): + has_hidden_states: bool = False + bijective_seq_output: bool = True + fixed_input_seq_length: bool = False + lagged_input: bool = False + causality: bool = True # this value indicates if the output of the model only depends on the past targets + + +class EncoderBlockInfo(NamedTuple): + encoder: nn.Module + encoder_properties: EncoderProperties + encoder_input_shape: Tuple[int, ...] + encoder_output_shape: Tuple[int, ...] + n_hidden_states: int + + +class EncoderNetwork(nn.Module): + def forward(self, + x: torch.Tensor, + output_seq: bool = False): + """ + Base forecasting network, its output needs to be a 2-d or 3-d Tensor: + When the decoder is an auto-regressive model, then it needs to output a 3-d Tensor, in which case, output_seq + needs to be set as True + When the decoder is a seq2seq model, the network needs to output a 2-d Tensor (B, N), in which case, + output_seq needs to be set as False + + Args: + x: torch.Tensor(B, L_in, N) + output_seq (bool): if the network outputs a sequence tensor. If it is set True, + output will be a 3-d Tensor (B, L_out, N). L_out = L_in if encoder_properties['recurrent'] is True. + If this value is set as False, the network only returns the last item of the sequence. 
+ hx (Optional[torch.Tensor]): addational input to the network, this could be a hidden states or a sequence + from previous inputs + + Returns: + net_output: torch.Tensor with shape either (B, N) or (B, L_out, N) + + """ + raise NotImplementedError + + def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: + """ + get the last value of the sequential output + Args: + x: torch.Tensor(B, L, N): a sequential value output by the network, usually this value needs to be fed + to the decoder + Returns: + output: torch.Tensor(B, M): last element of the sequential value + + """ + raise NotImplementedError + + +class EncoderOutputForm(Enum): + NoOutput = 0 + HiddenStates = 1 # RNN -> RNN + Sequence = 2 # Transformer -> Transformer + SequenceLast = 3 #RNN/TCN/Transformer -> MLP + + diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py index f9b64c861..413c36203 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py @@ -16,9 +16,10 @@ from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter -class TimeSeriesMLPrecpocessor(EncoderNetwork): +class TimeSeriesMLP(EncoderNetwork): def __init__(self, window_size: int, + network: Optional[nn.Module] = None ): """ Transform the input features (B, T, N) to fit the requirement of MLP @@ -30,6 +31,7 @@ def __init__(self, """ super().__init__() self.window_size = window_size + self.network = network def forward(self, x: torch.Tensor, output_seq: bool = False): """ @@ -54,6 +56,9 @@ def forward(self, x: torch.Tensor, output_seq: bool = False): # we need to ensure that the input size fits the network shape x = x[:, -self.window_size:] # x.shape = (B, self.window, N) x = x.flatten(-2) + return x if self.network is not None else self.network(x) + + def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: return x @@ -85,8 +90,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: in_features = input_shape[-1] - feature_preprocessor = TimeSeriesMLPrecpocessor(window_size=self.window_size) - return nn.Sequential(feature_preprocessor, *self._build_backbone(in_features * self.window_size)) + network = nn.Sequential(*self._build_backbone(in_features * self.window_size)) + return TimeSeriesMLP(window_size=self.window_size, + network=network) def n_encoder_output_feature(self) -> int: # This function should never be called!! 
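The renamed TimeSeriesMLP above only windows and flattens the input before (optionally) running the wrapped MLP backbone. A self-contained sketch of that transformation with made-up sizes, independent of the component classes in this patch:

import torch
from torch import nn

batch, seq_len, n_features, window_size = 8, 10, 4, 3
x = torch.randn(batch, seq_len, n_features)   # (B, T, N), with T >= window_size

x_win = x[:, -window_size:]                   # keep only the last window_size steps
x_flat = x_win.flatten(-2)                    # (B, window_size * N)

# stands in for the backbone built by MLPEncoder.build_encoder and attached as `network`
mlp = nn.Sequential(nn.Linear(window_size * n_features, 32), nn.ReLU())
out = mlp(x_flat)
assert out.shape == (batch, 32)

When no backbone is attached (as in NBEATSEncoder below, which uses the class purely as a preprocessor), the flattened (B, window_size * N) tensor itself is presumably meant to be the encoder output.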
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py index e61bc40ff..3e70a6f29 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py @@ -11,7 +11,7 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.utils.common import FitRequirement from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder.MLPEncoder import \ - TimeSeriesMLPrecpocessor + TimeSeriesMLP class NBEATSEncoder(BaseForecastingEncoder): @@ -49,7 +49,7 @@ def n_encoder_output_feature(self): def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: - preprocessor = TimeSeriesMLPrecpocessor(window_size=self.window_size) + preprocessor = TimeSeriesMLP(window_size=self.window_size) return preprocessor @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py index 8b93b4371..6e3ebf48f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py @@ -12,8 +12,11 @@ from torch import nn from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( - BaseForecastingEncoder, EncoderNetwork, EncoderProperties +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + base_forecasting_encoder import BaseForecastingEncoder + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( + EncoderNetwork, EncoderProperties ) from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter @@ -92,6 +95,12 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: def n_encoder_output_feature(self) -> int: return 2 * self.config['hidden_size'] if self.config['bidirectional'] else self.config['hidden_size'] + def n_hidden_states(self) -> int: + if self.config['cell_type'] == 'lstm': + return 2 + elif self.config['cell_type'] == 'gru': + return 1 + @staticmethod def allowed_decoders(): """ From f5cede796df122057769935e15ff8dbbbc01d700 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 4 Mar 2022 13:43:39 +0100 Subject: [PATCH 171/347] forecasting backbones --- .../setup/network/forecasting_architecture.py | 404 ++++++++++++------ .../forecasting_backbone/cells.py | 11 +- .../forecasting_backbone/components_util.py | 3 + .../forecasting_decoder/TransformerDecoder.py | 15 +- .../base_forecasting_decoder.py | 1 - .../seq_encoder/__init__.py | 17 +- .../forecasting_head.py | 2 +- 7 files changed, 300 insertions(+), 153 deletions(-) diff --git 
a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 85129aa5a..25eee819d 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -20,6 +20,12 @@ EncoderNetwork, EncoderBlockInfo, ) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import ( + VariableSelector, + StackedEncoder, + StackedDecoder, + TemporalFusionLayer +) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( DecoderBlockInfo ) @@ -192,24 +198,34 @@ def __init__(self, super(ForecastingNet, self).__init__() self.network_structure = network_structure self.embedding = network_embedding - self.head = network_head - - encoders = OrderedDict() - decoders = OrderedDict() + if network_structure.variable_selection: + self.variable_selector = VariableSelector(network_structure=network_structure, + dataset_properties=dataset_properties, + network_encoder=network_encoder, + auto_regressive=auto_regressive) + has_temporal_fusion = "temporal_fusion" in network_head + self.encoder = StackedEncoder(network_structure=network_structure, + has_temporal_fusion=has_temporal_fusion, + encoder_info=network_encoder, + decoder_info=network_decoder) + self.decoder = StackedDecoder(network_structure=network_structure, + encoder=self.encoder.encoder, + encoder_info=network_encoder, + decoder_info=network_decoder) + if has_temporal_fusion: + self.temporal_fusion = network_head['temporal_fusion'] # type: TemporalFusionLayer + self.has_temporal_fusion = has_temporal_fusion + self.head = network_head['head'] first_decoder = 0 for i in range(1, network_structure.num_blocks + 1): block_number = f'block_{i}' - encoders[block_number] = network_encoder[block_number].encoder - if block_number in decoders: + if block_number in network_decoder: if first_decoder == 0: first_decoder = block_number - decoders[block_number] = network_decoder[block_number].decoder if first_decoder == 0: raise ValueError("At least one decoder must be specified!") - self.encoder = nn.ModuleDict(encoders) - self.decoder = nn.ModuleDict(decoders) self.target_scaler = target_scaler @@ -286,9 +302,8 @@ def forward(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - encoder_length: Optional[torch.Tensor] = None, + encoder_lengths: Optional[torch.Tensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, - hidden_states: Optional[Tuple[torch.Tensor]] = None, ): raise NotImplementedError @@ -305,18 +320,31 @@ def predict(self, ): raise NotImplementedError + def repeat_intermediate_values(self, + intermediate_values: List[Optional[Union[torch.Tensor, Tuple[torch.Tensor]]]], + is_hidden_states: List[bool], + repeats: int) -> List[Optional[Union[torch.Tensor, Tuple[torch.Tensor]]]]: + for i, (is_hx, inter_value) in enumerate(is_hidden_states, intermediate_values): + if isinstance(inter_value, torch.Tensor): + repeated_value = inter_value.repeat_interleave(repeats=repeats, dim=1 if is_hx else 0) + intermediate_values[i] = repeated_value + elif isinstance(inter_value, Tuple): + dim = 1 if is_hx else 0 + repeated_value = (hx.repeat_interleave(repeats=repeats, dim=dim) for hx in inter_value) + intermediate_values[i] = repeated_value + return intermediate_values + class 
ForecastingNet(AbstractForecastingNet): - def forward(self, - past_targets: torch.Tensor, - future_targets: Optional[torch.Tensor] = None, - past_features: Optional[torch.Tensor] = None, - future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None, - encoder_length: Optional[torch.Tensor] = None, - decoder_observed_values: Optional[torch.Tensor] = None, - hidden_states: Optional[Tuple[torch.Tensor]] = None, - ): + def pre_processing(self, + past_targets: torch.Tensor, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None, + length_past: int = 0, + length_future: int = 0, + variable_selector_kwargs: Dict = {}, + ): if self.encoder_lagged_input: past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler(past_targets[:, -self.window_size:]) past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) @@ -330,18 +358,68 @@ def forward(self, past_targets, _, loc, scale = self.target_scaler(past_targets) x_past = past_targets - if past_features is not None: - x_past = torch.cat([past_features, x_past], dim=1) + if self.network_structure.variable_selection: + batch_size = x_past.shape[0] + if length_past > 0: + if past_features is None: + x_past = {'past_targets': x_past.to(device=self.device), + 'past_features': torch.zeros((batch_size, length_past, 0), + dtype=self.dtype, device=self.device)} + else: + x_past = None + if length_future > 0: + if future_features is None: + x_future = {'future_features': torch.zeros((batch_size, length_future, 0), + dtype=self.dtype, device=self.device)} + else: + x_future = None + x_past, x_future, x_static, static_context_initial_hidden = self.variable_selector( + x_past=x_past, + x_future=x_future, + x_static=static_features, + batch_size=batch_size, + length_past=length_past, + length_future=length_future, + **variable_selector_kwargs + ) + return x_past, x_future, x_static, loc, scale, static_context_initial_hidden + else: + if past_features is not None: + x_past = torch.cat([past_features, x_past], dim=1) - x_past = x_past.to(device=self.device) - x_past = self.embedding(x_past) + x_past = x_past.to(device=self.device) + x_past = self.embedding(x_past) + return x_past, future_features, static_features, loc, scale, None - if self.encoder_has_hidden_states: - x_past, _ = self.encoder(x_past) - else: - x_past = self.encoder(x_past) - x_past = self.decoder(x_past) - output = self.head(x_past) + def forward(self, + past_targets: torch.Tensor, + future_targets: Optional[torch.Tensor] = None, + past_features: Optional[torch.Tensor] = None, + future_features: Optional[torch.Tensor] = None, + static_features: Optional[torch.Tensor] = None, + encoder_lengths: Optional[torch.LongTensor] = None, + decoder_observed_values: Optional[torch.Tensor] = None, + ): + x_past, x_future, x_static, loc, scale, static_context_initial_hidden = self.pre_processing( + past_targets=past_targets, + past_features=past_features, + future_features=future_features, + static_features=static_features, + length_past=self.window_size, + length_future=self.n_prediction_steps + ) + encoder_additional = [static_context_initial_hidden] + encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) + encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) + decoder_output = self.decoder(x_future=x_future, encoder_output=encoder2decoder) + + if 
self.has_temporal_fusion: + decoder_output = self.temporal_fusion(encoder_output=encoder_output, + decoder_output=decoder_output, + encoder_lengths=encoder_lengths, + static_embedding=x_static + ) + output = self.head(decoder_output) return self.rescale_output(output, loc, scale, self.device) def pred_from_net_output(self, net_output): @@ -394,9 +472,24 @@ class ForecastingSeq2SeqNet(ForecastingNet): def __init__(self, **kwargs): super(ForecastingSeq2SeqNet, self).__init__(**kwargs) - self.mask_on_future_target = kwargs['decoder_properties']['mask_on_future_target'] - if self.mask_on_future_target: - self.tgt_mask = nn.Transformer.generate_square_subsequent_mask(self.n_prediction_steps) + + def decoder_select_variable(self, future_targets: torch.tensor, future_features: Optional[torch.Tensor]): + batch_size = future_targets.shape[0] + length_future = future_targets.shape[1] + if future_features is None: + x_future = { + 'future_prediction': future_targets.to(self.device), + 'future_features': torch.zeros((batch_size, length_future, 0), + dtype=self.dtype, device=self.device)} + _, x_future, _, _ = self.variable_selector(x_past=None, + x_future=x_future, + x_static=None, + length_past=0, + length_future=length_future, + batch_size=batch_size, + use_cached_static_contex=True + ) + return x_future def forward(self, past_targets: torch.Tensor, @@ -404,24 +497,18 @@ def forward(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - hidden_states: Optional[Tuple[torch.Tensor]] = None): - if self.encoder_lagged_input: - past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler(past_targets[:, -self.window_size:]) - past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) - x_past, self.cached_lag_mask_encoder = get_lagged_subsequences(past_targets, - self.window_size, - self.encoder.lagged_value, - self.cached_lag_mask_encoder) - else: - if self.window_size < past_targets.shape[1]: - past_targets = past_targets[:, -self.window_size:] - past_targets, _, loc, scale = self.target_scaler(past_targets) - x_past = past_targets - - x_past = x_past if past_features is None else torch.cat([past_features, x_past], dim=-1) - - x_past = x_past.to(self.device) - x_past = self.embedding(x_past) + encoder_lengths: Optional[torch.Tensor] = None, + decoder_observed_values: Optional[torch.Tensor] = None, ): + x_past, x_future, x_static, loc, scale, static_context_initial_hidden = self.pre_processing( + past_targets=past_targets, + past_features=past_features, + future_features=future_features, + static_features=static_features, + length_future=0, + variable_selector_kwargs={'cache_static_contex': True} + ) + encoder_additional = [static_context_initial_hidden] + encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) if self.training: # we do one step ahead forecasting @@ -434,27 +521,27 @@ def forward(self, else: future_targets = torch.cat([past_targets[:, [-1], :], future_targets[:, :-1, :]], dim=1) - x_future = future_targets if future_features is None else torch.cat([future_features, future_targets], - dim=-1) + if self.network_structure.variable_selection: + x_future = self.decoder_select_variable(future_targets, future_features) + else: + x_future = future_targets if future_features is None else torch.cat([future_features, future_targets], + dim=-1) x_future = x_future.to(self.device) - if self.encoder_has_hidden_states: - # RNN - _, 
features_latent = self.encoder(x_past, output_seq=True) - x_future, _ = self.decoder(x_future, features_latent) - elif self.mask_on_future_target: - features_latent = self.encoder(x_past, output_seq=True) - x_future = self.decoder(x_future, features_latent, tgt_mask=self.tgt_mask.to(self.device)) - else: - raise NotImplementedError - net_output = self.head(x_future) + encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) + decoder_output = self.decoder(x_future=x_future, encoder_output=encoder2decoder) + + if self.has_temporal_fusion: + decoder_output = self.temporal_fusion(encoder_output=encoder_output, + decoder_output=decoder_output, + encoder_lengths=encoder_lengths, + static_embedding=x_static + ) + net_output = self.head(decoder_output) return self.rescale_output(net_output, loc, scale, self.device) else: - if self.encoder_has_hidden_states: - _, features_latent = self.encoder(x_past, output_seq=True) - else: - features_latent = self.encoder(x_past, output_seq=True) + encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) if future_features is not None: future_features = future_features @@ -463,34 +550,41 @@ def forward(self, all_predictions = [] predicted_target = past_targets[:, [-1]] past_targets = past_targets[:, :-1] + if self.has_temporal_fusion: + decoder_output_all = None for idx_pred in range(self.n_prediction_steps): + predicted_target = predicted_target.cpu() if self.decoder_lagged_input: - x_future = torch.cat([past_targets, predicted_target.cpu()], dim=1) - if self.decoder_has_hidden_states: - x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder.lagged_value) - else: - x_future = get_lagged_subsequences_inference(x_future, idx_pred + 1, - self.decoder.lagged_value) + x_future = torch.cat([past_targets, predicted_target], dim=1) + x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder.lagged_value) else: - if self.decoder_has_hidden_states: - x_future = predicted_target[:, [-1]] - else: - x_future = predicted_target + x_future = predicted_target[:, [-1]] - if self.decoder_has_hidden_states: - x_future = x_future if future_features is None else torch.cat( - [future_features[:, [idx_pred], :], x_future], dim=-1) + if self.network_structure.variable_selection: + x_future = self.decoder_select_variable( + future_targets=predicted_target[:, -1:], + future_features=future_features[:, [idx_pred]] if future_features is not None else None + ) else: - x_future = x_future if future_features is None else torch.cat( - [future_features[:, idx_pred + 1, :], x_future], dim=-1) - - x_future = x_future.to(self.device) - if self.decoder_has_hidden_states: - x_future, features_latent = self.decoder(x_future, features_latent=features_latent) - else: - x_future = self.decoder(x_future, features_latent) - - net_output = self.head(x_future[:, -1:, ]) + x_future = x_future if future_features is None else torch.cat([future_features, future_targets], + dim=-1) + decoder_output = self.decoder(x_future, + encoder_output=encoder2decoder, + cache_intermediate_state=True, + incremental_update=idx_pred > 0) + + if self.has_temporal_fusion: + if decoder_output_all is not None: + decoder_output_all = torch.cat([decoder_output_all, decoder_output], dim=1) + else: + decoder_output_all = decoder_output + decoder_output = self.temporal_fusion(encoder_output=encoder_output, + decoder_output=decoder_output_all, + encoder_lengths=encoder_lengths, + static_embedding=x_static + 
)[:, -1:] + + net_output = self.head(decoder_output) predicted_target = torch.cat([predicted_target, self.pred_from_net_output(net_output).cpu()], dim=1) @@ -508,18 +602,11 @@ def forward(self, all_samples = [] batch_size = past_targets.shape[0] - if self.encoder_has_hidden_states: + encoder2decoder = self.repeat_intermediate_values( + encoder2decoder, + is_hidden_states=self.encoder.encoder_has_hidden_states, + repeats=self.num_samples) - if isinstance(features_latent, tuple): - repeated_state = [ - s.repeat_interleave(repeats=self.num_samples, dim=1) - for s in features_latent - ] - else: - repeated_state = features_latent.repeat_interleave(repeats=self.num_samples, dim=1) - else: - # Transformer's hidden states is of shape - repeated_state = features_latent.repeat_interleave(repeats=self.num_samples, dim=0) if self.decoder_lagged_input: max_lag_seq_length = max(self.decoder.lagged_value) + 1 else: @@ -540,32 +627,37 @@ def forward(self, for idx_pred in range(self.n_prediction_steps): if self.decoder_lagged_input: x_future = torch.cat([repeated_past_target, repeated_predicted_target.cpu()], dim=1) - if self.decoder_has_hidden_states: - x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder.lagged_value) - else: - x_future = get_lagged_subsequences_inference(x_future, idx_pred + 1, - self.decoder.lagged_value) + x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder.lagged_value) else: - if self.decoder_has_hidden_states: - x_future = repeated_predicted_target[:, [-1]] - else: - x_future = repeated_predicted_target + x_future = repeated_predicted_target[:, [-1]] - if self.decoder_has_hidden_states: - x_future = x_future if repeated_time_feat is None else torch.cat( - [repeated_time_feat[:, [idx_pred], :], x_future], dim=-1) + if self.network_structure.variable_selection: + x_future = self.decoder_select_variable(future_targets=x_future[:, -1:], + future_features=repeated_time_feat[:, [idx_pred]]) else: - # decoder uses the entire future targets x_future = x_future if repeated_time_feat is None else torch.cat( - [repeated_time_feat[:, :idx_pred + 1, :], x_future], dim=-1) + [repeated_time_feat[:, [idx_pred], :], x_future], dim=-1) - x_future = x_future.to(self.device) - if self.decoder_has_hidden_states: - x_future, repeated_state = self.decoder(x_future, features_latent=repeated_state) - else: - x_future = self.decoder(x_future, repeated_state) - net_output = self.head(x_future[:, -1:, ]) + x_future = x_future.to(self.device) + + decoder_output = self.decoder(x_future, + encoder_output=encoder2decoder, + cache_intermediate_state=True, + incremental_update=idx_pred > 0) + if self.has_temporal_fusion: + if decoder_output_all is not None: + decoder_output_all = torch.cat([decoder_output_all, decoder_output], dim=1) + else: + decoder_output_all = decoder_output + decoder_output = self.temporal_fusion(encoder_output=encoder_output, + decoder_output=decoder_output_all, + encoder_lengths=encoder_lengths, + static_embedding=x_static + )[:, -1:] + + net_output = self.head(decoder_output) samples = self.pred_from_net_output(net_output).cpu() + repeated_predicted_target = torch.cat([repeated_predicted_target, samples], dim=1) @@ -616,13 +708,32 @@ def train(self, mode: bool = True) -> nn.Module: self.only_generate_future_dist = False return super().train(mode=mode) + def decoder_select_variable(self, future_targets: torch.tensor, future_features: Optional[torch.Tensor]): + batch_size = future_targets.shape[0] + length_future = future_targets.shape[1] + if 
future_features is None: + x_future = { + 'future_prediction': future_targets.to(self.device), + 'future_features': torch.zeros((batch_size, length_future, 0), + dtype=self.dtype, device=self.device)} + _, x_future, _, _ = self.variable_selector(x_past=None, + x_future=x_future, + x_static=None, + length_past=0, + length_future=length_future, + batch_size=batch_size, + use_cached_static_contex=True + ) + return x_future + def forward(self, past_targets: torch.Tensor, future_targets: Optional[torch.Tensor] = None, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - hidden_states: Optional[Tuple[torch.Tensor]] = None): + encoder_lengths: Optional[torch.Tensor] = None, + decoder_observed_values: Optional[torch.Tensor] = None, ): if self.training: if self.encoder_lagged_input: past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( @@ -643,21 +754,40 @@ def forward(self, future_targets = self.scale_value(future_targets, loc, scale) targets_all = torch.cat([past_targets, future_targets[:, :-1]], dim=1) - x_input = targets_all - if past_features is not None: - features_all = torch.cat([past_features[:, 1:], future_features], dim=1) - x_input = torch.cat([features_all, x_input], dim=-1) - x_input = x_input.to(self.device) + if self.network_structure.variable_selection: + batch_size = past_targets.shape[0] + length_past = self.window_size + self.n_prediction_steps + if past_features is None: + if past_features is None: + x_past = {'past_targets': targets_all.to(device=self.device), + 'past_features': torch.zeros((batch_size, length_past, 0), + dtype=self.dtype, device=self.device)} + + x_input, _, _, static_context_initial_hidden = self.variable_selector.forward(x_past=x_past, + x_future=None, + x_static=static_features, + length_past=length_past, + length_future=0, + batch_size=batch_size, + ) + else: + x_input = targets_all + if past_features is not None: + features_all = torch.cat([past_features[:, 1:], future_features], dim=1) + x_input = torch.cat([features_all, targets_all], dim=-1) + x_input = x_input.to(self.device) - x_input = self.embedding(x_input) + x_input = self.embedding(x_input) + + encoder_additional = [static_context_initial_hidden] + encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) + + encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) - if self.encoder_has_hidden_states: - x_input, _ = self.encoder(x_input, output_seq=True) - else: - x_input = self.encoder(x_input, output_seq=True) if self.only_generate_future_dist: - x_input = x_input[:, -self.n_prediction_steps:] - net_output = self.head(self.decoder(x_input)) + encoder2decoder = encoder2decoder[:, -self.n_prediction_steps:] + net_output = self.head(self.decoder.forward(x_future=None, encoder_output=encoder2decoder)) + # DeepAR does not allow tf layers return self.rescale_output(net_output, loc, scale, self.device) else: if self.encoder_lagged_input: @@ -674,6 +804,7 @@ def forward(self, past_targets, _, loc, scale = self.target_scaler(past_targets) x_past = past_targets + if past_features is not None: # features is one step ahead of target if self.window_size > 1: @@ -812,7 +943,8 @@ def forward(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - hidden_states: Optional[Tuple[torch.Tensor]] = None): + encoder_lengths: Optional[torch.Tensor] 
= None, + decoder_observed_values: Optional[torch.Tensor] = None, ): if self.window_size < past_targets.shape[1]: past_targets = past_targets[:, -self.window_size:] past_targets, _, loc, scale = self.target_scaler(past_targets) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index e82224bee..36c428e6d 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -292,6 +292,8 @@ def forward(self, self.cached_static_contex = static_context_variable_selection self.cached_static_embedding = static_embedding else: + static_embedding = self.cached_static_embedding + static_context_initial_hidden = None static_context_variable_selection = self.cached_static_contex static_context_variable_selection = static_context_variable_selection[:, None].expand(-1, timesteps, -1) if x_past is not None: @@ -324,7 +326,9 @@ def __init__(self, self.encoder_output_type = [EncoderOutputForm.NoOutput] * self.num_blocks self.encoder_has_hidden_states = [False] * self.num_blocks - self.cached_intermediate_state = {} + len_cached_intermediate_states = self.num_blocks + 1 if self.has_temporal_fusion else self.num_blocks + self.cached_intermediate_state = [torch.empty(0) for _ in range(len_cached_intermediate_states)] + self.encoder_num_hidden_states = [] encoder = nn.ModuleDict() for i, block_idx in enumerate(range(1, self.num_blocks + 1)): @@ -469,8 +473,7 @@ def __init__(self, hidden_size=input_size_decoder, skip_size=skip_size_decoder, dropout=network_structure.grn_dropout_rate) - self.cached_intermediate_state = {} - + self.cached_intermediate_state = [torch.empty(0) for _ in range(self.num_blocks + 1 - self.first_block)] self.decoder = decoder def forward(self, @@ -491,7 +494,7 @@ def forward(self, else: if incremental_update: x_all = torch.cat([self.cached_intermediate_state[i], x], dim=1) - fx = decoder_i(x_all, encoder_output=encoder_output[i]) + fx = decoder_i(x_all, encoder_output=encoder_output[i])[:, -1:] else: fx = decoder_i(x, encoder_output=encoder_output[i]) if self.skip_connection: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py index f4e663e45..7ee9da187 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py @@ -11,6 +11,7 @@ class NetworkStructure(NamedTuple): num_blocks: int = 1 variable_selection: bool = False share_single_variable_networks: bool = False + use_temporal_fusion: bool = False, skip_connection: bool = False skip_connection_type: str = "add" # could be 'add' or 'gate_add_norm' grn_dropout_rate: float = 0.0 @@ -21,6 +22,7 @@ def __init__(self, num_blocks: int = 1, variable_selection: bool = False, share_single_variable_networks: bool = False, + use_temporal_fusion: bool = False, skip_connection: bool = False, skip_connection_type: str = "add", grn_dropout_rate: float = 0.0, @@ -29,6 +31,7 @@ def __init__(self, self.network_structure = NetworkStructure(num_blocks=num_blocks, variable_selection=variable_selection, share_single_variable_networks=share_single_variable_networks, + use_temporal_fusion=use_temporal_fusion, 
skip_connection=skip_connection, skip_connection_type=skip_connection_type, grn_dropout_rate=grn_dropout_rate) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index 71493d16e..2f2f93487 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -39,6 +39,7 @@ def __init__(self, use_layer_norm_output: bool, dropout_pd: float = 0.0, layer_norm_eps_output: Optional[float] = None, + n_prediction_steps:int = 1, lagged_value: Optional[Union[List, np.ndarray]] = None): super().__init__() self.lagged_value = lagged_value @@ -58,17 +59,11 @@ def __init__(self, self.transformer_decoder_layers = nn.TransformerDecoder(decoder_layer=transformer_decoder_layers, num_layers=num_layers, norm=norm) + self.tgt_mask = nn.Transformer.generate_square_subsequent_mask(self.n_prediction_steps) - def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor, - tgt_mask: Optional[torch.Tensor] = None, - memory_mask: Optional[torch.Tensor] = None, - tgt_key_padding_mask: Optional[torch.Tensor] = None, - memory_key_padding_mask: Optional[torch.Tensor] = None): + def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor): output = self.input_layer(x_future) - output = self.transformer_decoder_layers(output, encoder_output, tgt_mask=tgt_mask, - memory_mask=memory_mask, - tgt_key_padding_mask=tgt_key_padding_mask, - memory_key_padding_mask=memory_key_padding_mask) + output = self.transformer_decoder_layers(output, encoder_output, tgt_mask=self.tgt_mask.to(self.device)) return output @@ -86,6 +81,7 @@ def _build_decoder(self, dataset_properties: Dict) -> Tuple[nn.Module, int]: d_model = 2 ** self.transformer_encoder_kwargs['d_model_log'] transformer_decoder_layers = build_transformer_layers(d_model=d_model, config=self.config, layer_type='decoder') + n_prediction_steps = dataset_properties['n_prediction_steps'] decoder = _TransformerDecoder(in_features=future_variable_input[-1], d_model=d_model, @@ -95,6 +91,7 @@ def _build_decoder(self, use_layer_norm_output=self.config['use_layer_norm_output'], dropout_pd=self.config.get('dropout_positional_decoder', 0.0), layer_norm_eps_output=self.config.get('layer_norm_eps_output', None), + n_prediction_steps=n_prediction_steps, lagged_value=self.lagged_value) return decoder, d_model diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index ddaa358ea..1ef4922e1 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -109,7 +109,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: return self - def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ Adds the network head into the fit dictionary 'X' and returns it. 
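In the TransformerDecoder change above, _TransformerDecoder now builds its causal target mask once from n_prediction_steps instead of receiving it from the caller. A minimal, self-contained sketch (made-up sizes, plain torch modules rather than the classes from this patch) of how such a square subsequent mask restricts the decoder to attend only to earlier prediction steps:

import torch
from torch import nn

d_model, n_prediction_steps = 16, 4

# Upper-triangular -inf mask: step i may only attend to steps <= i.
# Equivalent to nn.Transformer.generate_square_subsequent_mask(n_prediction_steps).
tgt_mask = torch.triu(
    torch.full((n_prediction_steps, n_prediction_steps), float('-inf')), diagonal=1)

layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=4, batch_first=True)
decoder = nn.TransformerDecoder(layer, num_layers=1)

x_future = torch.randn(8, n_prediction_steps, d_model)  # future inputs, already projected to d_model
encoder_output = torch.randn(8, 10, d_model)            # memory coming from the encoder stack
out = decoder(x_future, encoder_output, tgt_mask=tgt_mask)
assert out.shape == (8, n_prediction_steps, d_model)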
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index 54904cfec..c6d813983 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -77,6 +77,10 @@ def get_hyperparameter_search_space( value_range=(True, False), default_value=False, ), + use_temporal_fusion: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='use_temporal_fusion', + value_range=(True, False), + default_value=False), decoder_auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="decoder_auto_regressive", value_range=(True, False), @@ -111,6 +115,11 @@ def get_hyperparameter_search_space( share_single_variable_networks( HyperparameterSearchSpace): if single variable networks are shared between encoder and decoder skip_connection: HyperparameterSearchSpace: if skip connection is applied + use_temporal_fusion (HyperparameterSearchSpace): if temporal fusion layer is applied + tf_attention_n_head_log (HyperparameterSearchSpace): log value of tf attention dims + tf_attention_d_model_log (HyperparameterSearchSpace): log value of tf attention d model + tf_use_dropout (HyperparameterSearchSpace): if tf uses dropout + tf_dropout_rate (HyperparameterSearchSpace): dropout rate of tf layer skip_connection_type (HyperparameterSearchSpace): skip connection type, it could be directly added or a grn network ( Lim et al, Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting: @@ -143,12 +152,15 @@ def get_hyperparameter_search_space( variable_selection = get_hyperparameter(variable_selection, CategoricalHyperparameter) share_single_variable_networks = get_hyperparameter(share_single_variable_networks, CategoricalHyperparameter) + + use_temporal_fusion = get_hyperparameter(use_temporal_fusion, CategoricalHyperparameter) decoder_auto_regressive = get_hyperparameter(decoder_auto_regressive, CategoricalHyperparameter) num_blocks = get_hyperparameter(num_blocks, UniformIntegerHyperparameter) skip_connection = get_hyperparameter(skip_connection, CategoricalHyperparameter) - hp_network_structures = [num_blocks, decoder_auto_regressive, variable_selection, skip_connection] + hp_network_structures = [num_blocks, decoder_auto_regressive, variable_selection, + skip_connection, use_temporal_fusion] cond_skip_connections = [] if True in skip_connection.choices: skip_connection_type = get_hyperparameter(skip_connection_type, CategoricalHyperparameter) @@ -418,6 +430,7 @@ def set_hyperparameters(self, num_blocks = params['num_blocks'] decoder_auto_regressive = params['decoder_auto_regressive'] forecasting_structure_kwargs = dict(num_blocks=num_blocks, + use_temporal_fusion=params['use_temporal_fusion'], variable_selection=params['variable_selection'], skip_connection=params['skip_connection']) if 'share_single_variable_networks' in params: @@ -425,11 +438,11 @@ def set_hyperparameters(self, del params['forecasting_structure_kwargs'] del params['num_blocks'] + del params['use_temporal_fusion'] del params['variable_selection'] del params['skip_connection'] del params['decoder_auto_regressive'] - if 'skip_connection_type' in params: 
forecasting_structure_kwargs['skip_connection_type'] = params['skip_connection_type'] del params['skip_connection_type'] diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index cc2b2fff2..2e0a1f4a3 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -125,7 +125,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: decoder_has_local_layer=decoder_has_local_layer, n_prediction_heads=n_prediction_heads, ) - self.head = nn.ModuleDict(head_components) + self.head = head_components return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: From 50c559e2007fdff296896822f3739845a9eac87a Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 4 Mar 2022 19:31:54 +0100 Subject: [PATCH 172/347] maint --- autoPyTorch/datasets/time_series_dataset.py | 2 +- .../setup/network/forecasting_architecture.py | 150 ++++++++++-------- .../setup/network/forecasting_network.py | 20 +-- .../forecasting_backbone/cells.py | 10 +- .../forecasting_decoder/MLPDecoder.py | 5 +- .../forecasting_decoder/NBEATSDecoder.py | 14 +- .../forecasting_decoder/RNNDecoder.py | 4 +- .../forecasting_decoder/TransformerDecoder.py | 11 +- .../forecasting_encoder/__init__.py | 2 +- .../base_forecasting_encoder.py | 2 +- .../seq_encoder/__init__.py | 4 +- .../forecasting_head.py | 9 +- 12 files changed, 133 insertions(+), 100 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index f53e17622..cfe98b692 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -149,7 +149,7 @@ def __getitem__(self, index: int, train: bool = True) \ "static_features": self.static_features, "mase_coefficient": self.mase_coefficient, 'encoder_length': past_target.shape[0], - 'decoder_length': targets_future.shape[0]}, targets_future + 'decoder_length': None if targets_future is None else targets_future.shape[0] }, targets_future def __len__(self) -> int: return self.Y.shape[0] if self.only_has_past_targets else self.Y.shape[0] - self.n_prediction_steps diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 25eee819d..96fa9b16c 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -195,7 +195,7 @@ def __init__(self, indicates the number of the points to sample when doing prediction aggregation (str): how the samples are aggregated. We could take their mean or median values. 
""" - super(ForecastingNet, self).__init__() + super().__init__() self.network_structure = network_structure self.embedding = network_embedding if network_structure.variable_selection: @@ -242,7 +242,7 @@ def __init__(self, if not network_structure.variable_selection: self.encoder_lagged_input = network_encoder['block_1'].encoder_properties.lagged_input - self.decoder_lagged_input = network_decoder[f'block_{first_decoder}'].decoder_properties.lagged_input + self.decoder_lagged_input = network_decoder[first_decoder].decoder_properties.lagged_input else: self.encoder_lagged_input = False self.decoder_lagged_input = False @@ -763,7 +763,7 @@ def forward(self, 'past_features': torch.zeros((batch_size, length_past, 0), dtype=self.dtype, device=self.device)} - x_input, _, _, static_context_initial_hidden = self.variable_selector.forward(x_past=x_past, + x_input, _, _, static_context_initial_hidden = self.variable_selector(x_past=x_past, x_future=None, x_static=static_features, length_past=length_past, @@ -778,15 +778,16 @@ def forward(self, x_input = x_input.to(self.device) x_input = self.embedding(x_input) + static_context_initial_hidden = None encoder_additional = [static_context_initial_hidden] encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) - encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) + encoder2decoder, encoder_output = self.encoder(encoder_input=x_input, additional_input=encoder_additional) if self.only_generate_future_dist: encoder2decoder = encoder2decoder[:, -self.n_prediction_steps:] - net_output = self.head(self.decoder.forward(x_future=None, encoder_output=encoder2decoder)) + net_output = self.head(self.decoder(x_future=None, encoder_output=encoder2decoder)) # DeepAR does not allow tf layers return self.rescale_output(net_output, loc, scale, self.device) else: @@ -805,39 +806,61 @@ def forward(self, past_targets, _, loc, scale = self.target_scaler(past_targets) x_past = past_targets - if past_features is not None: - # features is one step ahead of target - if self.window_size > 1: - features_all = torch.cat([past_features[:, -self.window_size + 1:, ], - future_features], - dim=1) - else: - features_all = future_features + if self.network_structure.variable_selection: + batch_size = past_targets.shape[0] + length_past = self.window_size + if past_features is None: + if past_features is None: + x_past = {'past_targets': past_targets.to(device=self.device), + 'past_features': torch.zeros((batch_size, length_past, 0), + dtype=self.dtype, device=self.device)} + + x_past, _, _, static_context_initial_hidden = self.variable_selector(x_past=x_past, + x_future=None, + x_static=static_features, + length_past=length_past, + length_future=0, + batch_size=batch_size, + cache_static_contex=True + ) else: - features_all = None - x_past = x_past if features_all is None else torch.cat([features_all[:, :self.window_size], x_past], - dim=-1) + if past_features is not None: + # features is one step ahead of target + if self.window_size > 1: + features_all = torch.cat([past_features[:, -self.window_size + 1:, ], + future_features], + dim=1) + else: + features_all = future_features + else: + features_all = None + x_past = x_past if features_all is None else torch.cat([features_all[:, :self.window_size], x_past], + dim=-1) - x_past = x_past.to(self.device) - # TODO consider static features - x_past = self.embedding(x_past) + x_past = x_past.to(self.device) + # TODO consider static features + x_past = 
self.embedding(x_past) + static_context_initial_hidden = None all_samples = [] batch_size = past_targets.shape[0] - if self.encoder_has_hidden_states: - # For RNN, we only feed the hidden state and generated future input to the netwrok - encoder_output, hidden_states = self.encoder(x_past) - if isinstance(hidden_states, tuple): - repeated_state = [ - s.repeat_interleave(repeats=self.num_samples, dim=1) - for s in hidden_states - ] - else: - repeated_state = hidden_states.repeat_interleave(repeats=self.num_samples, dim=1) - else: - # For other models, the full past targets are passed to the network. - encoder_output = self.encoder(x_past) + encoder_additional = [static_context_initial_hidden] + encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) + + encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, + additional_input=encoder_additional, + cache_intermediate_state=True, + ) + + self.repeat_intermediate_values(self.encoder.cached_intermediate_state, + is_hidden_states=self.encoder.encoder_has_hidden_states, + repeats=self.num_samples) + if self.network_structure.variable_selection: + self.repeat_intermediate_values([self.variable_selector.cached_static_contex], + is_hidden_states=[False], + repeats=self.num_samples) + if self.encoder_lagged_input: max_lag_seq_length = max(max(self.encoder.lagged_value), self.window_size) else: @@ -865,7 +888,7 @@ def forward(self, repeats=self.num_samples, dim=0 ) if future_features is not None else None - net_output = self.head(self.decoder(encoder_output)) + net_output = self.head(self.decoder(x_future=None, encoder_output=encoder2decoder)) next_sample = net_output.sample(sample_shape=(self.num_samples,)) @@ -876,37 +899,36 @@ def forward(self, all_samples.append(next_sample) for k in range(1, self.n_prediction_steps): - if self.encoder_has_hidden_states: - if self.encoder_lagged_input: - x_next = torch.cat([repeated_past_target, *all_samples], dim=1) - x_next = get_lagged_subsequences_inference(x_next, 1, self.encoder.lagged_value) - else: - x_next = next_sample - x_next = x_next if repeated_time_feat is None else torch.cat([repeated_time_feat[:, k - 1:k], - x_next], dim=-1) - x_next = x_next.to(self.device) - encoder_output, repeated_state = self.encoder(x_next, hx=repeated_state) + if self.encoder_lagged_input: + x_next = torch.cat([repeated_past_target, *all_samples], dim=1) + x_next = get_lagged_subsequences_inference(x_next, 1, self.encoder.lagged_value) else: - if self.encoder_lagged_input: - x_next = torch.cat([repeated_past_target, *all_samples], dim=1) - x_next = get_lagged_subsequences_inference(x_next, - self.window_size + k, - self.encoder.lagged_value) - else: - x_next = torch.cat([repeated_past_target[:, -self.window_size:], *all_samples], dim=1) - if repeated_time_feat is None: - x_next = x_next - else: - x_next = torch.cat([repeated_time_feat[:, :self.window_size + k], - x_next], dim=-1) - x_next = x_next.to(self.device) - encoder_output = self.encoder(x_next) - - # During training, the encoder output a sequence. 
Thus for prediction, the network should have the same - # output format - encoder_output = torch.unsqueeze(encoder_output, 1) + x_next = next_sample - net_output = self.head(self.decoder(encoder_output)) + if self.network_structure.variable_selection: + batch_size = past_targets.shape[0] + if past_features is None: + if past_features is None: + x_next = {'past_targets': x_next.to(device=self.device), + 'past_features': torch.zeros((batch_size, 1, 0), + dtype=self.dtype, device=self.device)} + + x_next, _, _, _ = self.variable_selector(x_past=x_next, + x_future=None, + x_static=static_features, + length_past=1, + length_future=0, + batch_size=batch_size, + cache_static_contex=False, + use_cached_static_contex=True, + ) + encoder_output, _ = self.encoder(encoder_input=x_next, + additional_input=[None] * self.network_structure.num_blocks, + output_seq=False, cache_intermediate_state=True, + incremental_update=True) + + + net_output = self.head(self.decoder(x_future=None, encoder_output=encoder2decoder)) next_sample = net_output.sample().cpu() all_samples.append(next_sample) @@ -955,8 +977,8 @@ def forward(self, forcast_shape = [batch_size, self.n_prediction_steps, *output_shape] forecast = torch.zeros(forcast_shape).to(self.device).flatten(1) - backcast = self.encoder(past_targets) - for block in self.decoder: + backcast, _ = self.encoder(past_targets) + for block in self.decoder.decoder['block_1']: backcast_block, forecast_block = block(backcast) backcast = backcast - backcast_block diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 0b4b8a86f..5fe6b6288 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -95,16 +95,18 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: num_samples=self.num_samples, aggregation=self.aggregation, ) - if X['decoder_properties']['recurrent']: - # decoder is RNN or Transformer - self.network = ForecastingSeq2SeqNet(**network_init_kwargs) - elif X['decoder_properties']['multi_blocks']: - self.network = NBEATSNet(**network_init_kwargs) - elif X['auto_regressive']: - # decoder is MLP and auto_regressive, we have deep AR model - self.network = ForecastingDeepARNet(**network_init_kwargs) + if X['auto_regressive']: + first_decoder = next(iter(network_decoder.items()))[1] + if first_decoder.decoder_properties.recurrent: + self.network = ForecastingSeq2SeqNet(**network_init_kwargs) + else: + self.network = ForecastingDeepARNet(**network_init_kwargs) else: - self.network = ForecastingNet(**network_init_kwargs) + first_decoder = next(iter(network_decoder.items()))[1] + if first_decoder.decoder_properties.multi_blocks: + self.network = NBEATSNet(**network_init_kwargs) + else: + self.network = ForecastingNet(**network_init_kwargs) # Properly set the network training device if self.device is None: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 36c428e6d..ca185d799 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -262,9 +262,9 @@ def __init__(self, self.cached_static_embedding = None def forward(self, - x_past: Optional[Dict[torch.Tensor]], - x_future: 
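The N-BEATS forward pass patched above follows the doubly-residual scheme: each block explains part of the remaining backcast and adds its contribution to the forecast. A compact sketch of that loop, with a made-up linear block standing in for NBEATSBLock:

import torch
import torch.nn as nn

class ToyNBEATSBlock(nn.Module):
    # stand-in for NBEATSBLock: one hidden layer plus backcast/forecast heads
    def __init__(self, backcast_length: int, forecast_length: int, width: int = 32):
        super().__init__()
        self.backbone = nn.Sequential(nn.Linear(backcast_length, width), nn.ReLU())
        self.backcast_head = nn.Linear(width, backcast_length)
        self.forecast_head = nn.Linear(width, forecast_length)

    def forward(self, x: torch.Tensor):
        h = self.backbone(x)
        return self.backcast_head(h), self.forecast_head(h)

window_size, n_prediction_steps = 12, 4
blocks = nn.ModuleList(ToyNBEATSBlock(window_size, n_prediction_steps) for _ in range(3))

past_targets = torch.randn(8, window_size)           # flattened backcast window, (B, W)
backcast = past_targets
forecast = torch.zeros(8, n_prediction_steps)

for block in blocks:
    backcast_block, forecast_block = block(backcast)
    backcast = backcast - backcast_block              # each block removes what it explained
    forecast = forecast + forecast_block              # and adds its share of the forecast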
Optional[Dict[torch.Tensor]], - x_static: Optional[Dict[torch.Tensor]] = None, + x_past: Optional[Dict[str,torch.Tensor]], + x_future: Optional[Dict[str, torch.Tensor]], + x_static: Optional[Dict[str, torch.Tensor]] = None, length_past: int = 0, length_future: int = 0, batch_size: int = 0, @@ -320,6 +320,7 @@ def __init__(self, encoder_info: Dict[str, EncoderBlockInfo], decoder_info: Dict[str, DecoderBlockInfo], ): + super().__init__() self.num_blocks = network_structure.num_blocks self.skip_connection = network_structure.skip_connection self.has_temporal_fusion = has_temporal_fusion @@ -442,6 +443,7 @@ def __init__(self, encoder_info: Dict[str, EncoderBlockInfo], decoder_info: Dict[str, DecoderBlockInfo], ): + super().__init__() self.num_blocks = network_structure.num_blocks self.first_block = None self.skip_connection = network_structure.skip_connection @@ -477,7 +479,7 @@ def __init__(self, self.decoder = decoder def forward(self, - x_future: torch.Tensor, + x_future: Optional[torch.Tensor], encoder_output: List[torch.Tensor], cache_intermediate_state: bool = False, incremental_update: bool = False diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index b378a6af3..97298f2a9 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -13,7 +13,10 @@ from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter from autoPyTorch.pipeline.components.setup.network_backbone. 
\ - forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder, DecoderNetwork + forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( + DecoderNetwork +) class MLPDecoderModule(DecoderNetwork): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 8ed84c0fe..913b6a832 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -1,5 +1,5 @@ from typing import List - +import torch from ConfigSpace import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter, \ UniformFloatHyperparameter @@ -15,9 +15,11 @@ from autoPyTorch.pipeline.components.setup.network_backbone.\ forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder, DecoderProperties +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( + DecoderNetwork +) - -class NBEATSBLock(nn.Module): +class NBEATSBLock(DecoderNetwork): def __init__(self, n_in_features: int, stack_idx: int, @@ -72,12 +74,12 @@ def _add_layer(self, layers: List[nn.Module], in_features: int) -> None: if self.use_dropout: layers.append(nn.Dropout(self.dropout_rate)) - def forward(self, x): + def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor): if self.backcast_head is None and self.forecast_head is None: # used to compute head dimensions - return self.backbone(x) + return self.backbone(encoder_output) else: - x = self.backbone(x) + x = self.backbone(encoder_output) forecast = self.forecast_head(x) backcast = self.backcast_head(x) return backcast, forecast diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index 7c8d57073..06aed6d50 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -15,9 +15,11 @@ from autoPyTorch.pipeline.components.setup.network_backbone.\ forecasting_backbone.forecasting_decoder.base_forecasting_decoder import ( BaseForecastingDecoder, - DecoderNetwork, DecoderProperties ) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( + DecoderNetwork +) from autoPyTorch.utils.common import FitRequirement diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index 2f2f93487..0e39af488 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -19,10 +19,11 @@ from autoPyTorch.pipeline.components.setup.network_backbone.\ forecasting_backbone.forecasting_decoder.base_forecasting_decoder import ( BaseForecastingDecoder, - DecoderNetwork, DecoderProperties ) - +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( + DecoderNetwork +) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ PositionalEncoding, build_transformer_layers @@ -63,7 +64,11 @@ def __init__(self, def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor): output = self.input_layer(x_future) - output = self.transformer_decoder_layers(output, encoder_output, tgt_mask=self.tgt_mask.to(self.device)) + if self.training: + output = self.transformer_decoder_layers(output, encoder_output, + tgt_mask=self.tgt_mask.to(self.device)) + else: + output = self.transformer_decoder_layers(output, encoder_output) return output diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index 74671807f..f83298cec 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -351,7 +351,7 @@ def set_hyperparameters(self, self.choice = self.get_components()[choice](**new_params) self.decoder_choice = decoder_components[decoder_type](**decoder_params) - self.pipeline = Pipeline([('net_structure', ForecastingNetworkStructure(random_state=self.random_state)), + self.pipeline = Pipeline([('net_structure', ForecastingNetworkStructure()), ('encoder', self.choice), ('decoder', self.decoder_choice)]) return self diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index 0a51bc8dd..ce18f5823 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -139,7 +139,7 @@ def build_encoder(self, raise NotImplementedError() @staticmethod - def encoder_properties(self) -> EncoderProperties: + def encoder_properties() -> EncoderProperties: """ Encoder properties, this determines how the data flows over the forecasting networks diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index c6d813983..b0e566da8 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -29,7 +29,7 @@ AbstractForecastingEncoderChoice from 
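The TransformerDecoder change above applies the causal target mask only while training, since at inference time the future sequence is generated step by step and each call already sees only past positions. A short sketch of building and applying such a mask with torch.nn.TransformerDecoder; all sizes here are arbitrary:

import torch
import torch.nn as nn

d_model, n_prediction_steps = 32, 6
decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=4, batch_first=True)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=2)

# upper-triangular mask: position t may only attend to decoder positions <= t
tgt_mask = torch.triu(torch.full((n_prediction_steps, n_prediction_steps), float('-inf')), diagonal=1)

x_future = torch.randn(8, n_prediction_steps, d_model)   # teacher-forced decoder input
encoder_output = torch.randn(8, 20, d_model)             # memory from the encoder

training_out = decoder(x_future, encoder_output, tgt_mask=tgt_mask)   # masked, as in the training branch
inference_out = decoder(x_future[:, :1], encoder_output)              # one generated step, no mask needed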
autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. \ - base_forecasting_encoder import BaseForecastingEncoder, ForecastingNetworkStructure + base_forecasting_encoder import BaseForecastingEncoder directory = os.path.split(__file__)[0] _encoders = find_components(__package__, @@ -357,7 +357,7 @@ def get_hyperparameter_search_space( cs.add_conditions(conditions_to_add) - for encoder_name, encoder in available_encoders.item(): + for encoder_name, encoder in available_encoders.items(): encoder_is_casual = encoder.encoder_properties() if not encoder_is_casual: # we do not allow non-casual encoder to appear in the lower layer of the network. e.g, if we have an diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 2e0a1f4a3..bd2a2af60 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -10,10 +10,7 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. \ - base_forecasting_encoder import NetworkStructure, EncoderBlockInfo -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ - base_forecasting_decoder import DecoderBlockInfo +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure from autoPyTorch.pipeline.components.setup.network_head.base_network_head import NetworkHeadComponent from autoPyTorch.utils.common import FitRequirement from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import \ @@ -21,9 +18,7 @@ from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.NBEATS_head import build_NBEATS_network from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( - TemporalFusionLayer -) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import TemporalFusionLayer class ForecastingHead(NetworkHeadComponent): From 2dd0b11375f9b03174f00b4fbd43bf7756af1d07 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 6 Mar 2022 18:35:33 +0100 Subject: [PATCH 173/347] maint --- .../forecasting_training_loss/QuantileLoss.py | 22 +++-- .../setup/network/forecasting_architecture.py | 86 +++++++++++-------- .../forecasting_backbone/cells.py | 17 ++-- .../forecasting_decoder/RNNDecoder.py | 2 +- .../base_forecasting_decoder.py | 2 +- .../forecasting_encoder/components.py | 4 +- .../flat_encoder/MLPEncoder.py | 2 +- .../seq_encoder/RNNEncoder.py | 10 +-- .../seq_encoder/TCNEncoder.py | 6 +- .../seq_encoder/TransformerEncoder.py | 2 +- .../seq_encoder/__init__.py | 36 +------- .../forecasting_network_head/distribution.py | 1 - .../forecasting_head.py | 26 +++--- .../pipeline/components/training/losses.py | 27 +++++- .../pipeline/time_series_forecasting.py | 35 ++------ 15 files changed, 141 insertions(+), 137 deletions(-) diff --git 
a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py index 3fac5e68b..220aacbea 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py @@ -2,25 +2,28 @@ import numpy as np from ConfigSpace import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter +from ConfigSpace.hyperparameters import UniformFloatHyperparameter from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import \ ForecastingLossComponents -from autoPyTorch.pipeline.components.training.losses import LogProbLoss +from autoPyTorch.pipeline.components.training.losses import QuantileLoss from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter, FitRequirement -class QuantileLoss(): - loss = LogProbLoss +class NetworkQuantileLoss(ForecastingLossComponents): + loss = QuantileLoss required_net_out_put_type = 'quantile' def __init__(self, random_state: Optional[np.random.RandomState] = None, + lower_quantile: float=0.1, + upper_quantile: float=0.9, ): super(QuantileLoss, self).__init__() self.random_state = random_state + self.loss = QuantileLoss(lower=lower_quantile, upper=upper_quantile) @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None @@ -43,7 +46,16 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @staticmethod def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + lower_quantile: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='lower_quantile', + value_range=(0.0, 0.4), + default_value=0.1), + upper_quantile: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='upper_quantile', + value_range=(0.6, 1.0), + default_value=0.9) + ) -> ConfigurationSpace: cs = ConfigurationSpace() + add_hyperparameter(cs, lower_quantile, UniformFloatHyperparameter) + add_hyperparameter(cs, upper_quantile, UniformFloatHyperparameter) return cs diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 96fa9b16c..14ce8bfdd 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -249,8 +249,10 @@ def __init__(self, if self.encoder_lagged_input: self.cached_lag_mask_encoder = None + self.encoder_lagged_value = network_encoder['block_1'].encoder.lagged_value if self.decoder_lagged_input: self.cached_lag_mask_decoder = None + self.decoder_lagged_value = network_decoder[first_decoder].decoder.lagged_value @property def device(self): @@ -324,13 +326,13 @@ def repeat_intermediate_values(self, intermediate_values: List[Optional[Union[torch.Tensor, Tuple[torch.Tensor]]]], is_hidden_states: List[bool], repeats: int) -> List[Optional[Union[torch.Tensor, Tuple[torch.Tensor]]]]: - for i, (is_hx, inter_value) in enumerate(is_hidden_states, intermediate_values): + for i, 
(is_hx, inter_value) in enumerate(zip(is_hidden_states, intermediate_values)): if isinstance(inter_value, torch.Tensor): repeated_value = inter_value.repeat_interleave(repeats=repeats, dim=1 if is_hx else 0) intermediate_values[i] = repeated_value elif isinstance(inter_value, Tuple): dim = 1 if is_hx else 0 - repeated_value = (hx.repeat_interleave(repeats=repeats, dim=dim) for hx in inter_value) + repeated_value = tuple(hx.repeat_interleave(repeats=repeats, dim=dim) for hx in inter_value) intermediate_values[i] = repeated_value return intermediate_values @@ -350,7 +352,7 @@ def pre_processing(self, past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) x_past, self.cached_lag_mask_encoder = get_lagged_subsequences(past_targets, self.window_size, - self.encoder.lagged_value, + self.encoder_lagged_value, self.cached_lag_mask_encoder) else: if self.window_size < past_targets.shape[1]: @@ -516,7 +518,7 @@ def forward(self, future_targets = torch.cat([past_targets, future_targets[:, :-1, :]], dim=1) future_targets, self.cached_lag_mask_decoder = get_lagged_subsequences(future_targets, self.n_prediction_steps, - self.decoder.lagged_value, + self.decoder_lagged_value, self.cached_lag_mask_decoder) else: future_targets = torch.cat([past_targets[:, [-1], :], future_targets[:, :-1, :]], dim=1) @@ -528,7 +530,9 @@ def forward(self, dim=-1) x_future = x_future.to(self.device) - encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) + encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, + additional_input=encoder_additional) + decoder_output = self.decoder(x_future=x_future, encoder_output=encoder2decoder) if self.has_temporal_fusion: @@ -556,13 +560,15 @@ def forward(self, predicted_target = predicted_target.cpu() if self.decoder_lagged_input: x_future = torch.cat([past_targets, predicted_target], dim=1) - x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder.lagged_value) + x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder_lagged_value) else: x_future = predicted_target[:, [-1]] + x_future = x_future.to(self.device) + if self.network_structure.variable_selection: x_future = self.decoder_select_variable( - future_targets=predicted_target[:, -1:], + future_targets=predicted_target[:, -1:].to(self.device), future_features=future_features[:, [idx_pred]] if future_features is not None else None ) else: @@ -608,7 +614,7 @@ def forward(self, repeats=self.num_samples) if self.decoder_lagged_input: - max_lag_seq_length = max(self.decoder.lagged_value) + 1 + max_lag_seq_length = max(self.decoder_lagged_value) + 1 else: max_lag_seq_length = 1 + self.window_size repeated_past_target = past_targets[:, -max_lag_seq_length:].repeat_interleave(repeats=self.num_samples, @@ -627,7 +633,7 @@ def forward(self, for idx_pred in range(self.n_prediction_steps): if self.decoder_lagged_input: x_future = torch.cat([repeated_past_target, repeated_predicted_target.cpu()], dim=1) - x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder.lagged_value) + x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder_lagged_value) else: x_future = repeated_predicted_target[:, [-1]] @@ -745,7 +751,7 @@ def forward(self, seq_length = self.window_size + self.n_prediction_steps targets_all, self.cached_lag_mask_encoder = get_lagged_subsequences(targets_all, seq_length - 1, - self.encoder.lagged_value, + self.encoder_lagged_value, self.cached_lag_mask_encoder) 
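get_lagged_subsequences, used above, stacks shifted copies of the target so that every encoder time step also sees the values at the configured lag offsets. A simplified reference sketch that ignores the padding and mask caching of the real helper and assumes the history is long enough for every lag:

import torch

def lagged_subsequences(targets: torch.Tensor, subseq_length: int, lags: list) -> torch.Tensor:
    # targets: (B, T, 1); returns (B, subseq_length, len(lags)), assuming T >= subseq_length + max(lags)
    slices = []
    for lag in lags:
        end = targets.shape[1] - lag
        slices.append(targets[:, end - subseq_length: end, 0])
    return torch.stack(slices, dim=-1)

past_targets = torch.arange(24.0).reshape(2, 12, 1)
x_past = lagged_subsequences(past_targets, subseq_length=4, lags=[0, 1, 7])
# x_past[:, t] now holds the target at t, t-1 and t-7, the lag features a lag-aware encoder consumes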
else: if self.window_size < past_targets.shape[1]: @@ -764,12 +770,12 @@ def forward(self, dtype=self.dtype, device=self.device)} x_input, _, _, static_context_initial_hidden = self.variable_selector(x_past=x_past, - x_future=None, - x_static=static_features, - length_past=length_past, - length_future=0, - batch_size=batch_size, - ) + x_future=None, + x_static=static_features, + length_past=length_past, + length_future=0, + batch_size=batch_size, + ) else: x_input = targets_all if past_features is not None: @@ -783,10 +789,13 @@ def forward(self, encoder_additional = [static_context_initial_hidden] encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) - encoder2decoder, encoder_output = self.encoder(encoder_input=x_input, additional_input=encoder_additional) + encoder2decoder, encoder_output = self.encoder(encoder_input=x_input, + additional_input=encoder_additional, + output_seq=True) if self.only_generate_future_dist: - encoder2decoder = encoder2decoder[:, -self.n_prediction_steps:] + # DeepAR only receives the output of the last encoder + encoder2decoder = encoder2decoder[-1][:, -self.n_prediction_steps:] net_output = self.head(self.decoder(x_future=None, encoder_output=encoder2decoder)) # DeepAR does not allow tf layers return self.rescale_output(net_output, loc, scale, self.device) @@ -797,7 +806,7 @@ def forward(self, past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) x_past, self.cached_lag_mask_encoder_test = get_lagged_subsequences(past_targets, self.window_size, - self.encoder.lagged_value, + self.encoder_lagged_value, self.cached_lag_mask_encoder_test) else: if self.window_size < past_targets.shape[1]: @@ -816,13 +825,13 @@ def forward(self, dtype=self.dtype, device=self.device)} x_past, _, _, static_context_initial_hidden = self.variable_selector(x_past=x_past, - x_future=None, - x_static=static_features, - length_past=length_past, - length_future=0, - batch_size=batch_size, - cache_static_contex=True - ) + x_future=None, + x_static=static_features, + length_past=length_past, + length_future=0, + batch_size=batch_size, + cache_static_contex=True + ) else: if past_features is not None: # features is one step ahead of target @@ -849,9 +858,9 @@ def forward(self, encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, - additional_input=encoder_additional, - cache_intermediate_state=True, - ) + additional_input=encoder_additional, + cache_intermediate_state=True, + ) self.repeat_intermediate_values(self.encoder.cached_intermediate_state, is_hidden_states=self.encoder.encoder_has_hidden_states, @@ -862,7 +871,7 @@ def forward(self, repeats=self.num_samples) if self.encoder_lagged_input: - max_lag_seq_length = max(max(self.encoder.lagged_value), self.window_size) + max_lag_seq_length = max(max(self.encoder_lagged_value), self.window_size) else: max_lag_seq_length = self.window_size # TODO considering padding targets here instead of inside get_lagged function @@ -900,16 +909,18 @@ def forward(self, for k in range(1, self.n_prediction_steps): if self.encoder_lagged_input: - x_next = torch.cat([repeated_past_target, *all_samples], dim=1) - x_next = get_lagged_subsequences_inference(x_next, 1, self.encoder.lagged_value) + repeated_past_target = torch.cat([repeated_past_target, all_samples[-1]], dim=1) + x_next = get_lagged_subsequences_inference(repeated_past_target, 1, self.encoder_lagged_value) else: x_next = next_sample + 
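When no exogenous features exist, the variable selector above is still handed a dictionary whose past_features entry is a zero-width tensor, so the input structure stays identical in both cases. A tiny shape check of that convention; the concatenation below is only for illustration, the real selector embeds each variable separately:

import torch

batch_size, length_past, n_features = 4, 12, 0
past_targets = torch.randn(batch_size, length_past, 1)

# zero-width placeholder keeps the dict layout the same with and without features
x_past = {
    'past_targets': past_targets,
    'past_features': torch.zeros(batch_size, length_past, n_features),
}
selector_input = torch.cat([x_past['past_targets'], x_past['past_features']], dim=-1)
assert selector_input.shape == (batch_size, length_past, 1)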
x_next = x_next.to(self.device) + if self.network_structure.variable_selection: batch_size = past_targets.shape[0] if past_features is None: if past_features is None: - x_next = {'past_targets': x_next.to(device=self.device), + x_next = {'past_targets': x_next, 'past_features': torch.zeros((batch_size, 1, 0), dtype=self.dtype, device=self.device)} @@ -922,11 +933,10 @@ def forward(self, cache_static_contex=False, use_cached_static_contex=True, ) - encoder_output, _ = self.encoder(encoder_input=x_next, - additional_input=[None] * self.network_structure.num_blocks, - output_seq=False, cache_intermediate_state=True, - incremental_update=True) - + encoder2decoder, _ = self.encoder(encoder_input=x_next, + additional_input=[None] * self.network_structure.num_blocks, + output_seq=False, cache_intermediate_state=True, + incremental_update=True) net_output = self.head(self.decoder(x_future=None, encoder_output=encoder2decoder)) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index ca185d799..4ffed95ed 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -330,7 +330,7 @@ def __init__(self, len_cached_intermediate_states = self.num_blocks + 1 if self.has_temporal_fusion else self.num_blocks self.cached_intermediate_state = [torch.empty(0) for _ in range(len_cached_intermediate_states)] - self.encoder_num_hidden_states = [] + self.encoder_num_hidden_states = [0] * self.num_blocks encoder = nn.ModuleDict() for i, block_idx in enumerate(range(1, self.num_blocks + 1)): block_id = f'block_{block_idx}' @@ -348,12 +348,13 @@ def __init__(self, if block_id in decoder_info: if decoder_info[block_id].decoder_properties.recurrent: if decoder_info[block_id].decoder_properties.has_hidden_states: - # RNN + # RNN -> RNN self.encoder_output_type[i] = EncoderOutputForm.HiddenStates else: - # Transformer + # Transformer -> Transformer self.encoder_output_type[i] = EncoderOutputForm.Sequence else: + # Deep AR self.encoder_output_type[i] = EncoderOutputForm.SequenceLast if encoder_info[block_id].encoder_properties.has_hidden_states: self.encoder_has_hidden_states[i] = True @@ -374,7 +375,8 @@ def forward(self, encoder_input (torch.Tensor): encoder input additional_input (List[Optional[torch.Tensor]]) additional input to the encoder, e.g., inital hidden states output_seq (bool) if a sequence output is generated - incremental_update (bool) if an incremental update is applied, this is normally applied for auto-regressive + cache_intermediate_state (bool): if store the intermediate values + incremental_update (bool): if an incremental update is applied, this is normally applied for auto-regressive model, however, ony deepAR requires encoder to do incremental update, thus the decoder only need to receive the last output of the encoder """ @@ -407,7 +409,7 @@ def forward(self, if self.skip_connection: fx = self.encoder[f'skip_connection_{block_id}'](fx, x) - if self.encoder_output_type == EncoderOutputForm.HiddenStates: + if self.encoder_output_type[i] == EncoderOutputForm.HiddenStates: encoder2decoder.append(hx) elif self.encoder_output_type[i] == EncoderOutputForm.Sequence: encoder2decoder.append(fx) @@ -415,7 +417,8 @@ def forward(self, if output_seq or incremental_update: encoder2decoder.append(fx) else: - 
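The cache_intermediate_state and incremental_update flags used above let the encoder advance one step at a time during autoregressive decoding instead of re-encoding the whole history. A stripped-down sketch of that contract, with a single hypothetical GRU block standing in for the stacked encoder:

import torch
import torch.nn as nn

class IncrementalEncoder(nn.Module):
    # stand-in for one encoder block that supports cached, incremental updates
    def __init__(self, hidden_size: int = 16):
        super().__init__()
        self.rnn = nn.GRU(1, hidden_size, batch_first=True)
        self.cached_state = None

    def forward(self, x: torch.Tensor, incremental_update: bool = False) -> torch.Tensor:
        hx = self.cached_state if incremental_update else None
        out, self.cached_state = self.rnn(x, hx)     # keep the hidden state for the next call
        return out[:, -1:]                           # last time step only, shape (B, 1, H)

encoder = IncrementalEncoder()
warmup = encoder(torch.randn(4, 20, 1))                           # full pass over the history
step = encoder(torch.randn(4, 1, 1), incremental_update=True)     # later calls feed one new value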
encoder2decoder.append(encoder_i.get_last_seq_value(fx)) + encoder2decoder.append(fx.squeeze(1)) + if cache_intermediate_state: if self.encoder_has_hidden_states[i]: self.cached_intermediate_state[i] = hx @@ -502,7 +505,7 @@ def forward(self, if self.skip_connection: fx = self.decoder[f'skip_connection_{block_id}'](fx, x) if cache_intermediate_state: - if self.encoder_has_hidden_states[i]: + if self.decoder_has_hidden_states[i]: self.cached_intermediate_state[i] = hx else: if incremental_update: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index 06aed6d50..6d6fba994 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -38,7 +38,7 @@ def __init__(self, else: cell = nn.GRU self.lagged_value = lagged_value - in_features = in_features if self.lagged_value is None else len(self.lagged_value) * in_features + in_features = in_features self.lstm = cell(input_size=in_features, hidden_size=hidden_size, num_layers=num_layers, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 1ef4922e1..bf343a212 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -89,7 +89,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: future_in_features = X['network_encoder']['block_1'].encoder_output_shape[-1] else: if auto_regressive: - if self.decoder_properties()["lagged_input"] and hasattr(self, 'lagged_value'): + if self.decoder_properties().lagged_input and hasattr(self, 'lagged_value'): future_in_features += len(self.lagged_value) * output_shape[-1] else: future_in_features += output_shape[-1] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py index 428422dbb..9e7f67461 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py @@ -51,9 +51,9 @@ def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: get the last value of the sequential output Args: x: torch.Tensor(B, L, N): a sequential value output by the network, usually this value needs to be fed - to the decoder + to the decoder (or a 2D tensor for a flat encoder) Returns: - output: torch.Tensor(B, M): last element of the sequential value + output: torch.Tensor(B, 1, M): last element of the sequential value (or a 2D tensor for flat encoder) """ raise NotImplementedError diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py index 413c36203..85bf72698 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py @@ -56,7 +56,7 @@ def forward(self, x: torch.Tensor, output_seq: bool = False): # we need to ensure that the input size fits the network shape x = x[:, -self.window_size:] # x.shape = (B, self.window, N) x = x.flatten(-2) - return x if self.network is not None else self.network(x) + return x if self.network is None else self.network(x) def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: return x diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py index 6e3ebf48f..4779f2283 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py @@ -51,25 +51,25 @@ def forward(self, hx: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> Tuple[torch.Tensor, ...]: B, T, _ = x.shape - outputs, hidden_state = self.lstm(x, hx) + x, hidden_state = self.lstm(x, hx) if output_seq: - return outputs, hidden_state + return x, hidden_state else: return self.get_last_seq_value(x), hidden_state def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: B, T, _ = x.shape if not self.config["bidirectional"]: - return x[:, -1, :] + return x[:, -1:, ] else: x_by_direction = x.view(B, T, 2, self.config["hidden_size"]) x = torch.cat([ - x_by_direction[:, -1, 0, :], - x_by_direction[:, 0, 1, :] + x_by_direction[:, -1, [0], :], + x_by_direction[:, 0, [1], :] ], dim=-1) return x diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py index 5cb45d74c..6cc29b71c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py @@ -112,7 +112,7 @@ def forward(self, x: torch.Tensor, output_seq=False) -> torch.Tensor: return self.get_last_seq_value(x) def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: - return x[:, -1, :] + return x[:, -1:] class TCNEncoder(BaseForecastingEncoder): @@ -121,7 +121,7 @@ class TCNEncoder(BaseForecastingEncoder): Temporal Convolutional Network backbone for time series data (see https://arxiv.org/pdf/1803.01271.pdf). 
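Replacing x[:, -1, :] with x[:, -1:] keeps the time axis, so every encoder hands the decoder a (B, 1, H) tensor; a bidirectional RNN additionally concatenates the last forward step with the first backward step. A quick shape check of both cases:

import torch

B, T, H = 4, 10, 8
x = torch.randn(B, T, H)

last = x[:, -1:]                  # (B, 1, H): slicing keeps the time axis
assert last.shape == (B, 1, H)

# bidirectional output: the forward direction ends at t = T - 1, the backward direction at t = 0
x_bi = torch.randn(B, T, 2 * H).view(B, T, 2, H)
last_bi = torch.cat([x_bi[:, -1, [0], :], x_bi[:, 0, [1], :]], dim=-1)
assert last_bi.shape == (B, 1, 2 * H)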
""" - def build_encoder(self, input_shape: Tuple[int, ...]) -> Tuple[nn.Module, int]: + def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: num_channels = [self.config["num_filters_1"]] kernel_size = [self.config["kernel_size_1"]] dropout = self.config[f"dropout"] if self.config["use_dropout"] else 0.0 @@ -135,7 +135,7 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> Tuple[nn.Module, int]: dropout=dropout ) self._receptive_field = encoder.receptive_field - return encoder, in_features + return encoder def n_encoder_output_feature(self) -> int: num_blocks = self.config["num_blocks"] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index 2cabd81a5..644fbd21c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -67,7 +67,7 @@ def forward(self, return self.get_last_seq_value(x) def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: - return x[:, -1, :] + return x[:, -1:] class TransformerEncoder(BaseForecastingEncoder): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index b0e566da8..d1ec77915 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -30,6 +30,7 @@ from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. 
\ base_forecasting_encoder import BaseForecastingEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ForecastingNetworkStructure directory = os.path.split(__file__)[0] _encoders = find_components(__package__, @@ -84,7 +85,7 @@ def get_hyperparameter_search_space( decoder_auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="decoder_auto_regressive", value_range=(True, False), - default_value=False, + default_value=True, ), skip_connection: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="skip_connection", value_range=(True, False), @@ -188,13 +189,13 @@ def get_hyperparameter_search_space( cs.add_conditions(cond_skip_connections) if static_features_shape + future_feature_shapes[-1] == 0: - if False in variable_selection.choices and True in decoder_auto_regressive.choices: + if False in variable_selection.choices and False in decoder_auto_regressive.choices: if variable_selection.num_choices == 1 and decoder_auto_regressive.num_choices == 1: raise ValueError("When no future information is available, it is not possible to disable variable" "selection and enable auto-regressive decoder model") cs.add_forbidden_clause(ForbiddenAndConjunction( ForbiddenEqualsClause(variable_selection, False), - ForbiddenEqualsClause(decoder_auto_regressive, True) + ForbiddenEqualsClause(decoder_auto_regressive, False) )) if True in variable_selection.choices: cs.add_hyperparameter(share_single_variable_networks) @@ -312,22 +313,6 @@ def get_hyperparameter_search_space( config_space, # parent_hyperparameter=parent_hyperparameter ) - if not available_decoders[decoder_name].decoder_properties().recurrent: - hp_encoder_choice = cs.get_hyperparameter(block_prefix + '__choice__') - for encoder_single in encoder_with_single_decoder: - if encoder_single in hp_encoder_choice.choices: - if forbidden_decoder_ar is not None: - forbiddens_decoder_auto_regressive.append(ForbiddenAndConjunction( - forbidden_decoder_ar, - ForbiddenEqualsClause(hp_encoder_choice, encoder_single) - )) - for encode_multi in encoders_with_multi_decoder: - hp_decoder_type = cs.get_hyperparameter(f"{block_prefix}{encode_multi}:decoder_type") - if forbidden_decoder_ar is not None: - forbiddens_decoder_auto_regressive.append(ForbiddenAndConjunction( - forbidden_decoder_ar, - ForbiddenEqualsClause(hp_encoder_choice, encoder_single) - )) hps = cs.get_hyperparameters() # type: List[CSH.Hyperparameter] conditions_to_add = [] @@ -391,19 +376,6 @@ def get_hyperparameter_search_space( else: forbidden = ForbiddenAndConjunction(ForbiddenEqualsClause(num_blocks, 2), forbidden_ar) cs.add_forbidden_clause(forbidden) - if 'RNNEncoder' in available_encoders: - for i in range(min_num_blocks, max_num_blocks + 1): - rnn_bidirectional_hp = ':'.join([f'block_{min_num_blocks}', - 'RNNEncoder', - 'bidirectional']) - if rnn_bidirectional_hp in cs: - rnn_bidirectional_hp = cs.get_hyperparameter(rnn_bidirectional_hp) - if 'True' in rnn_bidirectional_hp.choices: - forbidden = ForbiddenAndConjunction( - ForbiddenEqualsClause(rnn_bidirectional_hp, True), - deep_ar_hp - ) - cs.add_forbidden_clause(forbidden) return cs def set_hyperparameters(self, diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py index cbc6281e6..f82a23431 100644 --- 
a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py @@ -69,7 +69,6 @@ def build_single_proj_layer(arg_dim): """ if decoder_has_local_layer: - return nn.Sequential(nn.Linear(num_in_features, np.prod(output_shape).item() * arg_dim), nn.Unflatten(-1, (*output_shape, arg_dim))) else: diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index bd2a2af60..1a502d411 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -11,6 +11,9 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( + DecoderBlockInfo +) from autoPyTorch.pipeline.components.setup.network_head.base_network_head import NetworkHeadComponent from autoPyTorch.utils.common import FitRequirement from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import \ @@ -136,7 +139,13 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'network_head': self.head}) else: decoder = X['network_decoder'] - decoder = build_NBEATS_network(decoder, self.output_shape) + # NBEATS is a flat encoder, it only has one decoder + first_decoder = decoder['block_1'] + nbeats_decoder = build_NBEATS_network(first_decoder.decoder, self.output_shape) + decoder['block_1'] = DecoderBlockInfo(decoder=nbeats_decoder, + decoder_properties=first_decoder.decoder_properties, + decoder_output_shape=first_decoder.decoder_output_shape, + decoder_input_shape=first_decoder.decoder_input_shape) X.update({'network_head': self.head, 'network_decoder': decoder}) return X @@ -232,18 +241,13 @@ def build_proj_layer(input_shape: Tuple[int, ...], ) return proj_layer elif net_out_put_type == 'regression': - if auto_regressive: + if decoder_has_local_layer: proj_layer = nn.Sequential(nn.Linear(input_shape, np.product(output_shape[1:]))) else: - if decoder_has_local_layer: - proj_layer = nn.Sequential(nn.Unflatten(-1, (n_prediction_heads, input_shape)), - nn.Linear(input_shape, np.product(output_shape[1:])), - ) - else: - proj_layer = nn.Sequential( - nn.Linear(input_shape, n_prediction_heads * np.product(output_shape[1:])), - nn.Unflatten(-1, (n_prediction_heads, *output_shape[1:])), - ) + proj_layer = nn.Sequential( + nn.Linear(input_shape, n_prediction_heads * np.product(output_shape[1:])), + nn.Unflatten(-1, (n_prediction_heads, *output_shape[1:])), + ) return proj_layer else: raise ValueError(f"Unsupported network type " diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index cdfe802cd..293eb125e 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -10,7 +10,7 @@ L1Loss: supports continuous output types Default: MSELoss """ -from typing import Any, Dict, Optional, Type +from typing import Any, Dict, Optional, Type, 
List import torch from torch.nn.modules.loss import ( @@ -84,6 +84,31 @@ def forward(self, return loss +class QuantileLoss(Loss): + __constants__ = ['reduction'] + + def __init__(self, reduction: str = 'mean',lower=0.1, upper=0.9) -> None: + super(QuantileLoss, self).__init__(reduction) + self.quantiles = [lower, 0.5, upper] + + def forward(self, + input: List[torch.Tensor], + target_tensor: torch.Tensor) -> torch.Tensor: + assert len(self.quantiles) == len(input) + losses_all = [] + for q, y_pred in zip(self.quantiles, input): + diff = target_tensor - y_pred + loss_q = max(torch.max(q * diff), (1-q) * diff) + losses_all.append(loss_q.unsqueeze(0)) + losses_all = torch.concat(losses_all) + if self.reduction == 'mean': + return losses_all.mean() + elif self.reduction == 'sum': + return losses_all.sum() + else: + return losses_all + + losses = dict( classification=dict( CrossEntropyLoss=dict( diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 576640c12..cb7745098 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -273,7 +273,7 @@ def _get_hyperparameter_search_space(self, # NBEATS forbidden_NBEATS = [] - encoder_non_BEATS = [choice for choice in network_encoder_hp.choices if choice != 'NBEATSEncoder'] + encoder_non_BEATS = [choice for choice in network_encoder_hp.choices if choice != 'flat_encoder'] loss_non_regression = [choice for choice in hp_loss.choices if choice != 'RegressionLoss'] data_loader_backcast = cs.get_hyperparameter('data_loader:backcast') @@ -282,37 +282,16 @@ def _get_hyperparameter_search_space(self, forbidden_backcast = ForbiddenEqualsClause(data_loader_backcast, True) forbidden_backcast_false = ForbiddenEqualsClause(data_loader_backcast, False) + + hp_flat_encoder = cs.get_hyperparameter("network_backbone:flat_encoder:__choice__") + + # Ensure that NBEATS encoder only works with NBEATS decoder - if 'NBEATSEncoder' in network_encoder_hp.choices: + if 'NBEATSEncoder' in hp_flat_encoder.choices: forbidden_NBEATS.append(ForbiddenAndConjunction( - ForbiddenEqualsClause(network_encoder_hp, 'NBEATSEncoder'), + ForbiddenEqualsClause(hp_flat_encoder, 'NBEATSEncoder'), forbidden_loss_non_regression) ) - """ - forbidden_NBEATS.append(ForbiddenAndConjunction( - ForbiddenEqualsClause(network_encoder_hp, 'NBEATSEncoder'), - forbidden_backcast_false) - ) - """ - """ - if 'NBEATSDecoder' in network_decoder_hp.choices: - forbidden_NBEATS.append(ForbiddenAndConjunction( - ForbiddenEqualsClause(network_decoder_hp, 'NBEATSDecoder'), - forbidden_encoder_NBEATS) - ) - forbidden_NBEATS.append(ForbiddenAndConjunction( - ForbiddenEqualsClause(network_decoder_hp, 'NBEATSDecoder'), - forbidden_loss_non_regression) - ) - forbidden_NBEATS.append(ForbiddenAndConjunction( - ForbiddenEqualsClause(network_decoder_hp, 'NBEATSDecoder'), - forbidden_backcast_false) - ) - forbidden_NBEATS.append(ForbiddenAndConjunction( - forbidden_backcast, - forbidden_decoder_NBEATS - )) - """ forbidden_NBEATS.append(ForbiddenAndConjunction( forbidden_backcast, forbidden_encoder_NBEATS From 0f0dbf088e1fae3ef66a0730328b387fe5d93a8e Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 7 Mar 2022 13:20:02 +0100 Subject: [PATCH 174/347] move tft layer to backbone --- .../setup/network/forecasting_architecture.py | 10 +- .../forecasting_backbone/cells.py | 14 +- .../seq_encoder/__init__.py | 27 ++++ .../other_components/TemporalFusion.py | 145 ++++++++++++++++++ .../other_components/__init__.py | 
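The QuantileLoss added above trains one output head per quantile with the pinball loss, which for quantile q and error e = y - y_hat is max(q * e, (q - 1) * e). A small element-wise reference sketch of that formula, independent of the class definition above:

import torch

def pinball_loss(y_pred: torch.Tensor, y_true: torch.Tensor, q: float) -> torch.Tensor:
    # max(q * e, (q - 1) * e) with e = y_true - y_pred, averaged over all elements
    diff = y_true - y_pred
    return torch.maximum(q * diff, (q - 1) * diff).mean()

y_true = torch.tensor([1.0, 2.0, 3.0])
y_pred = torch.tensor([1.5, 1.5, 1.5])
total = sum(pinball_loss(y_pred, y_true, q) for q in (0.1, 0.5, 0.9))
# over-prediction is cheap for q = 0.1 and expensive for q = 0.9, which pulls the
# corresponding heads towards the lower and upper quantiles respectively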
0 .../forecasting_head.py | 82 +--------- 6 files changed, 189 insertions(+), 89 deletions(-) create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py create mode 100644 autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/__init__.py diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 14ce8bfdd..ca8eacfab 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -162,6 +162,7 @@ def __init__(self, network_embedding: nn.Module, # TODO consider embedding for past, future and static features network_encoder: Dict[str, EncoderBlockInfo], network_decoder: Dict[str, DecoderBlockInfo], + temporal_fusion: Optional[TemporalFusionLayer], network_head: Optional[nn.Module], window_size: int, target_scaler: BaseTargetScaler, @@ -203,7 +204,7 @@ def __init__(self, dataset_properties=dataset_properties, network_encoder=network_encoder, auto_regressive=auto_regressive) - has_temporal_fusion = "temporal_fusion" in network_head + has_temporal_fusion = temporal_fusion is not None self.encoder = StackedEncoder(network_structure=network_structure, has_temporal_fusion=has_temporal_fusion, encoder_info=network_encoder, @@ -213,9 +214,9 @@ def __init__(self, encoder_info=network_encoder, decoder_info=network_decoder) if has_temporal_fusion: - self.temporal_fusion = network_head['temporal_fusion'] # type: TemporalFusionLayer + self.temporal_fusion = temporal_fusion # type: TemporalFusionLayer self.has_temporal_fusion = has_temporal_fusion - self.head = network_head['head'] + self.head = network_head first_decoder = 0 for i in range(1, network_structure.num_blocks + 1): @@ -419,6 +420,7 @@ def forward(self, decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output, encoder_lengths=encoder_lengths, + decoder_lenght=self.n_prediction_steps, static_embedding=x_static ) output = self.head(decoder_output) @@ -539,6 +541,7 @@ def forward(self, decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output, encoder_lengths=encoder_lengths, + self.n_prediction_steps, static_embedding=x_static ) net_output = self.head(decoder_output) @@ -587,6 +590,7 @@ def forward(self, decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output_all, encoder_lengths=encoder_lengths, + decoder_length = idx_pred + 1, static_embedding=x_static )[:, -1:] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 4ffed95ed..4e9346e48 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -32,7 +32,6 @@ class TemporalFusionLayer(nn.Module): def __init__(self, window_size: int, - n_prediction_steps: int, network_structure: NetworkStructure, network_encoder: Dict[str, EncoderBlockInfo], n_decoder_output_features: int, @@ -44,8 +43,6 @@ def __init__(self, last_block = f'block_{num_blocks}' n_encoder_output = network_encoder[last_block].encoder_output_shape[-1] self.window_size = window_size - self.n_prediction_steps = 
n_prediction_steps - self.timestep = window_size + n_prediction_steps if n_decoder_output_features != n_encoder_output: self.decoder_proj_layer = nn.Linear(n_decoder_output_features, n_encoder_output, bias=False) @@ -94,13 +91,18 @@ def __init__(self, self.residual_connection = GateAddNorm(d_model, skip_size=n_encoder_output, dropout=None, trainable_add=False) - def forward(self, encoder_output: torch.Tensor, decoder_output: torch.Tensor, encoder_lengths: torch.LongTensor, + def forward(self, + encoder_output: torch.Tensor, + decoder_output: torch.Tensor, + encoder_lengths: torch.LongTensor, + decoder_length: int, static_embedding: Optional[torch.Tensor] = None): """ Args: encoder_output: the output of the last layer of encoder network decoder_output: the output of the last layer of decoder network encoder_lengths: length of encoder network + decoder_length: length of decoder network static_embedding: output of static variable selection network (if applible) """ if self.decoder_proj_layer is not None: @@ -110,7 +112,7 @@ def forward(self, encoder_output: torch.Tensor, decoder_output: torch.Tensor, en if self.enrich_with_static: static_context_enrichment = self.static_context_enrichment(static_embedding) attn_input = self.enrichment( - network_output, static_context_enrichment[:, None].expand(-1, self.timesteps, -1) + network_output, static_context_enrichment[:, None].expand(-1, self.window_size + decoder_length, -1) ) else: attn_input = self.enrichment(network_output) @@ -121,7 +123,7 @@ def forward(self, encoder_output: torch.Tensor, decoder_output: torch.Tensor, en k=attn_input, v=attn_input, mask=self.get_attention_mask( - encoder_lengths=encoder_lengths, decoder_length=self.n_prediction_steps + encoder_lengths=encoder_lengths, decoder_length=decoder_length ), ) # skip connection over attention diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index d1ec77915..6ec999821 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -31,6 +31,7 @@ from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. 
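Passing decoder_length explicitly lets the same temporal fusion layer serve both the full teacher-forced sequence and the partially generated sequence during autoregressive prediction; the attention mask keeps each decoder position from attending to padded encoder steps or to later decoder steps. A rough sketch of such a mask, assuming right-aligned histories (an assumption of this sketch, not a copy of get_attention_mask):

import torch

def attention_mask(encoder_lengths: torch.Tensor, window_size: int, decoder_length: int) -> torch.Tensor:
    # True marks positions a decoder query may attend to
    steps = torch.arange(window_size)
    # encoder part: only the last `length` observed steps of each (right-aligned) history
    encoder_mask = steps.unsqueeze(0) >= (window_size - encoder_lengths).unsqueeze(1)   # (B, window)
    encoder_mask = encoder_mask.unsqueeze(1).expand(-1, decoder_length, -1)             # (B, dec, window)
    # decoder part: causal, step t may attend to decoder steps <= t
    decoder_mask = torch.tril(torch.ones(decoder_length, decoder_length, dtype=torch.bool))
    decoder_mask = decoder_mask.unsqueeze(0).expand(encoder_lengths.shape[0], -1, -1)
    return torch.cat([encoder_mask, decoder_mask], dim=-1)    # (B, dec, window + dec)

mask = attention_mask(torch.tensor([5, 8]), window_size=8, decoder_length=3)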
\ base_forecasting_encoder import BaseForecastingEncoder from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ForecastingNetworkStructure +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.other_components.TemporalFusion import TemporalFusion directory = os.path.split(__file__)[0] _encoders = find_components(__package__, @@ -46,6 +47,7 @@ def add_encoder(encoder: BaseForecastingEncoder) -> None: class SeqForecastingEncoderChoice(AbstractForecastingEncoderChoice): deepAR_decoder_name = 'MLPDecoder' deepAR_decoder_prefix = 'block_1' + tf_prefix = "temporal_fusion" def get_components(self) -> Dict[str, autoPyTorchComponent]: """Returns the available backbone components @@ -342,6 +344,20 @@ def get_hyperparameter_search_space( cs.add_conditions(conditions_to_add) + + use_temporal_fusion = get_hyperparameter(use_temporal_fusion, CategoricalHyperparameter) + cs.add_hyperparameter(use_temporal_fusion) + if True in use_temporal_fusion.choices: + update = self._get_search_space_updates(prefix=self.tf_prefix) + cs_tf = TemporalFusion.get_hyperparameter_search_space(dataset_properties, + **update) + parent_hyperparameter = {'parent': use_temporal_fusion, 'value': True} + cs.add_configuration_space( + self.tf_prefix, + cs_tf, + parent_hyperparameter=parent_hyperparameter + ) + for encoder_name, encoder in available_encoders.items(): encoder_is_casual = encoder.encoder_properties() if not encoder_is_casual: @@ -481,6 +497,17 @@ def set_hyperparameters(self, self.encoder_choice.append(encoder) self.decoder_choice.append(decoder) + use_temporal_fusion = params["use_temporal_fusion"] + new_params = [] + if use_temporal_fusion: + for param, value in params.items(): + if param.startswith(self.tf_prefix): + param = param.replace(self.tf_prefix + ':', '') + new_params[param] = value + temporal_fusion = TemporalFusion(self.random_state, + **new_params) + pipeline_steps.extend([(f'temporal_fusion', temporal_fusion)]) + self.pipeline = Pipeline(pipeline_steps) self.choice = self.encoder_choice[0] return self diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py new file mode 100644 index 000000000..5a2d183b9 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py @@ -0,0 +1,145 @@ +import numpy as np + +from ConfigSpace import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformFloatHyperparameter, \ + UniformIntegerHyperparameter +from ConfigSpace.conditions import EqualsCondition +from autoPyTorch.utils.common import FitRequirement +from typing import Any, Dict, Iterable, Optional, Tuple, List, Union, NamedTuple + +from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( + EncoderProperties, EncoderBlockInfo, EncoderNetwork +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import 
TemporalFusionLayer +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + + +class TemporalFusion(autoPyTorchComponent): + """ + Base class for network backbones. Holds the backbone module and the config which was used to create it. + """ + _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"] + + def __init__(self, + random_state: Optional[np.random.RandomState] = None, + attention_n_head_log: int = 2, + attention_d_model_log: int = 4, + use_dropout: bool = False, + dropout_rate: Optional[float] = None,): + autoPyTorchComponent.__init__(self) + self.add_fit_requirements( + self._required_fit_arguments + ) + self.attention_n_head_log = attention_n_head_log + self.attention_d_model_log = attention_d_model_log + self.use_dropout = use_dropout + self.dropout_rate = dropout_rate + + self.temporal_fusion = None + self.n_decoder_output_features = 0 + + @property + def _required_fit_requirements(self) -> List[FitRequirement]: + return [ + FitRequirement('window_size', (Iterable,), user_defined=True, dataset_property=True), + FitRequirement('n_decoder_output_features', (int,), user_defined=False, dataset_property=False), + FitRequirement('network_encoder', (Dict,), user_defined=False, dataset_property=False), + FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False), + ] + + def fit(self, X: Dict[str, Any], y: Any = None) -> "autoPyTorchComponent": + network_structure = X['network_structure'] # type: NetworkStructure + + self.temporal_fusion = TemporalFusionLayer(window_size=X['window_size'], + network_structure=network_structure, + network_encoder=X['network_encoder'], + n_decoder_output_features=X['n_decoder_output_features'], + d_model=2 ** self.attention_d_model_log, + n_head=2 ** self.attention_n_head_log, + dropout=self.dropout_rate + ) + self.n_decoder_output_features = 2 ** self.attention_d_model_log + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({"n_decoder_output_features": self.n_decoder_output_features, + "temporal_fusion": self.temporal_fusion}) + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return { + 'shortname': 'TemporalFusion', + 'name': 'TemporalFusion', + 'handles_tabular': False, + 'handles_image': False, + 'handles_time_series': True, + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + use_temporal_fusion: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='use_temporal_fusion', + value_range=(True, False), + default_value=False), + attention_n_head_log: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='attention_n_head_log', + value_range=(1, 3), + default_value=2, + ), + attention_d_model_log: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='attention_d_model_log', + value_range=(4, 8), + default_value=4, + ), + use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='use_dropout', + value_range=(True, False), + default_value=True, + ), + dropout_rate: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='dropout_rate', + value_range=(0.0, 0.8), + default_value=0.1, + ) + ) -> ConfigurationSpace: + """Return the configuration space of this classification algorithm. 
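The conditional pattern defined just below (dropout_rate is only active when use_dropout is True, while the whole temporal fusion group hangs off the use_temporal_fusion choice registered by the encoder) can be exercised on its own with a few lines of ConfigSpace. A minimal sketch using the same hyperparameter names and ranges; the standalone space and the seed are assumptions for illustration only, not part of the pipeline:

from ConfigSpace import ConfigurationSpace
from ConfigSpace.conditions import EqualsCondition
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    UniformFloatHyperparameter,
    UniformIntegerHyperparameter,
)

cs = ConfigurationSpace(seed=0)

# parent switch, attention size on a log2 scale, and the child dropout rate
use_dropout = CategoricalHyperparameter('use_dropout', choices=[True, False], default_value=True)
attention_d_model_log = UniformIntegerHyperparameter('attention_d_model_log', lower=4, upper=8, default_value=4)
dropout_rate = UniformFloatHyperparameter('dropout_rate', lower=0.0, upper=0.8, default_value=0.1)

cs.add_hyperparameters([use_dropout, attention_d_model_log, dropout_rate])
# dropout_rate is only sampled when use_dropout == True
cs.add_condition(EqualsCondition(dropout_rate, use_dropout, True))

for config in cs.sample_configuration(5):
    params = config.get_dictionary()
    d_model = 2 ** params['attention_d_model_log']  # the layer itself is built with d_model = 2 ** attention_d_model_log
    print(d_model, params.get('dropout_rate'))

Configurations sampled from such a space simply omit dropout_rate whenever use_dropout is False, which is why the consuming code only copies the keys that are actually present.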
+ + Args: + dataset_properties (Optional[Dict[str, Union[str, int]]): + Describes the dataset to work on + use_temporal_fusion (HyperparameterSearchSpace): + if attention fusion layer is applied (Lim et al. + Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting, + https://arxiv.org/abs/1912.09363) + attention_n_head_log (HyperparameterSearchSpace): + log value of number of heads for interpretable + attention_d_model_log (HyperparameterSearchSpace): + log value of input of attention model + use_dropout (HyperparameterSearchSpace): + if dropout is applied to temporal fusion layer + dropout_rate (HyperparameterSearchSpace): + dropout rate of the temporal fusion layer + Returns: + ConfigurationSpace: + The configuration space of this algorithm. + """ + cs = ConfigurationSpace() + + use_temporal_fusion = get_hyperparameter(use_temporal_fusion, CategoricalHyperparameter) + attention_n_head_log = get_hyperparameter(attention_n_head_log, UniformIntegerHyperparameter) + attention_d_model_log = get_hyperparameter(attention_d_model_log, UniformIntegerHyperparameter) + use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) + dropout_rate = get_hyperparameter(dropout_rate, UniformFloatHyperparameter) + + cs.add_hyperparameters([use_temporal_fusion, attention_n_head_log, attention_d_model_log, use_dropout, + dropout_rate]) + cond_attention_n_head_log = EqualsCondition(attention_n_head_log, use_temporal_fusion, True) + cond_attention_d_model_log = EqualsCondition(attention_d_model_log, use_temporal_fusion, True) + cond_use_dropout = EqualsCondition(use_dropout, use_temporal_fusion, True) + cond_dropout_rate = EqualsCondition(dropout_rate, use_dropout, True) + cs.add_conditions([cond_attention_n_head_log, cond_attention_d_model_log, cond_use_dropout, cond_dropout_rate]) + return cs \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 1a502d411..da3fee598 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -33,11 +33,6 @@ class ForecastingHead(NetworkHeadComponent): def __init__(self, random_state: Optional[np.random.RandomState] = None, - use_temporal_fusion: bool = False, - attention_n_head_log: int = 2, - attention_d_model_log: int = 4, - use_dropout: bool = False, - dropout_rate: Optional[float] = None, ): super(NetworkHeadComponent, self).__init__(random_state=random_state) @@ -45,11 +40,6 @@ def __init__(self, self.head: Optional[nn.Module] = None self.required_net_out_put_type: Optional[str] = None self.output_shape = None - self.use_temporal_fusion = use_temporal_fusion - self.attention_n_head_log = attention_n_head_log - self.attention_d_model_log = attention_d_model_log - self.use_dropout = use_dropout - self.dropout_rate = dropout_rate @property def _required_fit_requirements(self) -> List[FitRequirement]: @@ -62,7 +52,6 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement('network_decoder', (Dict,), 
user_defined=False, dataset_property=False), FitRequirement('n_prediction_heads', (int,), user_defined=False, dataset_property=False), FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), - FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False), ] def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: @@ -99,23 +88,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: head_input_shape = X["n_decoder_output_features"] n_prediction_heads = X["n_prediction_heads"] - network_structure = X['network_structure'] # type: NetworkStructure - head_components = {} - - if self.use_temporal_fusion: - temporal_fusion = TemporalFusionLayer(window_size=X['window_size'], - n_prediction_steps=X['dataset_properties']['n_prediction_steps'], - network_structure=network_structure, - network_encoder=X['network_encoder'], - n_decoder_output_features=X['n_decoder_output_features'], - d_model=2 ** self.attention_d_model_log, - n_head=2 ** self.attention_n_head_log - ) - head_components['temporal_fusion'] = temporal_fusion - head_input_shape = 2 ** self.attention_d_model_log - decoder_has_local_layer = X.get('mlp_has_local_layer', True) - head_components['head'] = self.build_head( + head_components = self.build_head( input_shape=head_input_shape, output_shape=output_shape, auto_regressive=auto_regressive, @@ -256,65 +230,13 @@ def build_proj_layer(input_shape: Tuple[int, ...], @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - use_temporal_fusion: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter='use_temporal_fusion', - value_range=(True, False), - default_value=False), - attention_n_head_log: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter='attention_n_head_log', - value_range=(1, 3), - default_value=2, - ), - attention_d_model_log: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter='attention_d_model_log', - value_range=(4, 8), - default_value=4, - ), - use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter='use_dropout', - value_range=(True, False), - default_value=True, - ), - dropout_rate: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter='dropout_rate', - value_range=(0.0, 0.8), - default_value=0.1, - ) ) -> ConfigurationSpace: - """Return the configuration space of this classification algorithm. + """Return the configuration space of network head. - Args: - dataset_properties (Optional[Dict[str, Union[str, int]]): - Describes the dataset to work on - use_temporal_fusion (HyperparameterSearchSpace): - if attention fusion layer is applied (Lim et al. - Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting, - https://arxiv.org/abs/1912.09363) - attention_n_head_log (HyperparameterSearchSpace): - log value of number of heads for interpretable - attention_d_model_log (HyperparameterSearchSpace): - log value of input of attention model - use_dropout (HyperparameterSearchSpace): - if dropout is applied to temporal fusion layer - dropout_rate (HyperparameterSearchSpace): - dropout rate of the temporal fusion layer Returns: ConfigurationSpace: The configuration space of this algorithm. 
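With the temporal fusion hyperparameters gone from the head, the only remaining coupling between the two components is a feature width: the fusion layer reports n_decoder_output_features = 2 ** attention_d_model_log through its transform, and the head projects from exactly that width. A rough sketch of that hand-off; the batch size, number of prediction heads and output shape are assumed values for illustration, not taken from the pipeline:

import torch
from torch import nn

attention_d_model_log = 4                                  # sampled hyperparameter of the temporal fusion component
n_decoder_output_features = 2 ** attention_d_model_log     # width written to X['n_decoder_output_features']

n_prediction_heads = 3                                     # assumed
output_shape = (6, 1)                                      # (n_prediction_steps, num_targets), assumed

# similar in spirit to the head projection without a local decoder layer:
# flatten all prediction heads into one linear map, then unflatten the result
head = nn.Sequential(
    nn.Linear(n_decoder_output_features, n_prediction_heads * output_shape[1]),
    nn.Unflatten(-1, (n_prediction_heads, output_shape[1])),
)

decoder_output = torch.randn(8, n_decoder_output_features)  # [batch, fused features]
print(head(decoder_output).shape)                           # torch.Size([8, 3, 1])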
""" cs = ConfigurationSpace() - use_temporal_fusion = get_hyperparameter(use_temporal_fusion, CategoricalHyperparameter) - attention_n_head_log = get_hyperparameter(attention_n_head_log, UniformIntegerHyperparameter) - attention_d_model_log = get_hyperparameter(attention_d_model_log, UniformIntegerHyperparameter) - use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) - dropout_rate = get_hyperparameter(dropout_rate, UniformFloatHyperparameter) - - cs.add_hyperparameters([use_temporal_fusion, attention_n_head_log, attention_d_model_log, use_dropout, - dropout_rate]) - cond_attention_n_head_log = EqualsCondition(attention_n_head_log, use_temporal_fusion, True) - cond_attention_d_model_log = EqualsCondition(attention_d_model_log, use_temporal_fusion, True) - cond_use_dropout = EqualsCondition(use_dropout, use_temporal_fusion, True) - cond_dropout_rate = EqualsCondition(dropout_rate, use_dropout, True) - cs.add_conditions([cond_attention_n_head_log, cond_attention_d_model_log, cond_use_dropout, cond_dropout_rate]) return cs From 9e686291089436eedf3aa2224ef01ffb85a9b795 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 7 Mar 2022 13:51:21 +0100 Subject: [PATCH 175/347] maint --- .../setup/network/forecasting_architecture.py | 4 ++-- .../setup/network/forecasting_network.py | 1 + .../forecasting_encoder/seq_encoder/__init__.py | 17 +++++++++-------- .../forecasting_base_trainer.py | 6 +++--- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index ca8eacfab..3c3b6eb77 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -204,7 +204,7 @@ def __init__(self, dataset_properties=dataset_properties, network_encoder=network_encoder, auto_regressive=auto_regressive) - has_temporal_fusion = temporal_fusion is not None + has_temporal_fusion = network_structure.use_temporal_fusion self.encoder = StackedEncoder(network_structure=network_structure, has_temporal_fusion=has_temporal_fusion, encoder_info=network_encoder, @@ -541,7 +541,7 @@ def forward(self, decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output, encoder_lengths=encoder_lengths, - self.n_prediction_steps, + decoder_length=self.n_prediction_steps, static_embedding=x_static ) net_output = self.head(decoder_output) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 5fe6b6288..c134ffdaa 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -85,6 +85,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: network_embedding=X['network_embedding'], network_encoder=network_encoder, network_decoder=network_decoder, + temporal_fusion=X.get("temporal_fusion", None), network_head=X['network_head'], auto_regressive=X['auto_regressive'], window_size=X['window_size'], diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index 6ec999821..e1a2a8fe3 100644 --- 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -30,8 +30,10 @@ from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. \ base_forecasting_encoder import BaseForecastingEncoder -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ForecastingNetworkStructure -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.other_components.TemporalFusion import TemporalFusion +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ + ForecastingNetworkStructure +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.other_components.TemporalFusion import \ + TemporalFusion directory = os.path.split(__file__)[0] _encoders = find_components(__package__, @@ -83,7 +85,8 @@ def get_hyperparameter_search_space( use_temporal_fusion: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter='use_temporal_fusion', value_range=(True, False), - default_value=False), + default_value=False, + ), decoder_auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="decoder_auto_regressive", value_range=(True, False), @@ -156,14 +159,13 @@ def get_hyperparameter_search_space( variable_selection = get_hyperparameter(variable_selection, CategoricalHyperparameter) share_single_variable_networks = get_hyperparameter(share_single_variable_networks, CategoricalHyperparameter) - use_temporal_fusion = get_hyperparameter(use_temporal_fusion, CategoricalHyperparameter) decoder_auto_regressive = get_hyperparameter(decoder_auto_regressive, CategoricalHyperparameter) num_blocks = get_hyperparameter(num_blocks, UniformIntegerHyperparameter) skip_connection = get_hyperparameter(skip_connection, CategoricalHyperparameter) hp_network_structures = [num_blocks, decoder_auto_regressive, variable_selection, - skip_connection, use_temporal_fusion] + skip_connection] cond_skip_connections = [] if True in skip_connection.choices: skip_connection_type = get_hyperparameter(skip_connection_type, CategoricalHyperparameter) @@ -344,7 +346,6 @@ def get_hyperparameter_search_space( cs.add_conditions(conditions_to_add) - use_temporal_fusion = get_hyperparameter(use_temporal_fusion, CategoricalHyperparameter) cs.add_hyperparameter(use_temporal_fusion) if True in use_temporal_fusion.choices: @@ -417,8 +418,9 @@ def set_hyperparameters(self, params = configuration.get_dictionary() num_blocks = params['num_blocks'] decoder_auto_regressive = params['decoder_auto_regressive'] + use_temporal_fusion = params['use_temporal_fusion'] forecasting_structure_kwargs = dict(num_blocks=num_blocks, - use_temporal_fusion=params['use_temporal_fusion'], + use_temporal_fusion=use_temporal_fusion, variable_selection=params['variable_selection'], skip_connection=params['skip_connection']) if 'share_single_variable_networks' in params: @@ -497,7 +499,6 @@ def set_hyperparameters(self, self.encoder_choice.append(encoder) self.decoder_choice.append(decoder) - use_temporal_fusion = params["use_temporal_fusion"] new_params = [] if use_temporal_fusion: for param, value in params.items(): diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py 
b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 18bd5b691..f59762003 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -101,7 +101,7 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, outputs_data.append(outputs.detach().cpu()) targets_data.append(targets.detach().cpu()) - batch_size = data["past_target"].size(0) + batch_size = data["past_targets"].size(0) loss_sum += loss * batch_size N += batch_size @@ -142,7 +142,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor torch.Tensor: The predictions of the network float: the loss incurred in the prediction """ - past_target = data['past_target'].float() + past_target = data['past_targets'].float() future_targets = self.cast_targets(future_targets) @@ -224,7 +224,7 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, with torch.no_grad(): for step, (data, future_targets) in enumerate(test_loader): - past_target = data['past_target'].float() + past_target = data['past_targets'].float() mase_coefficients.append(data['mase_coefficient']) if isinstance(self.criterion, MASELoss): From ed99ba1c65cd2cc6a1a7e8eadd736b7522d5378e Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 7 Mar 2022 19:11:15 +0100 Subject: [PATCH 176/347] quantile loss --- .../DistributionLoss.py | 44 ++++++++--- .../forecasting_training_loss/QuantileLoss.py | 22 ++---- .../RegressionLoss.py | 2 +- .../base_forecasting_loss.py | 4 +- .../setup/network/forecasting_network.py | 57 +++------------ .../forecasting_network_head/distribution.py | 10 ++- .../forecasting_head.py | 73 ++++++++++++++----- .../pipeline/components/training/losses.py | 4 +- .../pipeline/time_series_forecasting.py | 24 +----- 9 files changed, 118 insertions(+), 122 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py index 067882e75..7c3feda3d 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py @@ -1,10 +1,13 @@ -from typing import Optional, Dict, Union, Any +from typing import Optional, Dict, Union, Any, NamedTuple import numpy as np from ConfigSpace import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter +from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( + ALL_DISTRIBUTIONS, + DisForecastingStrategy +) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import \ ForecastingLossComponents @@ -14,15 +17,22 @@ class DistributionLoss(ForecastingLossComponents): loss = LogProbLoss - required_net_out_put_type = 'distribution' + net_output_type = 'distribution' def __init__(self, dist_cls: str, random_state: Optional[np.random.RandomState] = None, + forecast_strategy: str = "sample", + num_samples: int = 100, + aggregation: str 
= "mean", ): super(DistributionLoss, self).__init__() self.dist_cls = dist_cls self.random_state = random_state + self.forecasting_strategy = DisForecastingStrategy(dist_cls=dist_cls, + forecast_strategy=forecast_strategy, + num_samples=num_samples, + aggregation=aggregation) @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None @@ -39,18 +49,30 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: required_padding_value = ALL_DISTRIBUTIONS[self.dist_cls].value_in_support - X.update({"dist_cls": self.dist_cls, - "required_padding_value": required_padding_value}) + X.update({"required_padding_value": required_padding_value, + "dist_forecasting_strategy": self.forecasting_strategy}) return super().transform(X) @staticmethod def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - dist_cls: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dist_cls", - value_range=tuple(ALL_DISTRIBUTIONS.keys()), - default_value= - list(ALL_DISTRIBUTIONS.keys())[0]) + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + dist_cls: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dist_cls", + value_range=tuple(ALL_DISTRIBUTIONS.keys()), + default_value= + list(ALL_DISTRIBUTIONS.keys())[0]), + forecast_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='forecast_strategy', + value_range=('sample', 'mean'), + default_value='sample'), + num_samples: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='num_samples', + value_range=(50, 200), + default_value=100), + aggregation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='aggregation', + value_range=('mean', 'median'), + default_value='mean') ) -> ConfigurationSpace: cs = ConfigurationSpace() add_hyperparameter(cs, dist_cls, CategoricalHyperparameter) + add_hyperparameter(cs, forecast_strategy, CategoricalHyperparameter) + add_hyperparameter(cs, num_samples, UniformIntegerHyperparameter) + add_hyperparameter(cs, aggregation, CategoricalHyperparameter) return cs diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py index 220aacbea..69a260c57 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py @@ -14,16 +14,16 @@ class NetworkQuantileLoss(ForecastingLossComponents): loss = QuantileLoss - required_net_out_put_type = 'quantile' + net_output_type = 'quantile' def __init__(self, random_state: Optional[np.random.RandomState] = None, - lower_quantile: float=0.1, - upper_quantile: float=0.9, + lower_quantile: float = 0.1, + upper_quantile: float = 0.9, ): - super(QuantileLoss, self).__init__() + super().__init__() self.random_state = random_state - self.loss = QuantileLoss(lower=lower_quantile, upper=upper_quantile) + self.loss = QuantileLoss(quantiles=[lower_quantile, 0.5, upper_quantile]) @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None @@ -34,22 +34,16 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT "handles_tabular": False, "handles_image": False, "handles_time_series": True, - 'handles_regression': False, 
+ 'handles_regression': True, 'handles_classification': False } - def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: - required_padding_value = ALL_DISTRIBUTIONS[self.dist_cls].value_in_support - X.update({"dist_cls": self.dist_cls, - "required_padding_value": required_padding_value}) - return super().transform(X) - @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, lower_quantile: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='lower_quantile', - value_range=(0.0, 0.4), - default_value=0.1), + value_range=(0.0, 0.4), + default_value=0.1), upper_quantile: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='upper_quantile', value_range=(0.6, 1.0), default_value=0.9) diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py index a21a10a04..7aa32f285 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py @@ -15,7 +15,7 @@ class RegressionLoss(ForecastingLossComponents): - required_net_out_put_type = 'regression' + net_output_type = 'regression' def __init__(self, loss_name: str, diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py index 1f09dd4e1..c49e56263 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py @@ -9,7 +9,7 @@ class ForecastingLossComponents(autoPyTorchComponent): _required_properties = ["name", "handles_tabular", "handles_image", "handles_time_series", 'handles_regression', 'handles_classification'] loss = None - required_net_out_put_type = None + net_output_type = None def __init__(self, **kwargs: Any): @@ -24,5 +24,5 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "autoPyTorchComponent": def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({"loss": self.loss, - 'required_net_out_put_type': self.required_net_out_put_type}) + 'net_output_type': self.net_output_type}) return X diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index c134ffdaa..082211e0f 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -38,17 +38,8 @@ def __init__( self, network: Optional[torch.nn.Module] = None, random_state: Optional[np.random.RandomState] = None, - net_out_type: str = 'regression', - forecast_strategy: Optional[str] = 'mean', - num_samples: Optional[int] = None, - aggregation: Optional[str] = None, - ) -> None: super(ForecastingNetworkComponent, self).__init__(network=network, random_state=random_state) - self.net_out_type = net_out_type - self.forecast_strategy = forecast_strategy - self.num_samples = num_samples - self.aggregation = aggregation @property def _required_fit_requirements(self): @@ -64,7 +55,7 @@ def _required_fit_requirements(self): FitRequirement("network_head", (Optional[torch.nn.Module],), user_defined=False, dataset_property=False), FitRequirement("auto_regressive", (bool,), user_defined=False, 
dataset_property=False), FitRequirement("target_scaler", (BaseTargetScaler,), user_defined=False, dataset_property=False), - FitRequirement("required_net_out_put_type", (str,), user_defined=False, dataset_property=False), + FitRequirement("net_output_type", (str,), user_defined=False, dataset_property=False), ] def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: @@ -72,15 +63,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: # information to fit this stage self.check_requirements(X, y) - if self.net_out_type != X['required_net_out_put_type']: - raise ValueError(f"network output type must be the same as required_net_out_put_type defiend by " - f"loss function. However, net_out_type is {self.net_out_type} and " - f"required_net_out_put_type is {X['required_net_out_put_type']}") - network_structure = X['network_structure'] network_encoder = X['network_encoder'] network_decoder = X['network_decoder'] + net_output_type = X['net_output_type'] network_init_kwargs = dict(network_structure=network_structure, network_embedding=X['network_embedding'], network_encoder=network_encoder, @@ -91,10 +78,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: window_size=X['window_size'], dataset_properties=X['dataset_properties'], target_scaler=X['target_scaler'], - output_type=self.net_out_type, - forecast_strategy=self.forecast_strategy, - num_samples=self.num_samples, - aggregation=self.aggregation, ) + output_type=net_output_type,) + if net_output_type == 'distribution': + dist_forecasting_strategy = X['dist_forecasting_strategy'] # type: DisForecastingStrategy + + network_init_kwargs.update(dict(forecast_strategy=dist_forecasting_strategy.forecast_strategy, + num_samples=dist_forecasting_strategy.num_samples, + aggregation=dist_forecasting_strategy.aggregation, )) if X['auto_regressive']: first_decoder = next(iter(network_decoder.items()))[1] @@ -162,38 +152,9 @@ def predict(self, loader: torch.utils.data.DataLoader, @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - net_out_type: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='net_out_type', - value_range=('regression', - 'distribution'), - default_value='distribution' - ), - forecast_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='forecast_strategy', - value_range=('sample', 'mean'), - default_value='sample'), - num_samples: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='num_samples', - value_range=(50, 200), - default_value=100), - aggregation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='aggregation', - value_range=('mean', 'median'), - default_value='mean') ) -> ConfigurationSpace: """ - prediction steagy """ cs = ConfigurationSpace() - net_out_type = get_hyperparameter(net_out_type, CategoricalHyperparameter) - - forecast_strategy = get_hyperparameter(forecast_strategy, CategoricalHyperparameter) - num_samples = get_hyperparameter(num_samples, UniformIntegerHyperparameter) - aggregation = get_hyperparameter(aggregation, CategoricalHyperparameter) - - cond_net_out_type = EqualsCondition(forecast_strategy, net_out_type, 'distribution') - - cond_num_sample = EqualsCondition(num_samples, forecast_strategy, 'sample') - cond_aggregation = EqualsCondition(aggregation, forecast_strategy, 'sample') - - cs.add_hyperparameters([net_out_type, forecast_strategy, num_samples, 
aggregation]) - cs.add_conditions([cond_net_out_type, cond_aggregation, cond_num_sample]) - return cs diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py index f82a23431..7c94e521e 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py @@ -17,7 +17,7 @@ # Additionally, scale information is not presented here to avoid -from typing import Dict, Tuple +from typing import Dict, Tuple, NamedTuple from abc import abstractmethod @@ -201,6 +201,14 @@ def dist_cls(self) -> type(Distribution): # 'poisson': PoissonOutput } # type: Dict[str, ProjectionLayer] + +class DisForecastingStrategy(NamedTuple): + dist_cls: str + forecast_strategy: str = "sample" + num_samples: int = 100 + aggregation: str = "mean" + + # TODO find components that are compatible with beta, gamma and poisson distrubtion! # TODO consider how to implement NegativeBinomialOutput without scale information diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index da3fee598..6e7386aed 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -17,7 +17,7 @@ from autoPyTorch.pipeline.components.setup.network_head.base_network_head import NetworkHeadComponent from autoPyTorch.utils.common import FitRequirement from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import \ - ALL_DISTRIBUTIONS + ALL_DISTRIBUTIONS, DisForecastingStrategy from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.NBEATS_head import build_NBEATS_network from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter @@ -38,7 +38,6 @@ def __init__(self, self.add_fit_requirements(self._required_fit_requirements) self.head: Optional[nn.Module] = None - self.required_net_out_put_type: Optional[str] = None self.output_shape = None @property @@ -68,20 +67,26 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: output_shape = X['dataset_properties']['output_shape'] - self.required_net_out_put_type = X['required_net_out_put_type'] + net_output_type = X['net_output_type'] if 'block_1' in X['network_decoder'] and X['network_decoder']['block_1'].decoder_properties.multi_blocks: # if the decoder is a stacked block, we directly build head inside the decoder - if self.required_net_out_put_type != 'regression': + if net_output_type != 'regression': raise ValueError("decoder with multi block structure only allow regression loss!") self.output_shape = output_shape return self - if self.required_net_out_put_type == 'distribution': - if 'dist_cls' not in X: - raise ValueError('Distribution output type must contain dist_cls!') - - dist_cls = X.get('dist_cls', None) + num_quantile = 0 + dist_cls = None + if net_output_type == 'distribution': + if 'dist_forecasting_strategy' not in X: + raise ValueError('Distribution output type must contain dis_forecasting_strategy!') + dist_forecasting_strategy = X['dist_forecasting_strategy'] # type: DisForecastingStrategy + dist_cls = 
dist_forecasting_strategy.dist_cls + elif net_output_type == 'quantile': + if not hasattr(X['loss'], 'quantiles'): + raise ValueError("For Quantile losses, the attribute quantile must be given!") + num_quantile = len(X['loss'].quantile) auto_regressive = X.get('auto_regressive', False) @@ -89,6 +94,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: n_prediction_heads = X["n_prediction_heads"] decoder_has_local_layer = X.get('mlp_has_local_layer', True) + head_components = self.build_head( input_shape=head_input_shape, output_shape=output_shape, @@ -96,6 +102,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: dist_cls=dist_cls, decoder_has_local_layer=decoder_has_local_layer, n_prediction_heads=n_prediction_heads, + num_quantile=num_quantile, ) self.head = head_components return self @@ -150,8 +157,11 @@ def build_head(self, output_shape: Tuple[int, ...], auto_regressive: bool = False, decoder_has_local_layer: bool = True, + net_out_put_type: str="distribution", dist_cls: Optional[str] = None, - n_prediction_heads: int = 1) -> nn.Module: + n_prediction_heads: int = 1, + num_quantile:int = 3, + ) -> nn.Module: """ Builds the head module and returns it @@ -160,22 +170,45 @@ def build_head(self, output_shape (Tuple[int, ...]): shape of the output of the head auto_regressive (bool): if the network is auto-regressive decoder_has_local_layer (bool): if the decoder has local layer + net_out_put_type (str): network output type dist_cls (Optional[str]): output distribution, only works if required_net_out_put_type is 'distribution' n_prediction_heads (Dict): additional paramter for initializing architectures. How many heads to predict + num_quantile (int): number of quantile losses Returns: nn.Module: head module """ - head_layer = self.build_proj_layer( - input_shape=input_shape, - output_shape=output_shape, - auto_regressive=auto_regressive, - decoder_has_local_layer=decoder_has_local_layer, - net_out_put_type=self.required_net_out_put_type, - dist_cls=dist_cls, - n_prediction_heads=n_prediction_heads - ) - return head_layer + if net_out_put_type == 'distribution': + proj_layer = ALL_DISTRIBUTIONS[dist_cls](num_in_features=input_shape, + output_shape=output_shape[1:], + n_prediction_heads=n_prediction_heads, + auto_regressive=auto_regressive, + decoder_has_local_layer=decoder_has_local_layer + ) + return proj_layer + elif net_out_put_type == 'regression': + if decoder_has_local_layer: + proj_layer = nn.Sequential(nn.Linear(input_shape, np.product(output_shape[1:]))) + else: + proj_layer = nn.Sequential( + nn.Linear(input_shape, n_prediction_heads * np.product(output_shape[1:])), + nn.Unflatten(-1, (n_prediction_heads, *output_shape[1:])), + ) + return proj_layer + elif net_out_put_type == "quantile": + if decoder_has_local_layer: + proj_layer = [nn.Sequential(nn.Linear(input_shape, np.product(output_shape[1:]))) + for _ in range(num_quantile)] + else: + proj_layer = [nn.Sequential( + nn.Linear(input_shape, n_prediction_heads * np.product(output_shape[1:])), + nn.Unflatten(-1, (n_prediction_heads, *output_shape[1:])), + ) for _ in range(num_quantile)] + proj_layer = nn.ModuleList[proj_layer] + return proj_layer + else: + raise NotImplementedError(f"Unsupported network type " + f"{net_out_put_type} (should be regression or distribution)") @staticmethod def build_proj_layer(input_shape: Tuple[int, ...], diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index 293eb125e..d1e85b86f 100644 --- 
a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -87,9 +87,9 @@ def forward(self, class QuantileLoss(Loss): __constants__ = ['reduction'] - def __init__(self, reduction: str = 'mean',lower=0.1, upper=0.9) -> None: + def __init__(self, reduction: str = 'mean', quantiles: List[float] = [0.5]) -> None: super(QuantileLoss, self).__init__(reduction) - self.quantiles = [lower, 0.5, upper] + self.quantiles = quantiles def forward(self, input: List[torch.Tensor], diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index cb7745098..d2aa5aa95 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -218,27 +218,6 @@ def _get_hyperparameter_search_space(self, forbidden_hp_dist = ForbiddenAndConjunction(forbidden_hp_dist, forbidden_hp_regression_loss) forbidden_losses_all.append(forbidden_hp_dist) - hp_net_output_type = [] - if 'network' in self.named_steps.keys(): - hp_net_output_type.append(cs.get_hyperparameter('network:net_out_type')) - - if 'RegressionLoss' in hp_loss.choices: - # TODO Quantile loses need to be added here - forbidden_hp_loss = ForbiddenInClause(hp_loss, ['RegressionLoss']) - # RegressionLos only allow regression hp_net_out - for hp_net_out in hp_net_output_type: - forbidden_hp_dist = ForbiddenInClause(hp_net_out, ['distribution']) - forbidden_hp_dist = ForbiddenAndConjunction(forbidden_hp_dist, forbidden_hp_loss) - forbidden_losses_all.append(forbidden_hp_dist) - - if 'DistributionLoss' in hp_loss.choices: - # TODO Quantile loses need to be added here - forbidden_hp_loss = ForbiddenInClause(hp_loss, ['DistributionLoss']) - # DistributionLoss only allow distribution hp_net_out - for hp_net_out in hp_net_output_type: - forbidden_hp_dist = ForbiddenInClause(hp_net_out, ['regression']) - forbidden_hp_dist = ForbiddenAndConjunction(forbidden_hp_dist, forbidden_hp_loss) - forbidden_losses_all.append(forbidden_hp_dist) network_encoder_hp = cs.get_hyperparameter('network_backbone:__choice__') @@ -260,7 +239,7 @@ def _get_hyperparameter_search_space(self, forbidden_hp_ar_mlp = ForbiddenAndConjunction(forbidden_hp_ar, forbidden_hp_mlpencoder) forbidden_losses_all.append(forbidden_hp_ar_mlp) - forecast_strategy = cs.get_hyperparameter('network:forecast_strategy') + forecast_strategy = cs.get_hyperparameter('loss:DistributionLoss:forecast_strategy') if 'mean' in forecast_strategy.choices: for hp_ar in hp_auto_regressive: forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) @@ -282,7 +261,6 @@ def _get_hyperparameter_search_space(self, forbidden_backcast = ForbiddenEqualsClause(data_loader_backcast, True) forbidden_backcast_false = ForbiddenEqualsClause(data_loader_backcast, False) - hp_flat_encoder = cs.get_hyperparameter("network_backbone:flat_encoder:__choice__") From 45535ba5ed5ce57204bce9231cd1edb8da94d9b2 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 8 Mar 2022 11:14:46 +0100 Subject: [PATCH 177/347] maint --- autoPyTorch/datasets/time_series_dataset.py | 4 +- .../forecasting_training_loss/QuantileLoss.py | 9 +- .../setup/network/forecasting_architecture.py | 50 ++++++---- .../setup/network/forecasting_network.py | 3 + .../forecasting_backbone/cells.py | 60 ++++++++---- .../base_forecasting_encoder.py | 4 +- .../seq_encoder/__init__.py | 9 +- .../other_components/TemporalFusion.py | 23 ++--- .../forecasting_head.py | 93 ++++++------------- .../pipeline/components/training/losses.py | 6 +- 
.../trainer/forecasting_trainer/__init__.py | 3 +- .../forecasting_base_trainer.py | 12 ++- 12 files changed, 141 insertions(+), 135 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index cfe98b692..1ea7eed7e 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -148,8 +148,8 @@ def __getitem__(self, index: int, train: bool = True) \ "future_features": future_features, "static_features": self.static_features, "mase_coefficient": self.mase_coefficient, - 'encoder_length': past_target.shape[0], - 'decoder_length': None if targets_future is None else targets_future.shape[0] }, targets_future + 'encoder_lengths': past_target.shape[0], + 'decoder_lengths': None if targets_future is None else targets_future.shape[0] }, targets_future def __len__(self) -> int: return self.Y.shape[0] if self.only_has_past_targets else self.Y.shape[0] - self.n_prediction_steps diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py index 69a260c57..72889e8bd 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py @@ -1,10 +1,10 @@ from typing import Optional, Dict, Union, Any +from functools import partial import numpy as np from ConfigSpace import ConfigurationSpace from ConfigSpace.hyperparameters import UniformFloatHyperparameter -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ALL_DISTRIBUTIONS from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import \ ForecastingLossComponents @@ -23,7 +23,12 @@ def __init__(self, ): super().__init__() self.random_state = random_state - self.loss = QuantileLoss(quantiles=[lower_quantile, 0.5, upper_quantile]) + self.quantiles = [0.5, lower_quantile, upper_quantile] + self.loss = partial(QuantileLoss, quantiles=self.quantiles) + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({"quantile_values": self.quantiles}) + return super().transform(X) @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 3c3b6eb77..e609ace29 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -156,6 +156,7 @@ def get_lagged_subsequences_inference( class AbstractForecastingNet(nn.Module): future_target_required = False + dtype = torch.float def __init__(self, network_structure: NetworkStructure, @@ -199,11 +200,14 @@ def __init__(self, super().__init__() self.network_structure = network_structure self.embedding = network_embedding + # modules that generate tensors while doing forward pass + self.lazy_modules = [] if network_structure.variable_selection: self.variable_selector = VariableSelector(network_structure=network_structure, dataset_properties=dataset_properties, network_encoder=network_encoder, auto_regressive=auto_regressive) + self.lazy_modules.append(self.variable_selector) has_temporal_fusion = 
network_structure.use_temporal_fusion self.encoder = StackedEncoder(network_structure=network_structure, has_temporal_fusion=has_temporal_fusion, @@ -215,6 +219,7 @@ def __init__(self, decoder_info=network_decoder) if has_temporal_fusion: self.temporal_fusion = temporal_fusion # type: TemporalFusionLayer + self.lazy_modules.append(self.temporal_fusion) self.has_temporal_fusion = has_temporal_fusion self.head = network_head @@ -238,7 +243,6 @@ def __init__(self, self.num_samples = num_samples self.aggregation = aggregation - # self.mask_futur_features = decoder_properties['mask_future_features'] self._device = torch.device('cpu') if not network_structure.variable_selection: @@ -263,12 +267,16 @@ def device(self): def device(self, device: torch.device): self.to(device) self._device = device + for model in self.lazy_modules: + model.device = device def rescale_output(self, - outputs: Union[torch.distributions.Distribution, torch.Tensor], + outputs: Union[torch.distributions.Distribution, torch.Tensor, List[torch.Tensor]], loc: Optional[torch.Tensor], scale: Optional[torch.Tensor], device: torch.device = torch.device('cpu')): + if isinstance(outputs, List): + return [self.rescale_output(output, loc, scale, device) for output in outputs] if loc is not None or scale is not None: if isinstance(outputs, torch.distributions.Distribution): transform = AffineTransform(loc=0.0 if loc is None else loc.to(device), @@ -366,14 +374,14 @@ def pre_processing(self, if length_past > 0: if past_features is None: x_past = {'past_targets': x_past.to(device=self.device), - 'past_features': torch.zeros((batch_size, length_past, 0), - dtype=self.dtype, device=self.device)} + 'features': torch.zeros((batch_size, length_past, 1), + dtype=past_targets.dtype, device=self.device)} else: x_past = None if length_future > 0: if future_features is None: - x_future = {'future_features': torch.zeros((batch_size, length_future, 0), - dtype=self.dtype, device=self.device)} + x_future = {'features': torch.zeros((batch_size, length_future, 1), + dtype=past_targets.dtype, device=self.device)} else: x_future = None x_past, x_future, x_static, static_context_initial_hidden = self.variable_selector( @@ -411,6 +419,7 @@ def forward(self, length_past=self.window_size, length_future=self.n_prediction_steps ) + encoder_additional = [static_context_initial_hidden] encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) @@ -420,7 +429,7 @@ def forward(self, decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output, encoder_lengths=encoder_lengths, - decoder_lenght=self.n_prediction_steps, + decoder_length=self.n_prediction_steps, static_embedding=x_static ) output = self.head(decoder_output) @@ -429,6 +438,8 @@ def forward(self, def pred_from_net_output(self, net_output): if self.output_type == 'regression': return net_output + elif self.output_type == 'quantile': + return net_output[0] elif self.output_type == 'distribution': if self.forecast_strategy == 'mean': if isinstance(net_output, list): @@ -455,9 +466,10 @@ def predict(self, past_targets: torch.Tensor, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None + static_features: Optional[torch.Tensor] = None, + encoder_lengths: Optional[torch.LongTensor] = None, ): - net_output = self(past_targets, past_features) + net_output = 
self(past_targets, past_features, encoder_lengths=encoder_lengths) return self.pred_from_net_output(net_output) @@ -483,8 +495,8 @@ def decoder_select_variable(self, future_targets: torch.tensor, future_features: if future_features is None: x_future = { 'future_prediction': future_targets.to(self.device), - 'future_features': torch.zeros((batch_size, length_future, 0), - dtype=self.dtype, device=self.device)} + 'features': torch.zeros((batch_size, length_future, 0), + dtype=future_targets.dtype, device=self.device)} _, x_future, _, _ = self.variable_selector(x_past=None, x_future=x_future, x_static=None, @@ -724,8 +736,8 @@ def decoder_select_variable(self, future_targets: torch.tensor, future_features: if future_features is None: x_future = { 'future_prediction': future_targets.to(self.device), - 'future_features': torch.zeros((batch_size, length_future, 0), - dtype=self.dtype, device=self.device)} + 'features': torch.zeros((batch_size, length_future, 0), + dtype=future_targets.dtype, device=self.device)} _, x_future, _, _ = self.variable_selector(x_past=None, x_future=x_future, x_static=None, @@ -770,8 +782,8 @@ def forward(self, if past_features is None: if past_features is None: x_past = {'past_targets': targets_all.to(device=self.device), - 'past_features': torch.zeros((batch_size, length_past, 0), - dtype=self.dtype, device=self.device)} + 'features': torch.zeros((batch_size, length_past, 0), + dtype=targets_all.dtype, device=self.device)} x_input, _, _, static_context_initial_hidden = self.variable_selector(x_past=x_past, x_future=None, @@ -825,8 +837,8 @@ def forward(self, if past_features is None: if past_features is None: x_past = {'past_targets': past_targets.to(device=self.device), - 'past_features': torch.zeros((batch_size, length_past, 0), - dtype=self.dtype, device=self.device)} + 'features': torch.zeros((batch_size, length_past, 0), + dtype=past_targets.dtype, device=self.device)} x_past, _, _, static_context_initial_hidden = self.variable_selector(x_past=x_past, x_future=None, @@ -925,8 +937,8 @@ def forward(self, if past_features is None: if past_features is None: x_next = {'past_targets': x_next, - 'past_features': torch.zeros((batch_size, 1, 0), - dtype=self.dtype, device=self.device)} + 'features': torch.zeros((batch_size, 1, 0), + dtype=x_next.dtype, device=self.device)} x_next, _, _, _ = self.variable_selector(x_past=x_next, x_future=None, diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 082211e0f..79d5c8f6c 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -129,6 +129,7 @@ def predict(self, loader: torch.utils.data.DataLoader, past_features = X_batch['past_features'] future_features = X_batch["future_features"] static_features = X_batch["static_features"] + encoder_lengths = X_batch['encoder_lengths'] if past_targets.ndim == 2: past_targets = past_targets.unsqueeze(-1) @@ -142,6 +143,8 @@ def predict(self, loader: torch.utils.data.DataLoader, if pred_kwargs[key] is not None: pred_kwargs[key] = pred_kwargs[key].float() + pred_kwargs.update({'encoder_lengths': encoder_lengths}) + with torch.no_grad(): Y_batch_pred = self.network.predict(**pred_kwargs) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 
4e9346e48..6e0d6cb7a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -81,7 +81,7 @@ def __init__(self, ) self.post_attn_gate_norm = GateAddNorm(d_model, dropout=dropout, trainable_add=False) self.pos_wise_ff = GatedResidualNetwork(input_size=d_model, hidden_size=d_model, - output_size=d_model, dropout=self.hparams.dropout) + output_size=d_model, dropout=dropout) self.network_structure = network_structure if network_structure.skip_connection: @@ -118,6 +118,7 @@ def forward(self, attn_input = self.enrichment(network_output) # Attention + encoder_lengths = torch.where(encoder_lengths < self.window_size, encoder_lengths, self.window_size) attn_output, attn_output_weights = self.attention_fusion( q=attn_input[:, self.window_size:], # query only for predictions k=attn_input, @@ -135,6 +136,15 @@ def forward(self, else: return output + @property + def device(self): + return self._device + + @device.setter + def device(self, device: torch.device): + self.to(device) + self._device = device + def get_attention_mask(self, encoder_lengths: torch.LongTensor, decoder_length: int): """ https://github.com/jdb78/pytorch-forecasting/blob/master/pytorch_forecasting/models/ @@ -177,21 +187,26 @@ def __init__(self, static_input_sizes = dataset_properties['static_features_shape'] self.hidden_size = first_encoder_output_shape variable_selector = nn.ModuleDict() - self.static_variable_selection = VariableSelectionNetwork( - input_sizes=static_input_sizes, - hidden_size=self.hidden_size, - input_embedding_flags={}, - dropout=network_structure.grn_dropout_rate, - ) + + self._device = torch.device('cpu') + + if not dataset_properties['uni_variant']: + # TODO + self.static_variable_selection = VariableSelectionNetwork( + input_sizes=static_input_sizes, + hidden_size=self.hidden_size, + input_embedding_flags={}, + dropout=network_structure.grn_dropout_rate, + ) self.static_input_sizes = static_input_sizes if dataset_properties['uni_variant']: # variable selection for encoder and decoder encoder_input_sizes = { 'past_targets': dataset_properties['input_shape'][-1], - 'past_features': 0 + 'features': 1 } decoder_input_sizes = { - 'future_features': 0 + 'features': 1 } if auto_regressive: decoder_input_sizes.update({'future_prediction': dataset_properties['output_shape'][-1]}) @@ -263,6 +278,15 @@ def __init__(self, self.cached_static_contex = None self.cached_static_embedding = None + @property + def device(self): + return self._device + + @device.setter + def device(self, device: torch.device): + self.to(device) + self._device = device + def forward(self, x_past: Optional[Dict[str,torch.Tensor]], x_future: Optional[Dict[str, torch.Tensor]], @@ -282,13 +306,14 @@ def forward(self, if self.static_input_sizes > 0: static_embedding, _ = self.static_variable_selection(x_static) else: + model_dtype = next(iter(x_past.values())).dtype if length_past > 0 else next(iter(x_future.values())).dtype static_embedding = torch.zeros( - (batch_size, self.hidden_size), dtype=self.dtype, device=self.device + (batch_size, self.hidden_size), dtype=model_dtype, device=self.device ) - static_variable_selection = torch.zeros((batch_size, 0), dtype=self.dtype, device=self.device) + static_variable_selection = torch.zeros((batch_size, 0), dtype=model_dtype, device=self.device) static_context_variable_selection = self.static_context_variable_selection(static_embedding)[:, None] - 
static_context_initial_hidden = (init_hidden(static_embedding) for init_hidden in + static_context_initial_hidden = tuple(init_hidden(static_embedding) for init_hidden in self.static_context_initial_hidden) if cache_static_contex: self.cached_static_contex = static_context_variable_selection @@ -297,7 +322,7 @@ def forward(self, static_embedding = self.cached_static_embedding static_context_initial_hidden = None static_context_variable_selection = self.cached_static_contex - static_context_variable_selection = static_context_variable_selection[:, None].expand(-1, timesteps, -1) + static_context_variable_selection = static_context_variable_selection.expand(-1, timesteps, -1) if x_past is not None: embeddings_varying_encoder, _ = self.encoder_variable_selection( x_past, @@ -394,13 +419,13 @@ def forward(self, else: rnn_num_layers = encoder_i.config['num_layers'] hx = additional_input[i] - if rnn_num_layers == 1 or hx is None: + if hx is None: fx, hx = encoder_i(x, output_seq=output_seq_i, hx=hx) else: if self.encoder_num_hidden_states[i] == 1: fx, hx = encoder_i(x, output_seq=output_seq_i, hx=hx.expand((rnn_num_layers, -1, -1))) else: - hx = (hx_i.expand(rnn_num_layers, -1, -1) for hx_i in hx) + hx = tuple(hx_i.expand(rnn_num_layers, -1, -1) for hx_i in hx) fx, hx = encoder_i(x, output_seq=output_seq_i, hx=hx) else: if incremental_update: @@ -431,14 +456,15 @@ def forward(self, self.cached_intermediate_state[i] = x # otherwise the decoder does not exist for this layer x = fx + if self.has_temporal_fusion: if incremental_update: self.cached_intermediate_state[i + 1] = torch.cat([self.cached_intermediate_state[i+1], x], dim=1) else: self.cached_intermediate_state[i + 1] = x - return encoder2decoder, None - else: return encoder2decoder, x + else: + return encoder2decoder, None class StackedDecoder(nn.Module): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index ce18f5823..394a88f94 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -19,6 +19,7 @@ from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( EncoderProperties, EncoderBlockInfo, EncoderNetwork ) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure class BaseForecastingEncoder(autoPyTorchComponent): @@ -52,6 +53,7 @@ def _required_fit_arguments(self) -> List[FitRequirement]: FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('static_features_shape', (int,), user_defined=True, dataset_property=True), + FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False) ] def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: @@ -77,7 +79,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: in_features = input_shape[-1] - variable_selection = X.get("variable_selection", False) + variable_selection = X['network_structure'].variable_selection if variable_selection: in_features = 
self.n_encoder_output_feature() elif self.encoder_properties().lagged_input and hasattr(self, 'lagged_value'): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index e1a2a8fe3..b7acffff0 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -413,7 +413,6 @@ def set_hyperparameters(self, Returns: self: returns an instance of self """ - new_params = {} params = configuration.get_dictionary() num_blocks = params['num_blocks'] @@ -424,8 +423,8 @@ def set_hyperparameters(self, variable_selection=params['variable_selection'], skip_connection=params['skip_connection']) if 'share_single_variable_networks' in params: - forecasting_structure_kwargs['forecasting_structure_kwargs'] = params['forecasting_structure_kwargs'] - del params['forecasting_structure_kwargs'] + forecasting_structure_kwargs['share_single_variable_networks'] = params['share_single_variable_networks'] + del params['share_single_variable_networks'] del params['num_blocks'] del params['use_temporal_fusion'] @@ -451,6 +450,8 @@ def set_hyperparameters(self, decoder_components = self.get_decoder_components() for i in range(1, num_blocks + 1): + new_params = {} + block_prefix = f'block_{i}:' choice = params[block_prefix + '__choice__'] del params[block_prefix + '__choice__'] @@ -499,7 +500,7 @@ def set_hyperparameters(self, self.encoder_choice.append(encoder) self.decoder_choice.append(decoder) - new_params = [] + new_params = {} if use_temporal_fusion: for param, value in params.items(): if param.startswith(self.tf_prefix): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py index 5a2d183b9..9244f9b01 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py @@ -32,7 +32,7 @@ def __init__(self, dropout_rate: Optional[float] = None,): autoPyTorchComponent.__init__(self) self.add_fit_requirements( - self._required_fit_arguments + self._required_fit_requirements ) self.attention_n_head_log = attention_n_head_log self.attention_d_model_log = attention_d_model_log @@ -67,6 +67,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "autoPyTorchComponent": def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({"n_decoder_output_features": self.n_decoder_output_features, "temporal_fusion": self.temporal_fusion}) + return X @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: @@ -81,10 +82,6 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - use_temporal_fusion: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter='use_temporal_fusion', - value_range=(True, False), - default_value=False), attention_n_head_log: 
HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter='attention_n_head_log', value_range=(1, 3), @@ -128,18 +125,12 @@ def get_hyperparameter_search_space( The configuration space of this algorithm. """ cs = ConfigurationSpace() - - use_temporal_fusion = get_hyperparameter(use_temporal_fusion, CategoricalHyperparameter) - attention_n_head_log = get_hyperparameter(attention_n_head_log, UniformIntegerHyperparameter) - attention_d_model_log = get_hyperparameter(attention_d_model_log, UniformIntegerHyperparameter) + add_hyperparameter(cs, attention_n_head_log, UniformIntegerHyperparameter) + add_hyperparameter(cs, attention_d_model_log, UniformIntegerHyperparameter) use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) dropout_rate = get_hyperparameter(dropout_rate, UniformFloatHyperparameter) - cs.add_hyperparameters([use_temporal_fusion, attention_n_head_log, attention_d_model_log, use_dropout, - dropout_rate]) - cond_attention_n_head_log = EqualsCondition(attention_n_head_log, use_temporal_fusion, True) - cond_attention_d_model_log = EqualsCondition(attention_d_model_log, use_temporal_fusion, True) - cond_use_dropout = EqualsCondition(use_dropout, use_temporal_fusion, True) - cond_dropout_rate = EqualsCondition(dropout_rate, use_dropout, True) - cs.add_conditions([cond_attention_n_head_log, cond_attention_d_model_log, cond_use_dropout, cond_dropout_rate]) + cs.add_hyperparameters([use_dropout, dropout_rate]) + cond_dropout = EqualsCondition(dropout_rate, use_dropout, True) + cs.add_condition(cond_dropout) return cs \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 6e7386aed..81d3985a1 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -24,6 +24,15 @@ from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import TemporalFusionLayer +class QuantileHead(nn.Module): + def __init__(self, head_components: List[nn.Module]): + super().__init__() + self.net = nn.ModuleList(head_components) + + def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + return [net(x) for net in self.net] + + class ForecastingHead(NetworkHeadComponent): """ Base class for network heads used for forecasting. 
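[Editor's note on the QuantileHead introduced in the hunk above: it keeps one projection layer per quantile and returns a list of per-quantile forecasts, which the QuantileLoss corrected later in this series consumes via max(q * diff, (q - 1) * diff). Below is a minimal, self-contained sketch of that pairing. The names SketchQuantileHead and pinball_loss, and the simplified (batch, horizon) shapes, are illustrative assumptions for this note only, not the autoPyTorch implementation.]

# Sketch only: illustrates the multi-quantile head / pinball loss pairing used in this
# patch series. Names and shapes are assumptions made for this example.
import torch
from torch import nn


class SketchQuantileHead(nn.Module):
    """One linear projection per quantile; forward returns a list of tensors."""

    def __init__(self, in_features: int, horizon: int, quantiles: list):
        super().__init__()
        self.quantiles = quantiles
        # nn.ModuleList registers every per-quantile projection as a sub-module.
        self.nets = nn.ModuleList(
            [nn.Linear(in_features, horizon) for _ in quantiles]
        )

    def forward(self, x: torch.Tensor) -> list:
        # One forecast of shape (batch, horizon) per quantile.
        return [net(x) for net in self.nets]


def pinball_loss(predictions: list, target: torch.Tensor, quantiles: list) -> torch.Tensor:
    # For each quantile q: elementwise max(q * diff, (q - 1) * diff), averaged overall,
    # matching the corrected QuantileLoss formula later in this series.
    losses = []
    for q, y_pred in zip(quantiles, predictions):
        diff = target - y_pred
        losses.append(torch.max(q * diff, (q - 1) * diff).unsqueeze(0))
    return torch.cat(losses).mean()


if __name__ == "__main__":
    quantiles = [0.1, 0.5, 0.9]
    head = SketchQuantileHead(in_features=16, horizon=5, quantiles=quantiles)
    features = torch.randn(8, 16)   # decoder output for a batch of 8 series
    target = torch.randn(8, 5)      # future targets over the forecast horizon
    loss = pinball_loss(head(features), target, quantiles)
    loss.backward()
    print(loss.item())

[Returning a plain Python list from forward, as QuantileHead does, keeps each quantile's projection independent, while wrapping them in nn.ModuleList ensures the sub-layers are registered and trained; the earlier nn.ModuleList[proj_layer] indexing bug this patch fixes would otherwise have skipped that registration.]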
@@ -76,7 +85,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.output_shape = output_shape return self - num_quantile = 0 + num_quantiles = 0 dist_cls = None if net_output_type == 'distribution': if 'dist_forecasting_strategy' not in X: @@ -84,9 +93,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: dist_forecasting_strategy = X['dist_forecasting_strategy'] # type: DisForecastingStrategy dist_cls = dist_forecasting_strategy.dist_cls elif net_output_type == 'quantile': - if not hasattr(X['loss'], 'quantiles'): - raise ValueError("For Quantile losses, the attribute quantile must be given!") - num_quantile = len(X['loss'].quantile) + if 'quantile_values' not in X: + raise ValueError("For Quantile losses, quantiles must be given in X!") + num_quantiles = len(X['quantile_values']) auto_regressive = X.get('auto_regressive', False) @@ -99,10 +108,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: input_shape=head_input_shape, output_shape=output_shape, auto_regressive=auto_regressive, - dist_cls=dist_cls, decoder_has_local_layer=decoder_has_local_layer, + net_output_type=net_output_type, + dist_cls=dist_cls, n_prediction_heads=n_prediction_heads, - num_quantile=num_quantile, + num_quantiles=num_quantiles, ) self.head = head_components return self @@ -157,10 +167,10 @@ def build_head(self, output_shape: Tuple[int, ...], auto_regressive: bool = False, decoder_has_local_layer: bool = True, - net_out_put_type: str="distribution", + net_output_type: str="distribution", dist_cls: Optional[str] = None, n_prediction_heads: int = 1, - num_quantile:int = 3, + num_quantiles:int = 3, ) -> nn.Module: """ Builds the head module and returns it @@ -170,7 +180,7 @@ def build_head(self, output_shape (Tuple[int, ...]): shape of the output of the head auto_regressive (bool): if the network is auto-regressive decoder_has_local_layer (bool): if the decoder has local layer - net_out_put_type (str): network output type + net_output_type (str): network output type dist_cls (Optional[str]): output distribution, only works if required_net_out_put_type is 'distribution' n_prediction_heads (Dict): additional paramter for initializing architectures. 
How many heads to predict num_quantile (int): number of quantile losses @@ -178,7 +188,7 @@ def build_head(self, Returns: nn.Module: head module """ - if net_out_put_type == 'distribution': + if net_output_type == 'distribution': proj_layer = ALL_DISTRIBUTIONS[dist_cls](num_in_features=input_shape, output_shape=output_shape[1:], n_prediction_heads=n_prediction_heads, @@ -186,7 +196,7 @@ def build_head(self, decoder_has_local_layer=decoder_has_local_layer ) return proj_layer - elif net_out_put_type == 'regression': + elif net_output_type == 'regression': if decoder_has_local_layer: proj_layer = nn.Sequential(nn.Linear(input_shape, np.product(output_shape[1:]))) else: @@ -195,70 +205,21 @@ def build_head(self, nn.Unflatten(-1, (n_prediction_heads, *output_shape[1:])), ) return proj_layer - elif net_out_put_type == "quantile": + elif net_output_type == "quantile": if decoder_has_local_layer: proj_layer = [nn.Sequential(nn.Linear(input_shape, np.product(output_shape[1:]))) - for _ in range(num_quantile)] + for _ in range(num_quantiles)] else: proj_layer = [nn.Sequential( nn.Linear(input_shape, n_prediction_heads * np.product(output_shape[1:])), nn.Unflatten(-1, (n_prediction_heads, *output_shape[1:])), - ) for _ in range(num_quantile)] - proj_layer = nn.ModuleList[proj_layer] + ) for _ in range(num_quantiles)] + proj_layer = QuantileHead(proj_layer) return proj_layer else: raise NotImplementedError(f"Unsupported network type " - f"{net_out_put_type} (should be regression or distribution)") - - @staticmethod - def build_proj_layer(input_shape: Tuple[int, ...], - output_shape: Tuple[int, ...], - n_prediction_heads: int, - auto_regressive: bool, - decoder_has_local_layer: bool, - net_out_put_type: str, - dist_cls: Optional[str] = None) -> torch.nn.Module: - """ - a final layer that project the head output to the final distribution - Args: - input_shape (int): input shape to build the header, - is used to initialize size of the linear layer - output_shape (Tuple[int, ..]): deserved output shape - n_prediction_heads: int, how many steps the head want to predict - auto_regressive (bool): if the network is auto-regressive - decoder_has_local_layer (bool): if the decoder has local layer - net_out_put_type (str), type of the loss, it determines the output of the network - dist_cls (str), distribution class, only activate if output is a distribution - - Returns: - proj_layer: nn.Module - projection layer that maps the features to the final output - required_padding_value: float, - which values need to be padded when loadding the data - - """ - if net_out_put_type == 'distribution': - if dist_cls not in ALL_DISTRIBUTIONS.keys(): - raise NotImplementedError(f'Unsupported distribution class type: {dist_cls}') - proj_layer = ALL_DISTRIBUTIONS[dist_cls](num_in_features=input_shape, - output_shape=output_shape[1:], - n_prediction_heads=n_prediction_heads, - auto_regressive=auto_regressive, - decoder_has_local_layer=decoder_has_local_layer - ) - return proj_layer - elif net_out_put_type == 'regression': - if decoder_has_local_layer: - proj_layer = nn.Sequential(nn.Linear(input_shape, np.product(output_shape[1:]))) - else: - proj_layer = nn.Sequential( - nn.Linear(input_shape, n_prediction_heads * np.product(output_shape[1:])), - nn.Unflatten(-1, (n_prediction_heads, *output_shape[1:])), - ) - return proj_layer - else: - raise ValueError(f"Unsupported network type " - f"{net_out_put_type} (should be regression or distribution)") + f"{net_output_type} (should be one of the following: " + f"regression, 
distribution or quantiles)") @staticmethod def get_hyperparameter_search_space( diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index d1e85b86f..b555e6f2a 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -91,6 +91,9 @@ def __init__(self, reduction: str = 'mean', quantiles: List[float] = [0.5]) -> N super(QuantileLoss, self).__init__(reduction) self.quantiles = quantiles + def set_quantiles(self, quantiles = List[float]): + self.quantiles = quantiles + def forward(self, input: List[torch.Tensor], target_tensor: torch.Tensor) -> torch.Tensor: @@ -98,9 +101,10 @@ def forward(self, losses_all = [] for q, y_pred in zip(self.quantiles, input): diff = target_tensor - y_pred - loss_q = max(torch.max(q * diff), (1-q) * diff) + loss_q = torch.max(q * diff, (1-q) * diff) losses_all.append(loss_q.unsqueeze(0)) losses_all = torch.concat(losses_all) + if self.reduction == 'mean': return losses_all.mean() elif self.reduction == 'sum': diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index c40e3e2f8..42f7b5095 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -70,8 +70,6 @@ def prepare_trainer(self, X): if 'optimize_metric' in X and X['optimize_metric'] not in [m.name for m in metrics]: metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=[X['optimize_metric']])) - additional_losses = X['additional_losses'] if 'additional_losses' in X else None - self.choice.prepare( model=X['network'], metrics=metrics, @@ -90,6 +88,7 @@ def prepare_trainer(self, X): backcast_loss_ratio=X.get('backcast_loss_ratio', 0.0) ) + def get_components(self) -> Dict[str, autoPyTorchComponent]: """Returns the available trainer components diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index f59762003..53902cb46 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -143,6 +143,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor float: the loss incurred in the prediction """ past_target = data['past_targets'].float() + encoder_lengths = data['encoder_lengths'] future_targets = self.cast_targets(future_targets) @@ -157,7 +158,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor past_target, criterion_kwargs_past = self.data_preparation(past_target, past_target.to(self.device)) past_target, criterion_kwargs_future = self.data_preparation(past_target, future_targets.to(self.device)) - backcast, forecast = self.model(past_target) + backcast, forecast = self.model(past_targets=past_target, encoder_lengths=encoder_lengths) loss_func_backcast = self.criterion_preparation(**criterion_kwargs_past) loss_func_forecast = self.criterion_preparation(**criterion_kwargs_future) @@ -181,7 +182,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor else: past_target, criterion_kwargs = 
self.data_preparation(past_target, future_targets.to(self.device)) - outputs = self.model(past_target, future_targets) + outputs = self.model(past_targets=past_target, future_targets=future_targets, encoder_lengths=encoder_lengths) loss_func = self.criterion_preparation(**criterion_kwargs) @@ -225,6 +226,7 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, with torch.no_grad(): for step, (data, future_targets) in enumerate(test_loader): past_target = data['past_targets'].float() + encoder_lengths = data['encoder_lengths'] mase_coefficients.append(data['mase_coefficient']) if isinstance(self.criterion, MASELoss): @@ -237,14 +239,14 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) if isinstance(self.model, (ForecastingDeepARNet, ForecastingSeq2SeqNet)): - outputs = self.model(past_target, future_targets) + outputs = self.model(past_targets=past_target, future_targets=future_targets, encoder_lengths=encoder_lengths) else: - outputs = self.model(past_target) + outputs = self.model(past_targets=past_target, encoder_lengths=encoder_lengths) # prepare future_targets = future_targets.to(self.device) - if isinstance(outputs, list): + if isinstance(outputs, list) and self.model.output_type != 'quantile': loss = [self.criterion(output, future_targets) for output in outputs] loss = torch.mean(torch.Tensor(loss)) else: From 9fac9fe659822333d94bb023fbfdac169b87fd20 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 8 Mar 2022 13:02:00 +0100 Subject: [PATCH 178/347] maint --- autoPyTorch/datasets/time_series_dataset.py | 8 +-- .../setup/network/forecasting_architecture.py | 51 ++++++++++++------- .../forecasting_backbone/cells.py | 5 +- 3 files changed, 35 insertions(+), 29 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 1ea7eed7e..10bc838bb 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -265,14 +265,8 @@ def __init__(self, "configuration space on the hyperparameter window_size, if you want to adapt this value" "you could pass freq with a numerical value") freq_value = SEASONALITY_MAP.get(freq, None) - if isinstance(freq, list): - if np.max(freq) < n_prediction_steps: - tmp_freq = n_prediction_steps - else: - tmp_freq = min([freq_value for freq_value in freq if freq_value >= n_prediction_steps]) - freq_value = tmp_freq else: - freq_value = min(1, n_prediction_steps) + freq_value = freq if isinstance(freq_value, list): if np.max(freq_value) < n_prediction_steps: diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index e609ace29..3fd95b5a3 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -375,13 +375,13 @@ def pre_processing(self, if past_features is None: x_past = {'past_targets': x_past.to(device=self.device), 'features': torch.zeros((batch_size, length_past, 1), - dtype=past_targets.dtype, device=self.device)} + dtype=past_targets.dtype, device=self.device)} else: x_past = None if length_future > 0: if future_features is None: x_future = {'features': torch.zeros((batch_size, length_future, 1), - dtype=past_targets.dtype, device=self.device)} + dtype=past_targets.dtype, device=self.device)} else: x_future = 
None x_past, x_future, x_static, static_context_initial_hidden = self.variable_selector( @@ -495,8 +495,8 @@ def decoder_select_variable(self, future_targets: torch.tensor, future_features: if future_features is None: x_future = { 'future_prediction': future_targets.to(self.device), - 'features': torch.zeros((batch_size, length_future, 0), - dtype=future_targets.dtype, device=self.device)} + 'features': torch.zeros((batch_size, length_future, 1), + dtype=future_targets.dtype, device=self.device)} _, x_future, _, _ = self.variable_selector(x_past=None, x_future=x_future, x_static=None, @@ -520,6 +520,7 @@ def forward(self, past_features=past_features, future_features=future_features, static_features=static_features, + length_past=self.window_size, length_future=0, variable_selector_kwargs={'cache_static_contex': True} ) @@ -602,7 +603,7 @@ def forward(self, decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output_all, encoder_lengths=encoder_lengths, - decoder_length = idx_pred + 1, + decoder_length=idx_pred + 1, static_embedding=x_static )[:, -1:] @@ -612,8 +613,10 @@ def forward(self, all_predictions.append(net_output) - if self.output_type != 'distribution': + if self.output_type == 'regression': all_predictions = torch.cat(all_predictions, dim=1) + elif self.output_type == 'quantile': + all_predictions = torch.cat([self.pred_from_net_output(pred) for pred in all_predictions], dim=1) else: all_predictions = self.pred_from_net_output(all_predictions) @@ -698,10 +701,15 @@ def predict(self, past_targets: torch.Tensor, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None + static_features: Optional[torch.Tensor] = None, + encoder_lengths: Optional[torch.LongTensor] = None, ): - net_output = self(past_targets, past_features, future_features) - if self.output_type != 'distribution': + net_output = self(past_targets=past_targets, + past_features=past_features, + future_features=future_features, + static_features=static_features, + encoder_lengths=encoder_lengths) + if self.output_type == 'regression': return self.pred_from_net_output(net_output) else: return net_output @@ -736,8 +744,8 @@ def decoder_select_variable(self, future_targets: torch.tensor, future_features: if future_features is None: x_future = { 'future_prediction': future_targets.to(self.device), - 'features': torch.zeros((batch_size, length_future, 0), - dtype=future_targets.dtype, device=self.device)} + 'features': torch.zeros((batch_size, length_future, 1), + dtype=future_targets.dtype, device=self.device)} _, x_future, _, _ = self.variable_selector(x_past=None, x_future=x_future, x_static=None, @@ -782,8 +790,8 @@ def forward(self, if past_features is None: if past_features is None: x_past = {'past_targets': targets_all.to(device=self.device), - 'features': torch.zeros((batch_size, length_past, 0), - dtype=targets_all.dtype, device=self.device)} + 'features': torch.zeros((batch_size, length_past, 1), + dtype=targets_all.dtype, device=self.device)} x_input, _, _, static_context_initial_hidden = self.variable_selector(x_past=x_past, x_future=None, @@ -837,8 +845,8 @@ def forward(self, if past_features is None: if past_features is None: x_past = {'past_targets': past_targets.to(device=self.device), - 'features': torch.zeros((batch_size, length_past, 0), - dtype=past_targets.dtype, device=self.device)} + 'features': torch.zeros((batch_size, length_past, 1), + dtype=past_targets.dtype, 
device=self.device)} x_past, _, _, static_context_initial_hidden = self.variable_selector(x_past=x_past, x_future=None, @@ -937,8 +945,8 @@ def forward(self, if past_features is None: if past_features is None: x_next = {'past_targets': x_next, - 'features': torch.zeros((batch_size, 1, 0), - dtype=x_next.dtype, device=self.device)} + 'features': torch.zeros((batch_size, 1, 1), + dtype=x_next.dtype, device=self.device)} x_next, _, _, _ = self.variable_selector(x_past=x_next, x_future=None, @@ -976,9 +984,14 @@ def predict(self, past_targets: torch.Tensor, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None + static_features: Optional[torch.Tensor] = None, + encoder_lengths: Optional[torch.LongTensor] = None, ): - net_output = self(past_targets, past_features, future_features) + net_output = self(past_targets=past_targets, + past_features=past_features, + future_features=future_features, + static_features=static_features, + encoder_lengths=encoder_lengths) return net_output diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 6e0d6cb7a..f52bde9c8 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -186,7 +186,6 @@ def __init__(self, first_encoder_output_shape = network_encoder['block_1'].encoder_output_shape[-1] static_input_sizes = dataset_properties['static_features_shape'] self.hidden_size = first_encoder_output_shape - variable_selector = nn.ModuleDict() self._device = torch.device('cpu') @@ -243,7 +242,7 @@ def __init__(self, context_size=self.hidden_size, single_variable_grns={} if not network_structure.share_single_variable_networks - else variable_selector['shared_single_variable_grns'], + else self.shared_single_variable_grns, ) self.decoder_variable_selection = VariableSelectionNetwork( @@ -254,7 +253,7 @@ def __init__(self, context_size=self.hidden_size, single_variable_grns={} if not network_structure.share_single_variable_networks - else variable_selector['shared_single_variable_grns'], + else self.shared_single_variable_grns, ) self.static_context_variable_selection = GatedResidualNetwork( From 75570c270a8b17c929a737d69d0b7a9077823728 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 8 Mar 2022 14:28:57 +0100 Subject: [PATCH 179/347] maint --- .../forecasting_training_loss/DistributionLoss.py | 14 +++++++++++--- .../network_backbone/forecasting_backbone/cells.py | 2 +- .../base_forecasting_decoder.py | 3 ++- autoPyTorch/pipeline/components/training/losses.py | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py index 7c3feda3d..87f936c93 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py @@ -3,6 +3,7 @@ from ConfigSpace import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter +from ConfigSpace.conditions import EqualsCondition from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( 
ALL_DISTRIBUTIONS, @@ -72,7 +73,14 @@ def get_hyperparameter_search_space( ) -> ConfigurationSpace: cs = ConfigurationSpace() add_hyperparameter(cs, dist_cls, CategoricalHyperparameter) - add_hyperparameter(cs, forecast_strategy, CategoricalHyperparameter) - add_hyperparameter(cs, num_samples, UniformIntegerHyperparameter) - add_hyperparameter(cs, aggregation, CategoricalHyperparameter) + + forecast_strategy = get_hyperparameter(forecast_strategy, CategoricalHyperparameter) + num_samples = get_hyperparameter(num_samples, UniformIntegerHyperparameter) + aggregation = get_hyperparameter(aggregation, CategoricalHyperparameter) + + cs.add_hyperparameters([forecast_strategy, num_samples, aggregation]) + + cond_n_samples = EqualsCondition(num_samples, forecast_strategy, 'sample') + cond_agg = EqualsCondition(aggregation, forecast_strategy, 'sample') + cs.add_conditions([cond_n_samples, cond_agg]) return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index f52bde9c8..961da1f55 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -69,7 +69,7 @@ def __init__(self, input_size=n_encoder_output, hidden_size=n_encoder_output, output_size=d_model, - dropout=self.dropout_rate if self.use_dropout else None, + dropout=dropout, residual=True, ) self.enrich_with_static = False diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index bf343a212..27deb82ac 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -105,7 +105,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: n_prediction_heads=self.n_prediction_heads, dataset_properties=X['dataset_properties'] ) - self.decoder_input_shape = encoder_output_shape + + self.decoder_input_shape = future_variable_input return self diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index b555e6f2a..6d688d498 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -101,7 +101,7 @@ def forward(self, losses_all = [] for q, y_pred in zip(self.quantiles, input): diff = target_tensor - y_pred - loss_q = torch.max(q * diff, (1-q) * diff) + loss_q = torch.max(q * diff, (q - 1) * diff) losses_all.append(loss_q.unsqueeze(0)) losses_all = torch.concat(losses_all) From 2f954cdc7fdb7681d3b4079d0f483f5721ba60bd Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 8 Mar 2022 18:18:32 +0100 Subject: [PATCH 180/347] maint --- .../setup/network_backbone/forecasting_backbone/cells.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 961da1f55..ece7d0109 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -119,6 +119,7 @@ def forward(self, # Attention encoder_lengths = torch.where(encoder_lengths < self.window_size, encoder_lengths, self.window_size) + encoder_lengths = encoder_lengths.to(self.device) attn_output, attn_output_weights = self.attention_fusion( q=attn_input[:, self.window_size:], # query only for predictions k=attn_input, @@ -540,4 +541,4 @@ def forward(self, else: self.cached_intermediate_state[i] = x x = fx - return x \ No newline at end of file + return x From 31f8ddcf56f50e624c4bd101d88f0989a5f4e08f Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 8 Mar 2022 19:46:12 +0100 Subject: [PATCH 181/347] maint --- .../components/setup/network/forecasting_architecture.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 3fd95b5a3..e62700e5a 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -1016,9 +1016,10 @@ def forward(self, forcast_shape = [batch_size, self.n_prediction_steps, *output_shape] forecast = torch.zeros(forcast_shape).to(self.device).flatten(1) - backcast, _ = self.encoder(past_targets) + backcast, _ = self.encoder(past_targets, [None]) + backcast = backcast[0] for block in self.decoder.decoder['block_1']: - backcast_block, forecast_block = block(backcast) + backcast_block, forecast_block = block([None], backcast) backcast = backcast - backcast_block forecast = forecast + forecast_block From 7f4911eff62599358ef149d3123c2fe3f4a249c0 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 8 Mar 2022 21:36:14 +0100 Subject: [PATCH 182/347] maint --- .../forecasting_decoder/TransformerDecoder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index 0e39af488..7e52a0f7d 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -44,7 +44,7 @@ def __init__(self, lagged_value: Optional[Union[List, np.ndarray]] = None): super().__init__() self.lagged_value = lagged_value - in_features = in_features if self.lagged_value is None else len(self.lagged_value) * in_features + in_features = in_features self.input_layer = [nn.Linear(in_features, d_model, bias=False)] if use_positional_decoder: @@ -60,13 +60,13 @@ def __init__(self, self.transformer_decoder_layers = nn.TransformerDecoder(decoder_layer=transformer_decoder_layers, num_layers=num_layers, norm=norm) - self.tgt_mask = nn.Transformer.generate_square_subsequent_mask(self.n_prediction_steps) + self.tgt_mask = nn.Transformer.generate_square_subsequent_mask(n_prediction_steps) def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor): output = self.input_layer(x_future) if self.training: output = self.transformer_decoder_layers(output, encoder_output, - tgt_mask=self.tgt_mask.to(self.device)) + tgt_mask=self.tgt_mask.to(encoder_output.device)) else: output = 
self.transformer_decoder_layers(output, encoder_output) return output From 2e31fdbb84f1f6081948ed86c2c12b91832aa886 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 8 Mar 2022 21:37:32 +0100 Subject: [PATCH 183/347] forecasting init configs --- .../configs/forecasting_init_cfgs.json | 317 ++++++++++-------- 1 file changed, 183 insertions(+), 134 deletions(-) diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index cb10f101d..e1114363e 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -22,125 +22,149 @@ "models": { "MLP": { "loss:__choice__": "DistributionLoss", - "network:net_out_type": "distribution", "loss:DistributionLoss:dist_cls": "studentT", - "network_backbone:__choice__": "MLPEncoder", - "network_backbone:MLPEncoder:num_groups": 1, - "network_backbone:MLPEncoder:num_units_1": 40, - "network_backbone:MLPEncoder:activation": "relu", - "network_backbone:MLPEncoder:use_dropout": false, - "network_backbone:MLPEncoder:normalization": "NoNorm", - "network_backbone:MLPDecoder:num_layers": 0, - "network_backbone:MLPDecoder:units_local_layer": 40, - "network_backbone:MLPDecoder:auto_regressive": false, - "network_backbone:MLPDecoder:has_local_layer": true, - "network:forecast_strategy": "sample", - "network:aggregation": "median", - "network:num_samples": 100 + "loss:DistributionLoss:forecast_strategy": "sample", + "loss:DistributionLoss:aggregation": "median", + "loss:DistributionLoss:num_samples": 100, + "network_backbone:__choice__": "flat_encoder", + "network_backbone:flat_encoder:__choice__": "MLPEncoder", + "network_backbone:flat_encoder:MLPEncoder:num_groups": 1, + "network_backbone:flat_encoder:MLPEncoder:num_units_1": 40, + "network_backbone:flat_encoder:MLPEncoder:activation": "relu", + "network_backbone:flat_encoder:MLPEncoder:use_dropout": false, + "network_backbone:flat_encoder:MLPEncoder:normalization": "NoNorm", + "network_backbone:flat_encoder:MLPDecoder:num_layers": 0, + "network_backbone:flat_encoder:MLPDecoder:has_local_layer": true, + "network_backbone:flat_encoder:MLPDecoder:units_local_layer": 40 }, "DeepAR": { "loss:__choice__": "DistributionLoss", - "network:net_out_type": "distribution", "loss:DistributionLoss:dist_cls": "studentT", - "network_backbone:__choice__": "RNNEncoder", - "network_backbone:RNNEncoder:cell_type": "lstm", - "network_backbone:RNNEncoder:num_layers": 2, - "network_backbone:RNNEncoder:hidden_size": 40, - "network_backbone:RNNEncoder:bidirectional": false, - "network_backbone:RNNEncoder:use_dropout": true, - "network_backbone:RNNEncoder:dropout": 0.1, - "network_backbone:RNNEncoder:decoder_type": "MLPDecoder", - "network_backbone:MLPDecoder:num_layers": 0, - "network_backbone:MLPDecoder:auto_regressive": true, - "network:forecast_strategy": "sample", - "network:aggregation": "median", - "network:num_samples": 100 + "loss:DistributionLoss:forecast_strategy": "sample", + "loss:DistributionLoss:aggregation": "median", + "loss:DistributionLoss:num_samples": 100, + "network_backbone:__choice__": "seq_encoder", + "network_backbone:seq_encoder:skip_connection": false, + "network_backbone:seq_encoder:num_blocks": 1, + "network_backbone:seq_encoder:use_temporal_fusion": false, + "network_backbone:seq_encoder:variable_selection": false, + "network_backbone:seq_encoder:block_1:__choice__": "RNNEncoder", + "network_backbone:seq_encoder:decoder_auto_regressive": true, + "network_backbone:seq_encoder:block_1:RNNEncoder:cell_type": 
"lstm", + "network_backbone:seq_encoder:block_1:RNNEncoder:num_layers": 2, + "network_backbone:seq_encoder:block_1:RNNEncoder:hidden_size": 40, + "network_backbone:seq_encoder:block_1:RNNEncoder:bidirectional": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:use_dropout": true, + "network_backbone:seq_encoder:block_1:RNNEncoder:dropout": 0.1, + "network_backbone:seq_encoder:block_1:RNNEncoder:decoder_type": "MLPDecoder", + "network_backbone:seq_encoder:block_1:MLPDecoder:num_layers": 0, + "network_backbone:seq_encoder:block_1:MLPDecoder:auto_regressive": true }, "Seq2Seq-RNN2MLP": { "loss:__choice__": "DistributionLoss", - "network:net_out_type": "distribution", "loss:DistributionLoss:dist_cls": "studentT", - "network_backbone:__choice__": "RNNEncoder", - "network_backbone:RNNEncoder:cell_type": "gru", - "network_backbone:RNNEncoder:num_layers": 1, - "network_backbone:RNNEncoder:hidden_size": 50, - "network_backbone:RNNEncoder:bidirectional": true, - "network_backbone:RNNEncoder:use_dropout": false, - "network_backbone:RNNEncoder:decoder_type": "MLPDecoder", - "network_backbone:MLPDecoder:num_layers": 0, - "network_backbone:MLPDecoder:auto_regressive": false, - "network_backbone:MLPDecoder:has_local_layer": true, - "network_backbone:MLPDecoder:units_local_layer": 30, - "network:forecast_strategy": "sample", - "network:aggregation": "median", - "network:num_samples": 100 + "loss:DistributionLoss:forecast_strategy": "sample", + "loss:DistributionLoss:aggregation": "median", + "loss:DistributionLoss:num_samples": 100, + "network_backbone:__choice__": "seq_encoder", + "network_backbone:seq_encoder:skip_connection": false, + "network_backbone:seq_encoder:num_blocks": 1, + "network_backbone:seq_encoder:use_temporal_fusion": false, + "network_backbone:seq_encoder:variable_selection": false, + "network_backbone:seq_encoder:block_1:__choice__": "RNNEncoder", + "network_backbone:seq_encoder:decoder_auto_regressive": true, + "network_backbone:seq_encoder:block_1:RNNEncoder:cell_type": "gru", + "network_backbone:seq_encoder:block_1:RNNEncoder:num_layers": 1, + "network_backbone:seq_encoder:block_1:RNNEncoder:hidden_size": 50, + "network_backbone:seq_encoder:block_1:RNNEncoder:bidirectional": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:use_dropout": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:decoder_type": "MLPDecoder", + "network_backbone:seq_encoder:block_1:MLPDecoder:num_layers": 0, + "network_backbone:seq_encoder:block_1:MLPDecoder:auto_regressive": false, + "network_backbone:seq_encoder:block_1:MLPDecoder:has_local_layer": true, + "network_backbone:seq_encoder:block_1:MLPDecoder:units_local_layer": 30 }, "Seq2Seq-TCN2MLP": { "loss:__choice__": "DistributionLoss", - "network:net_out_type": "distribution", "loss:DistributionLoss:dist_cls": "studentT", - "network_backbone:__choice__": "TCNEncoder", - "network_backbone:TCNEncoder:use_dropout": false, - "network_backbone:TCNEncoder:num_blocks": 3, - "network_backbone:TCNEncoder:num_filters_1": 30, - "network_backbone:TCNEncoder:kernel_size_1": 7, - "network_backbone:TCNEncoder:num_filters_2": 30, - "network_backbone:TCNEncoder:kernel_size_2": 3, - "network_backbone:TCNEncoder:num_filters_3": 30, - "network_backbone:TCNEncoder:kernel_size_3": 3, - "network_backbone:MLPDecoder:num_layers": 0, - "network_backbone:MLPDecoder:auto_regressive": false, - "network_backbone:MLPDecoder:has_local_layer": false, - "network:forecast_strategy": "sample", - "network:aggregation": "median", - "network:num_samples": 100 + 
"loss:DistributionLoss:forecast_strategy": "sample", + "loss:DistributionLoss:aggregation": "median", + "loss:DistributionLoss:num_samples": 100, + "network_backbone:__choice__": "seq_encoder", + "network_backbone:seq_encoder:skip_connection": false, + "network_backbone:seq_encoder:num_blocks": 1, + "network_backbone:seq_encoder:use_temporal_fusion": false, + "network_backbone:seq_encoder:variable_selection": false, + "network_backbone:seq_encoder:block_1:__choice__": "TCNEncoder", + "network_backbone:seq_encoder:decoder_auto_regressive": true, + "network_backbone:seq_encoder:block_1:TCNEncoder:use_dropout": false, + "network_backbone:seq_encoder:block_1:TCNEncoder:num_blocks": 3, + "network_backbone:seq_encoder:block_1:TCNEncoder:num_filters_1": 30, + "network_backbone:seq_encoder:block_1:TCNEncoder:kernel_size_1": 7, + "network_backbone:seq_encoder:block_1:TCNEncoder:num_filters_2": 30, + "network_backbone:seq_encoder:block_1:TCNEncoder:kernel_size_2": 3, + "network_backbone:seq_encoder:block_1:TCNEncoder:num_filters_3": 30, + "network_backbone:seq_encoder:block_1:TCNEncoder:kernel_size_3": 3, + "network_backbone:seq_encoder:block_1:MLPDecoder:num_layers": 0, + "network_backbone:seq_encoder:block_1:MLPDecoder:auto_regressive": false, + "network_backbone:seq_encoder:block_1:MLPDecoder:has_local_layer": false }, "Seq2Seq-RNN2RNN": { "loss:__choice__": "DistributionLoss", - "network:net_out_type": "distribution", "loss:DistributionLoss:dist_cls": "studentT", - "network_backbone:__choice__": "RNNEncoder", - "network_backbone:RNNEncoder:cell_type": "gru", - "network_backbone:RNNEncoder:num_layers": 3, - "network_backbone:RNNEncoder:hidden_size": 32, - "network_backbone:RNNEncoder:bidirectional": true, - "network_backbone:RNNEncoder:use_dropout": false, - "network_backbone:RNNEncoder:decoder_type": "RNNDecoder", - "network_backbone:RNNDecoder:decoder_type": "RNNDecoder", - "network:forecast_strategy": "mean" + "loss:DistributionLoss:forecast_strategy": "mean", + "network_backbone:__choice__": "seq_encoder", + "network_backbone:seq_encoder:skip_connection": false, + "network_backbone:seq_encoder:num_blocks": 1, + "network_backbone:seq_encoder:use_temporal_fusion": false, + "network_backbone:seq_encoder:variable_selection": false, + "network_backbone:seq_encoder:block_1:__choice__": "RNNEncoder", + "network_backbone:seq_encoder:decoder_auto_regressive": true, + "network_backbone:seq_encoder:block_1:RNNEncoder:cell_type": "gru", + "network_backbone:seq_encoder:block_1:RNNEncoder:num_layers": 3, + "network_backbone:seq_encoder:block_1:RNNEncoder:hidden_size": 32, + "network_backbone:seq_encoder:block_1:RNNEncoder:bidirectional": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:use_dropout": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:decoder_type": "RNNDecoder", + "network_backbone:seq_encoder:block_1:RNNDecoder:decoder_type": "RNNDecoder" }, "Seq2Seq-Transformer2Transformer": { "loss:__choice__": "DistributionLoss", - "network:net_out_type": "distribution", "loss:DistributionLoss:dist_cls": "studentT", - "network_backbone:__choice__": "TransformerEncoder", - "network_backbone:TransformerEncoder:d_model_log": 5, - "network_backbone:TransformerEncoder:activation": "gelu", - "network_backbone:TransformerEncoder:num_layers": 1, - "network_backbone:TransformerEncoder:decoder_type": "TransformerDecoder", - "network_backbone:TransformerEncoder:use_dropout": true, - "network_backbone:TransformerEncoder:use_positional_encoder": true, - 
"network_backbone:TransformerEncoder:dropout_positional_encoder": 0.1, - "network_backbone:TransformerEncoder:d_feed_forward_log": 7, - "network_backbone:TransformerEncoder:n_head_log": 3, - "network_backbone:TransformerEncoder:layer_norm_eps": 1e-05, - "network_backbone:TransformerEncoder:dropout": 0.1, - "network_backbone:TransformerEncoder:use_layer_norm_output": true, - "network_backbone:TransformerEncoder:layer_norm_eps_output": 1e-05, - "network_backbone:TransformerDecoder:activation": "gelu", - "network_backbone:TransformerDecoder:num_layers": 1, - "network_backbone:TransformerDecoder:use_dropout": true, - "network_backbone:TransformerDecoder:use_positional_decoder": true, - "network_backbone:TransformerDecoder:dropout_positional_decoder": 0.1, - "network_backbone:TransformerDecoder:d_feed_forward_log": 7, - "network_backbone:TransformerDecoder:n_head_log": 3, - "network_backbone:TransformerDecoder:layer_norm_eps": 1e-05, - "network_backbone:TransformerDecoder:dropout": 0.1, - "network_backbone:TransformerDecoder:use_layer_norm_output": true, - "network_backbone:TransformerDecoder:layer_norm_eps_output": 1e-05, - "network:forecast_strategy": "sample", - "network:aggregation": "median", - "network:num_samples": 100 + "loss:DistributionLoss:forecast_strategy": "sample", + "loss:DistributionLoss:aggregation": "median", + "loss:DistributionLoss:num_samples": 100, + "network_backbone:__choice__": "seq_encoder", + "network_backbone:seq_encoder:skip_connection": false, + "network_backbone:seq_encoder:num_blocks": 1, + "network_backbone:seq_encoder:use_temporal_fusion": false, + "network_backbone:seq_encoder:variable_selection": false, + "network_backbone:seq_encoder:decoder_auto_regressive": true, + "network_backbone:seq_encoder:block_1:__choice__": "TransformerEncoder", + "network_backbone:seq_encoder:block_1:TransformerEncoder:d_model_log": 5, + "network_backbone:seq_encoder:block_1:TransformerEncoder:activation": "gelu", + "network_backbone:seq_encoder:block_1:TransformerEncoder:num_layers": 1, + "network_backbone:seq_encoder:block_1:TransformerEncoder:decoder_type": "TransformerDecoder", + "network_backbone:seq_encoder:block_1:TransformerEncoder:use_dropout": true, + "network_backbone:seq_encoder:block_1:TransformerEncoder:use_positional_encoder": true, + "network_backbone:seq_encoder:block_1:TransformerEncoder:dropout_positional_encoder": 0.1, + "network_backbone:seq_encoder:block_1:TransformerEncoder:d_feed_forward_log": 7, + "network_backbone:seq_encoder:block_1:TransformerEncoder:n_head_log": 3, + "network_backbone:seq_encoder:block_1:TransformerEncoder:layer_norm_eps": 1e-05, + "network_backbone:seq_encoder:block_1:TransformerEncoder:dropout": 0.1, + "network_backbone:seq_encoder:block_1:TransformerEncoder:use_layer_norm_output": true, + "network_backbone:seq_encoder:block_1:TransformerEncoder:layer_norm_eps_output": 1e-05, + "network_backbone:seq_encoder:block_1:TransformerDecoder:activation": "gelu", + "network_backbone:seq_encoder:block_1:TransformerDecoder:num_layers": 1, + "network_backbone:seq_encoder:block_1:TransformerDecoder:use_dropout": true, + "network_backbone:seq_encoder:block_1:TransformerDecoder:use_positional_decoder": true, + "network_backbone:seq_encoder:block_1:TransformerDecoder:dropout_positional_decoder": 0.1, + "network_backbone:seq_encoder:block_1:TransformerDecoder:d_feed_forward_log": 7, + "network_backbone:seq_encoder:block_1:TransformerDecoder:n_head_log": 3, + "network_backbone:seq_encoder:block_1:TransformerDecoder:layer_norm_eps": 1e-05, + 
"network_backbone:seq_encoder:block_1:TransformerDecoder:dropout": 0.1, + "network_backbone:seq_encoder:block_1:TransformerDecoder:use_layer_norm_output": true, + "network_backbone:seq_encoder:block_1:TransformerDecoder:layer_norm_eps_output": 1e-05 }, "NBEATS-I": { "target_scaler:__choice__": "TargetNoScaler", @@ -148,46 +172,71 @@ "data_loader:backcast_period": 2, "loss:__choice__": "RegressionLoss", "loss:RegressionLoss:loss_name": "mase", - "network:net_out_type": "regression", - "network_backbone:__choice__": "NBEATSEncoder", - "network_backbone:NBEATSDecoder:backcast_loss_ratio": 0.0, - "network_backbone:NBEATSDecoder:normalization": "NoNorm", - "network_backbone:NBEATSDecoder:activation": "relu", - "network_backbone:NBEATSDecoder:n_beats_type": "I", - "network_backbone:NBEATSDecoder:use_dropout_i": true, - "network_backbone:NBEATSDecoder:num_stacks_i": 2, - "network_backbone:NBEATSDecoder:num_blocks_i_1": 3, - "network_backbone:NBEATSDecoder:num_layers_i_1": 2, - "network_backbone:NBEATSDecoder:width_i_1": 256, - "network_backbone:NBEATSDecoder:weight_sharing_i_1": true, - "network_backbone:NBEATSDecoder:stack_type_i_1": "trend", - "network_backbone:NBEATSDecoder:expansion_coefficient_length_i_trend_1": 3, - "network_backbone:NBEATSDecoder:dropout_i_1": 0.1, - "network_backbone:NBEATSDecoder:num_blocks_i_2": 3, - "network_backbone:NBEATSDecoder:num_layers_i_2": 2, - "network_backbone:NBEATSDecoder:width_i_2": 512, - "network_backbone:NBEATSDecoder:weight_sharing_i_2": true, - "network_backbone:NBEATSDecoder:stack_type_i_2": "seasonality", - "network_backbone:NBEATSDecoder:expansion_coefficient_length_i_seasonality_2": 7, - "network_backbone:NBEATSDecoder:dropout_i_2": 0.1 + "network_backbone:__choice__": "flat_encoder", + "network_backbone:flat_encoder:__choice__": "NBEATSEncoder", + "network_backbone:flat_encoder:NBEATSDecoder:backcast_loss_ratio": 0.0, + "network_backbone:flat_encoder:NBEATSDecoder:normalization": "NoNorm", + "network_backbone:flat_encoder:NBEATSDecoder:activation": "relu", + "network_backbone:flat_encoder:NBEATSDecoder:n_beats_type": "I", + "network_backbone:flat_encoder:NBEATSDecoder:use_dropout_i": true, + "network_backbone:flat_encoder:NBEATSDecoder:num_stacks_i": 2, + "network_backbone:flat_encoder:NBEATSDecoder:num_blocks_i_1": 3, + "network_backbone:flat_encoder:NBEATSDecoder:num_layers_i_1": 2, + "network_backbone:flat_encoder:NBEATSDecoder:width_i_1": 256, + "network_backbone:flat_encoder:NBEATSDecoder:weight_sharing_i_1": true, + "network_backbone:flat_encoder:NBEATSDecoder:stack_type_i_1": "trend", + "network_backbone:flat_encoder:NBEATSDecoder:expansion_coefficient_length_i_trend_1": 3, + "network_backbone:flat_encoder:NBEATSDecoder:dropout_i_1": 0.1, + "network_backbone:flat_encoder:NBEATSDecoder:num_blocks_i_2": 3, + "network_backbone:flat_encoder:NBEATSDecoder:num_layers_i_2": 2, + "network_backbone:flat_encoder:NBEATSDecoder:width_i_2": 512, + "network_backbone:flat_encoder:NBEATSDecoder:weight_sharing_i_2": true, + "network_backbone:flat_encoder:NBEATSDecoder:stack_type_i_2": "seasonality", + "network_backbone:flat_encoder:NBEATSDecoder:expansion_coefficient_length_i_seasonality_2": 7, + "network_backbone:flat_encoder:NBEATSDecoder:dropout_i_2": 0.1 }, "NBEATS-G": { "loss:__choice__": "RegressionLoss", "loss:RegressionLoss:loss_name": "mape", - "network:net_out_type": "regression", - "network_backbone:__choice__": "NBEATSEncoder", - "network_backbone:NBEATSDecoder:backcast_loss_ratio": 0.0, - "network_backbone:NBEATSDecoder:normalization": 
"NoNorm", - "network_backbone:NBEATSDecoder:activation": "relu", - "network_backbone:NBEATSDecoder:n_beats_type": "G", - "network_backbone:NBEATSDecoder:use_dropout_g": true, - "network_backbone:NBEATSDecoder:num_stacks_g": 30, - "network_backbone:NBEATSDecoder:num_blocks_g": 1, - "network_backbone:NBEATSDecoder:num_layers_g": 4, - "network_backbone:NBEATSDecoder:width_g": 512, - "network_backbone:NBEATSDecoder:weight_sharing_g": false, - "network_backbone:NBEATSDecoder:expansion_coefficient_length_g": 32, - "network_backbone:NBEATSDecoder:dropout_g": 0.1 + "network_backbone:__choice__": "flat_encoder", + "network_backbone:flat_encoder:__choice__": "NBEATSEncoder", + "network_backbone:flat_encoder:NBEATSDecoder:backcast_loss_ratio": 0.0, + "network_backbone:flat_encoder:NBEATSDecoder:normalization": "NoNorm", + "network_backbone:flat_encoder:NBEATSDecoder:activation": "relu", + "network_backbone:flat_encoder:NBEATSDecoder:n_beats_type": "G", + "network_backbone:flat_encoder:NBEATSDecoder:use_dropout_g": true, + "network_backbone:flat_encoder:NBEATSDecoder:num_stacks_g": 30, + "network_backbone:flat_encoder:NBEATSDecoder:num_blocks_g": 1, + "network_backbone:flat_encoder:NBEATSDecoder:num_layers_g": 4, + "network_backbone:flat_encoder:NBEATSDecoder:width_g": 512, + "network_backbone:flat_encoder:NBEATSDecoder:weight_sharing_g": false, + "network_backbone:flat_encoder:NBEATSDecoder:expansion_coefficient_length_g": 32, + "network_backbone:flat_encoder:NBEATSDecoder:dropout_g": 0.1 + }, + "TemoporalFusionTransformer": { + "loss:__choice__": "QuantileLoss", + "loss:QuantileLoss:lower_quantile": 0.1, + "loss:QuantileLoss:upper_quantile": 0.9, + "network_backbone:__choice__": "seq_encoder", + "network_backbone:seq_encoder:skip_connection": true, + "network_backbone:seq_encoder:num_blocks": 1, + "network_backbone:seq_encoder:variable_selection": true, + "network_backbone:seq_encoder:share_single_variable_networks": false, + "network_backbone:seq_encoder:skip_connection_type": "gate_add_norm", + "network_backbone:seq_encoder:block_1:__choice__": "RNNEncoder", + "network_backbone:seq_encoder:decoder_auto_regressive": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:cell_type": "lstm", + "network_backbone:seq_encoder:block_1:RNNEncoder:num_layers": 1, + "network_backbone:seq_encoder:block_1:RNNEncoder:hidden_size": 32, + "network_backbone:seq_encoder:block_1:RNNEncoder:bidirectional": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:use_dropout": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:decoder_type": "RNNDecoder", + "network_backbone:seq_encoder:block_1:RNNDecoder:decoder_type": "RNNDecoder", + "network_backbone:seq_encoder:use_temporal_fusion": true, + "network_backbone:seq_encoder:temporal_fusion:attention_d_model_log": 5, + "network_backbone:seq_encoder:temporal_fusion:attention_n_head_log": 2, + "network_backbone:seq_encoder:temporal_fusion:use_dropout": true, + "network_backbone:seq_encoder:temporal_fusion:dropout_rate": 0.1 } } } \ No newline at end of file From 125921cabd9acd34f1f7cdea1df6c879355dc548 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 9 Mar 2022 11:25:11 +0100 Subject: [PATCH 184/347] add forbidden --- .../seq_encoder/__init__.py | 33 ++++++++-- .../pipeline/time_series_forecasting.py | 61 ++++++------------- 2 files changed, 45 insertions(+), 49 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index b7acffff0..d4a3e814a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -5,9 +5,11 @@ from sklearn.pipeline import Pipeline from ConfigSpace.hyperparameters import ( + Constant, CategoricalHyperparameter, UniformIntegerHyperparameter, - UniformFloatHyperparameter + UniformFloatHyperparameter, + OrdinalHyperparameter, ) from ConfigSpace.configuration_space import ConfigurationSpace, Configuration from ConfigSpace.conditions import ( @@ -148,7 +150,6 @@ def get_hyperparameter_search_space( if dataset_properties is None: dataset_properties = {} - # TODO static_features_shape = dataset_properties.get("static_features_shape", 0) future_feature_shapes = dataset_properties.get("future_feature_shapes", (0,)) @@ -160,7 +161,14 @@ def get_hyperparameter_search_space( share_single_variable_networks = get_hyperparameter(share_single_variable_networks, CategoricalHyperparameter) decoder_auto_regressive = get_hyperparameter(decoder_auto_regressive, CategoricalHyperparameter) - num_blocks = get_hyperparameter(num_blocks, UniformIntegerHyperparameter) + + if min_num_blocks == max_num_blocks: + num_blocks = Constant(num_blocks.hyperparameter, num_blocks.value_range[0]) + else: + num_blocks = OrdinalHyperparameter( + num_blocks.hyperparameter, + sequence=list(range(min_num_blocks, max_num_blocks + 1)) + ) skip_connection = get_hyperparameter(skip_connection, CategoricalHyperparameter) @@ -382,17 +390,30 @@ def get_hyperparameter_search_space( deep_ar_hp = ':'.join([self.deepAR_decoder_prefix, self.deepAR_decoder_name, 'auto_regressive']) if deep_ar_hp in cs: deep_ar_hp = cs.get_hyperparameter(deep_ar_hp) - forbidden_ar = ForbiddenEqualsClause(deep_ar_hp, True) + forbidden_deep_ar = ForbiddenEqualsClause(deep_ar_hp, True) if min_num_blocks == 1: if max_num_blocks > 1: if max_num_blocks - min_num_blocks > 1: forbidden = ForbiddenAndConjunction( ForbiddenInClause(num_blocks, list(range(1, max_num_blocks))), - forbidden_ar + forbidden_deep_ar ) else: - forbidden = ForbiddenAndConjunction(ForbiddenEqualsClause(num_blocks, 2), forbidden_ar) + forbidden = ForbiddenAndConjunction(ForbiddenEqualsClause(num_blocks, 2), forbidden_deep_ar) cs.add_forbidden_clause(forbidden) + + forbidden_deep_ars = [] + + hps_forbidden_deep_ar = [variable_selection, use_temporal_fusion] + for hp_forbidden_deep_ar in hps_forbidden_deep_ar: + if True in hp_forbidden_deep_ar.choices: + forbidden_deep_ars.append(ForbiddenAndConjunction( + ForbiddenEqualsClause(hp_forbidden_deep_ar, True), + forbidden_deep_ar + )) + if forbidden_deep_ars: + cs.add_forbidden_clauses(forbidden_deep_ars) + return cs def set_hyperparameters(self, diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index d2aa5aa95..0c6bb8822 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -203,45 +203,39 @@ def _get_hyperparameter_search_space(self, ar_forbidden = True - hp_auto_regressive = [] + hp_deepAR = [] for hp_name in cs.get_hyperparameter_names(): if hp_name.startswith('network_backbone:'): if hp_name.endswith(':auto_regressive'): - hp_auto_regressive.append(cs.get_hyperparameter(hp_name)) + 
hp_deepAR.append(cs.get_hyperparameter(hp_name)) - # Auto-Regressive is incompatible with regression losses + # DeepAR forbidden_losses_all = [] - if 'RegressionLoss' in hp_loss.choices: - forbidden_hp_regression_loss = ForbiddenEqualsClause(hp_loss, 'RegressionLoss') - for hp_ar in hp_auto_regressive: + losses_non_ar = [] + for loss in hp_loss.choices: + if loss != "DistributionLoss": + losses_non_ar.append(loss) + if losses_non_ar: + forbidden_hp_regression_loss = ForbiddenInClause(hp_loss, losses_non_ar) + for hp_ar in hp_deepAR: forbidden_hp_dist = ForbiddenEqualsClause(hp_ar, ar_forbidden) forbidden_hp_dist = ForbiddenAndConjunction(forbidden_hp_dist, forbidden_hp_regression_loss) forbidden_losses_all.append(forbidden_hp_dist) + network_flat_encoder_hp = cs.get_hyperparameter('network_backbone:flat_encoder:__choice__') - network_encoder_hp = cs.get_hyperparameter('network_backbone:__choice__') - - if 'MLPEncoder' in network_encoder_hp.choices: - forbidden = ['MLPEncoder'] - forbidden_deepAREncoder = [forbid for forbid in forbidden if forbid in network_encoder_hp.choices] - for hp_ar in hp_auto_regressive: - forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) - forbidden_hp_mlpencoder = ForbiddenInClause(network_encoder_hp, forbidden_deepAREncoder) - forbidden_hp_ar_mlp = ForbiddenAndConjunction(forbidden_hp_ar, forbidden_hp_mlpencoder) - forbidden_losses_all.append(forbidden_hp_ar_mlp) - - if 'MLPEncoder' in network_encoder_hp.choices: + if 'MLPEncoder' in network_flat_encoder_hp.choices: forbidden = ['MLPEncoder'] - forbidden_deepAREncoder = [forbid for forbid in forbidden if forbid in network_encoder_hp.choices] - for hp_ar in hp_auto_regressive: + forbidden_deepAREncoder = [forbid for forbid in forbidden if forbid in network_flat_encoder_hp.choices] + for hp_ar in hp_deepAR: forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) - forbidden_hp_mlpencoder = ForbiddenInClause(network_encoder_hp, forbidden_deepAREncoder) + forbidden_hp_mlpencoder = ForbiddenInClause(network_flat_encoder_hp, forbidden_deepAREncoder) forbidden_hp_ar_mlp = ForbiddenAndConjunction(forbidden_hp_ar, forbidden_hp_mlpencoder) forbidden_losses_all.append(forbidden_hp_ar_mlp) forecast_strategy = cs.get_hyperparameter('loss:DistributionLoss:forecast_strategy') if 'mean' in forecast_strategy.choices: - for hp_ar in hp_auto_regressive: + for hp_ar in hp_deepAR: forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) forbidden_hp_forecast_strategy = ForbiddenEqualsClause(forecast_strategy, 'mean') forbidden_hp_ar_forecast_strategy = ForbiddenAndConjunction(forbidden_hp_ar, @@ -251,6 +245,7 @@ def _get_hyperparameter_search_space(self, cs.add_forbidden_clauses(forbidden_losses_all) # NBEATS + network_encoder_hp = cs.get_hyperparameter("network_backbone:__choice__") forbidden_NBEATS = [] encoder_non_BEATS = [choice for choice in network_encoder_hp.choices if choice != 'flat_encoder'] loss_non_regression = [choice for choice in hp_loss.choices if choice != 'RegressionLoss'] @@ -259,10 +254,8 @@ def _get_hyperparameter_search_space(self, forbidden_encoder_NBEATS = ForbiddenInClause(network_encoder_hp, encoder_non_BEATS) forbidden_loss_non_regression = ForbiddenInClause(hp_loss, loss_non_regression) forbidden_backcast = ForbiddenEqualsClause(data_loader_backcast, True) - forbidden_backcast_false = ForbiddenEqualsClause(data_loader_backcast, False) - - hp_flat_encoder = cs.get_hyperparameter("network_backbone:flat_encoder:__choice__") + hp_flat_encoder = 
cs.get_hyperparameter("network_backbone:flat_encoder:__choice__") # Ensure that NBEATS encoder only works with NBEATS decoder if 'NBEATSEncoder' in hp_flat_encoder.choices: @@ -277,24 +270,6 @@ def _get_hyperparameter_search_space(self, cs.add_forbidden_clauses(forbidden_NBEATS) - """ - # rnn head only allow rnn backbone - if 'network_encoder' in self.named_steps.keys() and 'network_decoder' in self.named_steps.keys(): - hp_encoder_choice = cs.get_hyperparameter('network_encoder:__choice__') - hp_decoder_choice = cs.get_hyperparameter('network_decoder:__choice__') - - if 'RNNDecoder' in hp_decoder_choice.choices: - if len(hp_decoder_choice.choices) == 1 and 'RNNEncoder' not in hp_encoder_choice.choices: - raise ValueError("RNN Header is only compatible with RNNBackbone, RNNHead is not allowed to be " - "the only network head choice if the backbone choices do not contain RNN!") - encoder_choices = [choice for choice in hp_encoder_choice.choices if choice != 'RNNEncoder'] - forbidden_clause_encoder = ForbiddenInClause(hp_encoder_choice, encoder_choices) - forbidden_clause_decoder = ForbiddenEqualsClause(hp_decoder_choice, 'RNNDecoder') - - cs.add_forbidden_clause(ForbiddenAndConjunction(forbidden_clause_encoder, forbidden_clause_decoder)) - cs.get_hyperparameter_names() - """ - self.configuration_space = cs self.dataset_properties = dataset_properties return cs From 8d704d1f1df57ea61e24c77a4520f9abd4d7af27 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 10 Mar 2022 11:21:03 +0100 Subject: [PATCH 185/347] maint --- .../setup/network/forecasting_architecture.py | 1 + .../network_backbone/forecasting_backbone/cells.py | 13 +++++++++---- .../forecasting_decoder/MLPDecoder.py | 8 +++++--- .../forecasting_encoder/seq_encoder/__init__.py | 13 +++++++++++++ 4 files changed, 28 insertions(+), 7 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index e62700e5a..7f9ae854f 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -373,6 +373,7 @@ def pre_processing(self, batch_size = x_past.shape[0] if length_past > 0: if past_features is None: + length_past = x_past.shape[1] x_past = {'past_targets': x_past.to(device=self.device), 'features': torch.zeros((batch_size, length_past, 1), dtype=past_targets.dtype, device=self.device)} diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index ece7d0109..657d04962 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -70,14 +70,14 @@ def __init__(self, hidden_size=n_encoder_output, output_size=d_model, dropout=dropout, - residual=True, + residual=False, ) self.enrich_with_static = False self.attention_fusion = InterpretableMultiHeadAttention( d_model=d_model, n_head=n_head, - dropout=dropout + dropout=dropout or 0.0 ) self.post_attn_gate_norm = GateAddNorm(d_model, dropout=dropout, trainable_add=False) self.pos_wise_ff = GatedResidualNetwork(input_size=d_model, hidden_size=d_model, @@ -423,7 +423,7 @@ def forward(self, fx, hx = encoder_i(x, output_seq=output_seq_i, hx=hx) else: if self.encoder_num_hidden_states[i] == 1: - fx, hx = encoder_i(x, 
output_seq=output_seq_i, hx=hx.expand((rnn_num_layers, -1, -1))) + fx, hx = encoder_i(x, output_seq=output_seq_i, hx=hx[0].expand((rnn_num_layers, -1, -1))) else: hx = tuple(hx_i.expand(rnn_num_layers, -1, -1) for hx_i in hx) fx, hx = encoder_i(x, output_seq=output_seq_i, hx=hx) @@ -434,7 +434,10 @@ def forward(self, else: fx = encoder_i(x, output_seq=output_seq_i) if self.skip_connection: - fx = self.encoder[f'skip_connection_{block_id}'](fx, x) + if output_seq_i: + fx = self.encoder[f'skip_connection_{block_id}'](fx, x) + else: + fx = self.encoder[f'skip_connection_{block_id}'](fx, x[:, -1:]) if self.encoder_output_type[i] == EncoderOutputForm.HiddenStates: encoder2decoder.append(hx) @@ -443,6 +446,8 @@ def forward(self, elif self.encoder_output_type[i] == EncoderOutputForm.SequenceLast: if output_seq or incremental_update: encoder2decoder.append(fx) + elif output_seq_i: + encoder2decoder.append(encoder_i.get_last_seq_value(fx).squeeze(1)) else: encoder2decoder.append(fx.squeeze(1)) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index 97298f2a9..c5e6e7544 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -56,15 +56,17 @@ def _build_decoder(self, global_layers = [] in_features = encoder_output_shape[-1] num_decoder_output_features = in_features + has_local_layer = 'units_local_layer' in self.config if 'num_layers' in self.config and self.config["num_layers"] > 0: - in_features += int(np.prod(future_variable_input)) - for i in range(1, self.config["num_layers"]): + if not has_local_layer: + in_features += int(np.prod(future_variable_input)) + for i in range(1, self.config["num_layers"] + 1): global_layers.append(nn.Linear(in_features=in_features, out_features=self.config[f"units_layer_{i}"])) global_layers.append(_activations[self.config["activation"]]()) in_features = self.config[f"units_layer_{i}"] num_decoder_output_features = in_features - if 'units_local_layer' in self.config: + if has_local_layer: local_layers = [nn.Linear(in_features=in_features, out_features=self.config['units_local_layer'] * n_prediction_heads)] if 'activation' in self.config: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index d4a3e814a..091bf17bd 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -414,6 +414,19 @@ def get_hyperparameter_search_space( if forbidden_deep_ars: cs.add_forbidden_clauses(forbidden_deep_ars) + if True in skip_connection.choices: + forbidden_mlp_skip = [] + forbidden_skip = ForbiddenEqualsClause(skip_connection, True) + for i in range(1, max_num_blocks + 1): + hp_mlp_has_local_layer = f"block_{i}:MLPDecoder:has_local_layer" + if hp_mlp_has_local_layer in cs: + hp_mlp_has_local_layer = cs.get_hyperparameter(hp_mlp_has_local_layer) + forbidden_mlp_skip.append(ForbiddenAndConjunction( + 
ForbiddenEqualsClause(hp_mlp_has_local_layer, False), + forbidden_skip + )) + cs.add_forbidden_clauses(forbidden_mlp_skip) + return cs def set_hyperparameters(self, From e6466728912e8a84b1457ec52ab86791e4ff5fee Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 10 Mar 2022 12:43:15 +0100 Subject: [PATCH 186/347] maint --- .../components/training/data_loader/time_series_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py index 2f20608c7..08bab74f4 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py @@ -52,9 +52,9 @@ def pad_sequence_with_minimal_length(sequences: List[torch.Tensor], length = min(tensor.size(0), seq_max_length) # use index notation to prevent duplicate references to the tensor if batch_first: - out_tensor[i, :length, ...] = tensor[-length:] + out_tensor[i, -length:, ...] = tensor[-length:] else: - out_tensor[length:, i, ...] = tensor[-length:] + out_tensor[-length:, i, ...] = tensor[-length:] return out_tensor From a2ad3fe264ba54ef04bb292433cc62591acdd7bf Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 10 Mar 2022 15:31:04 +0100 Subject: [PATCH 187/347] maint --- .../forecasting_backbone/forecasting_decoder/MLPDecoder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index c5e6e7544..7455b5b23 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -57,15 +57,15 @@ def _build_decoder(self, in_features = encoder_output_shape[-1] num_decoder_output_features = in_features has_local_layer = 'units_local_layer' in self.config + if not has_local_layer: + in_features += int(np.prod(future_variable_input)) if 'num_layers' in self.config and self.config["num_layers"] > 0: - if not has_local_layer: - in_features += int(np.prod(future_variable_input)) for i in range(1, self.config["num_layers"] + 1): global_layers.append(nn.Linear(in_features=in_features, out_features=self.config[f"units_layer_{i}"])) global_layers.append(_activations[self.config["activation"]]()) in_features = self.config[f"units_layer_{i}"] - num_decoder_output_features = in_features + num_decoder_output_features = in_features if has_local_layer: local_layers = [nn.Linear(in_features=in_features, out_features=self.config['units_local_layer'] * n_prediction_heads)] From 200691c0631a126ec7c8d4bdab5529aa5d89d4f9 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 11 Mar 2022 11:56:44 +0100 Subject: [PATCH 188/347] remove shift data --- autoPyTorch/api/time_series_forecasting.py | 4 -- autoPyTorch/datasets/time_series_dataset.py | 43 ++++++--------------- 2 files changed, 11 insertions(+), 36 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 35a56e789..937e0e9df 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -135,7 +135,6 @@ def search( disable_file_output: List = [], load_models: 
bool = True, portfolio_selection: Optional[str] = None, - shift_input_data: bool = True, normalize_y: bool = True, suggested_init_models: Optional[List[str]] = None, custom_init_setting_path: Optional[str] = None, @@ -236,8 +235,6 @@ def search( disable_file_output (Union[bool, List]): load_models (bool), (default=True): Whether to load the models after fitting AutoPyTorch. - shift_input_data: bool - if the input data needs to be shifted normalize_y: bool if the input y values need to be normalized suggested_init_models: Optional[List[str]] @@ -279,7 +276,6 @@ def search( resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, n_prediction_steps=n_prediction_steps, - shift_input_data=shift_input_data, normalize_y=normalize_y, ) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 10bc838bb..a46dfe615 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -232,7 +232,6 @@ def __init__(self, lagged_value: Optional[List[int]] = None, n_prediction_steps: int = 1, dataset_name: Optional[str] = None, - shift_input_data: bool = True, normalize_y: bool = True, static_features: Optional[np.ndarray] = None, ): @@ -312,8 +311,6 @@ def __init__(self, self.categories = self.validator.feature_validator.categories - self.shift_input_data = shift_input_data - X, Y, sequence_lengths = self.validator.transform(X, Y) if X_test is not None: X_test, Y_test, self.sequence_lengths_tests = self.validator.transform(X_test, Y_test) @@ -809,19 +806,11 @@ def create_cross_val_splits( splits = [[() for _ in range(len(self.datasets))] for _ in range(num_splits)] for idx_seq, dataset in enumerate(self.datasets): - if self.shift_input_data: - split = self.cross_validators[cross_val_type.name](self.random_state, - num_splits, - indices=idx_start + np.arange(len(dataset)), - **kwargs) - else: - # If the data is not shifted, we need to discard the last n_prediction_steps such that we have enough - # y values - split = self.cross_validators[cross_val_type.name](self.random_state, - num_splits, - indices=idx_start + np.arange( - len(dataset) - self.n_prediction_steps), - **kwargs) + split = self.cross_validators[cross_val_type.name](self.random_state, + num_splits, + indices=idx_start + np.arange(len(dataset)), + **kwargs) + for idx_split in range(num_splits): splits[idx_split][idx_seq] = split[idx_split] idx_start += self.sequence_lengths_train[idx_seq] @@ -869,18 +858,11 @@ def create_holdout_val_split( splits = [[() for _ in range(len(self.datasets))] for _ in range(2)] idx_start = 0 for idx_seq, dataset in enumerate(self.datasets): - if self.shift_input_data: - split = self.holdout_validators[holdout_val_type.name](self.random_state, - val_share, - indices=np.arange(len(dataset)) + idx_start, - **kwargs) - else: - split = self.holdout_validators[holdout_val_type.name](self.random_state, - val_share, - indices=idx_start + np.arange( - len(dataset) - self.n_prediction_steps), - **kwargs) + split = self.holdout_validators[holdout_val_type.name](self.random_state, + val_share, + indices=np.arange(len(dataset)) + idx_start, + **kwargs) for idx_split in range(2): splits[idx_split][idx_seq] = split[idx_split] idx_start += self.sequence_lengths_train[idx_seq] @@ -908,16 +890,13 @@ def create_refit_split( splits = [[() for _ in range(len(self.datasets))] for _ in range(2)] idx_start = 0 for idx_seq, dataset in enumerate(self.datasets): - if self.shift_input_data: - split = 
[np.arange(len(dataset)), np.array([len(dataset) - 1])] - else: - last_idx = len(dataset) - self.n_prediction_steps - 1 - split = [np.arange(len(dataset) - self.n_prediction_steps), np.array([last_idx])] + split = [np.arange(len(dataset)), np.array([len(dataset) - 1])] for idx_split in range(2): splits[idx_split][idx_seq] = idx_start + split[idx_split] idx_start += self.sequence_lengths_train[idx_seq] + train_indices = np.hstack([sp for sp in splits[0]]) test_indices = np.hstack([sp for sp in splits[1]]) From 538f24e54e8deba4bd5a3c7ea0558630de00e539 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 11 Mar 2022 13:38:04 +0100 Subject: [PATCH 189/347] maint --- .../setup/network_backbone/forecasting_backbone/cells.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 657d04962..69405aa11 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -112,7 +112,7 @@ def forward(self, if self.enrich_with_static: static_context_enrichment = self.static_context_enrichment(static_embedding) attn_input = self.enrichment( - network_output, static_context_enrichment[:, None].expand(-1, self.window_size + decoder_length, -1) + network_output, static_context_enrichment[:, None].expand(-1, network_output.shape[1], -1) ) else: attn_input = self.enrichment(network_output) @@ -120,8 +120,9 @@ def forward(self, # Attention encoder_lengths = torch.where(encoder_lengths < self.window_size, encoder_lengths, self.window_size) encoder_lengths = encoder_lengths.to(self.device) + attn_output, attn_output_weights = self.attention_fusion( - q=attn_input[:, self.window_size:], # query only for predictions + q=attn_input[:, -decoder_length:], # query only for predictions k=attn_input, v=attn_input, mask=self.get_attention_mask( @@ -129,7 +130,7 @@ def forward(self, ), ) # skip connection over attention - attn_output = self.post_attn_gate_norm(attn_output, attn_input[:, self.window_size:]) + attn_output = self.post_attn_gate_norm(attn_output, attn_input[:, -decoder_length:]) output = self.pos_wise_ff(attn_output) if self.network_structure.skip_connection: From 12ccf4b09456051c501b270085ac85df2eee3be9 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 11 Mar 2022 14:16:29 +0100 Subject: [PATCH 190/347] maint --- .../forecasting_backbone/cells.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 69405aa11..5515763d5 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -502,16 +502,17 @@ def __init__(self, input_size_decoder = decoder_info[block_id].decoder_output_shape[-1] skip_size_decoder = decoder_info[block_id].decoder_input_shape[-1] - if input_size_encoder == input_size_decoder and skip_size_encoder == skip_size_decoder: - decoder[f'skip_connection_{i}'] = encoder[f'skip_connection_{i}'] - else: - if network_structure.skip_connection_type == 'add': - decoder[f'skip_connection_{i}'] = AddLayer(input_size_decoder, skip_size_decoder) - elif 
network_structure.skip_connection_type == 'gate_add_norm': - decoder[f'skip_connection_{i}'] = GateAddNorm(input_size_decoder, - hidden_size=input_size_decoder, - skip_size=skip_size_decoder, - dropout=network_structure.grn_dropout_rate) + if skip_size_decoder > 0: + if input_size_encoder == input_size_decoder and skip_size_encoder == skip_size_decoder: + decoder[f'skip_connection_{i}'] = encoder[f'skip_connection_{i}'] + else: + if network_structure.skip_connection_type == 'add': + decoder[f'skip_connection_{i}'] = AddLayer(input_size_decoder, skip_size_decoder) + elif network_structure.skip_connection_type == 'gate_add_norm': + decoder[f'skip_connection_{i}'] = GateAddNorm(input_size_decoder, + hidden_size=input_size_decoder, + skip_size=skip_size_decoder, + dropout=network_structure.grn_dropout_rate) self.cached_intermediate_state = [torch.empty(0) for _ in range(self.num_blocks + 1 - self.first_block)] self.decoder = decoder @@ -536,8 +537,9 @@ def forward(self, fx = decoder_i(x_all, encoder_output=encoder_output[i])[:, -1:] else: fx = decoder_i(x, encoder_output=encoder_output[i]) - if self.skip_connection: - fx = self.decoder[f'skip_connection_{block_id}'](fx, x) + skip_id = f'skip_connection_{block_id}' + if self.skip_connection and skip_id in self.decoder: + fx = self.decoder[skip_id](fx, x) if cache_intermediate_state: if self.decoder_has_hidden_states[i]: self.cached_intermediate_state[i] = hx From 4d6853d9321495c6fe25b24acb25487147cb7af8 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 11 Mar 2022 15:47:34 +0100 Subject: [PATCH 191/347] copy dataset_properties for each refit iteration --- autoPyTorch/api/base_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 3a724f69f..151e38d98 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1237,7 +1237,7 @@ def refit( # could alleviate the problem in algorithms that depend on # the ordering of the data. 
X = self._get_fit_dictionary( - dataset_properties=dataset_properties, + dataset_properties=copy.copy(dataset_properties), dataset=dataset, split_id=split_id) fit_and_suppress_warnings(self._logger, model, X, y=None) From 34d556a68f479d101763d4d962f7844de3607b18 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 14 Mar 2022 11:31:37 +0100 Subject: [PATCH 192/347] maint and new init --- .../configs/forecasting_init_cfgs.json | 81 ++++++------------- ...time_series_forecasting_train_evaluator.py | 5 ++ .../forecasting_target_scaling/utils.py | 2 +- .../setup/network/forecasting_architecture.py | 40 ++++++--- .../forecasting_backbone/cells.py | 56 +++++++++---- .../base_forecasting_decoder.py | 2 +- .../seq_encoder/__init__.py | 49 ++++++++--- .../pipeline/time_series_forecasting.py | 15 ++++ 8 files changed, 155 insertions(+), 95 deletions(-) diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index e1114363e..d7b97858d 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -134,37 +134,31 @@ "loss:DistributionLoss:forecast_strategy": "sample", "loss:DistributionLoss:aggregation": "median", "loss:DistributionLoss:num_samples": 100, - "network_backbone:__choice__": "seq_encoder", - "network_backbone:seq_encoder:skip_connection": false, - "network_backbone:seq_encoder:num_blocks": 1, - "network_backbone:seq_encoder:use_temporal_fusion": false, - "network_backbone:seq_encoder:variable_selection": false, - "network_backbone:seq_encoder:decoder_auto_regressive": true, - "network_backbone:seq_encoder:block_1:__choice__": "TransformerEncoder", - "network_backbone:seq_encoder:block_1:TransformerEncoder:d_model_log": 5, - "network_backbone:seq_encoder:block_1:TransformerEncoder:activation": "gelu", - "network_backbone:seq_encoder:block_1:TransformerEncoder:num_layers": 1, - "network_backbone:seq_encoder:block_1:TransformerEncoder:decoder_type": "TransformerDecoder", - "network_backbone:seq_encoder:block_1:TransformerEncoder:use_dropout": true, - "network_backbone:seq_encoder:block_1:TransformerEncoder:use_positional_encoder": true, - "network_backbone:seq_encoder:block_1:TransformerEncoder:dropout_positional_encoder": 0.1, - "network_backbone:seq_encoder:block_1:TransformerEncoder:d_feed_forward_log": 7, - "network_backbone:seq_encoder:block_1:TransformerEncoder:n_head_log": 3, - "network_backbone:seq_encoder:block_1:TransformerEncoder:layer_norm_eps": 1e-05, - "network_backbone:seq_encoder:block_1:TransformerEncoder:dropout": 0.1, - "network_backbone:seq_encoder:block_1:TransformerEncoder:use_layer_norm_output": true, - "network_backbone:seq_encoder:block_1:TransformerEncoder:layer_norm_eps_output": 1e-05, - "network_backbone:seq_encoder:block_1:TransformerDecoder:activation": "gelu", - "network_backbone:seq_encoder:block_1:TransformerDecoder:num_layers": 1, - "network_backbone:seq_encoder:block_1:TransformerDecoder:use_dropout": true, - "network_backbone:seq_encoder:block_1:TransformerDecoder:use_positional_decoder": true, - "network_backbone:seq_encoder:block_1:TransformerDecoder:dropout_positional_decoder": 0.1, - "network_backbone:seq_encoder:block_1:TransformerDecoder:d_feed_forward_log": 7, - "network_backbone:seq_encoder:block_1:TransformerDecoder:n_head_log": 3, - "network_backbone:seq_encoder:block_1:TransformerDecoder:layer_norm_eps": 1e-05, - "network_backbone:seq_encoder:block_1:TransformerDecoder:dropout": 0.1, - 
"network_backbone:seq_encoder:block_1:TransformerDecoder:use_layer_norm_output": true, - "network_backbone:seq_encoder:block_1:TransformerDecoder:layer_norm_eps_output": 1e-05 + "network_backbone:__choice__": "TransformerEncoder", + "network_backbone:TransformerEncoder:d_model_log": 5, + "network_backbone:TransformerEncoder:activation": "gelu", + "network_backbone:TransformerEncoder:num_layers": 1, + "network_backbone:TransformerEncoder:decoder_type": "TransformerDecoder", + "network_backbone:TransformerEncoder:use_dropout": true, + "network_backbone:TransformerEncoder:use_positional_encoder": true, + "network_backbone:TransformerEncoder:dropout_positional_encoder": 0.1, + "network_backbone:TransformerEncoder:d_feed_forward_log": 7, + "network_backbone:TransformerEncoder:n_head_log": 3, + "network_backbone:TransformerEncoder:layer_norm_eps": 1e-05, + "network_backbone:TransformerEncoder:dropout": 0.1, + "network_backbone:TransformerEncoder:use_layer_norm_output": true, + "network_backbone:TransformerEncoder:layer_norm_eps_output": 1e-05, + "network_backbone:TransformerDecoder:activation": "gelu", + "network_backbone:TransformerDecoder:num_layers": 1, + "network_backbone:TransformerDecoder:use_dropout": true, + "network_backbone:TransformerDecoder:use_positional_decoder": true, + "network_backbone:TransformerDecoder:dropout_positional_decoder": 0.1, + "network_backbone:TransformerDecoder:d_feed_forward_log": 7, + "network_backbone:TransformerDecoder:n_head_log": 3, + "network_backbone:TransformerDecoder:layer_norm_eps": 1e-05, + "network_backbone:TransformerDecoder:dropout": 0.1, + "network_backbone:TransformerDecoder:use_layer_norm_output": true, + "network_backbone:TransformerDecoder:layer_norm_eps_output": 1e-05 }, "NBEATS-I": { "target_scaler:__choice__": "TargetNoScaler", @@ -212,31 +206,6 @@ "network_backbone:flat_encoder:NBEATSDecoder:weight_sharing_g": false, "network_backbone:flat_encoder:NBEATSDecoder:expansion_coefficient_length_g": 32, "network_backbone:flat_encoder:NBEATSDecoder:dropout_g": 0.1 - }, - "TemoporalFusionTransformer": { - "loss:__choice__": "QuantileLoss", - "loss:QuantileLoss:lower_quantile": 0.1, - "loss:QuantileLoss:upper_quantile": 0.9, - "network_backbone:__choice__": "seq_encoder", - "network_backbone:seq_encoder:skip_connection": true, - "network_backbone:seq_encoder:num_blocks": 1, - "network_backbone:seq_encoder:variable_selection": true, - "network_backbone:seq_encoder:share_single_variable_networks": false, - "network_backbone:seq_encoder:skip_connection_type": "gate_add_norm", - "network_backbone:seq_encoder:block_1:__choice__": "RNNEncoder", - "network_backbone:seq_encoder:decoder_auto_regressive": false, - "network_backbone:seq_encoder:block_1:RNNEncoder:cell_type": "lstm", - "network_backbone:seq_encoder:block_1:RNNEncoder:num_layers": 1, - "network_backbone:seq_encoder:block_1:RNNEncoder:hidden_size": 32, - "network_backbone:seq_encoder:block_1:RNNEncoder:bidirectional": false, - "network_backbone:seq_encoder:block_1:RNNEncoder:use_dropout": false, - "network_backbone:seq_encoder:block_1:RNNEncoder:decoder_type": "RNNDecoder", - "network_backbone:seq_encoder:block_1:RNNDecoder:decoder_type": "RNNDecoder", - "network_backbone:seq_encoder:use_temporal_fusion": true, - "network_backbone:seq_encoder:temporal_fusion:attention_d_model_log": 5, - "network_backbone:seq_encoder:temporal_fusion:attention_n_head_log": 2, - "network_backbone:seq_encoder:temporal_fusion:use_dropout": true, - "network_backbone:seq_encoder:temporal_fusion:dropout_rate": 0.1 } } } 
\ No newline at end of file diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 654a4e118..411b0cdc4 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -89,6 +89,11 @@ def __init__(self, backend: Backend, queue: Queue, self.max_budget = max_budget self.min_num_test_instances = min_num_test_instances + import os + os.system("sh -c \"scontrol -d show job $SLURM_JOB_ID\"") + os.system("nvidia-smi.user") + + def fit_predict_and_loss(self) -> None: """Fit, predict and compute the loss for cross-validation and holdout""" diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py index f56920a63..b155f7107 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py @@ -25,7 +25,7 @@ def transform(self, past_targets: torch.Tensor, future_targets: Optional[torch.T scale[scale == 0.0] = 1.0 if future_targets is not None: future_targets = (future_targets - loc) / scale - return (past_targets - loc) / scale, future_targets,loc, scale + return (past_targets - loc) / scale, future_targets, loc, scale elif self.mode == "min_max": min_ = torch.min(past_targets, dim=-2, keepdim=True)[0] diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 7f9ae854f..0a426ecf6 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -567,12 +567,13 @@ def forward(self, if future_features is not None: future_features = future_features + if self.has_temporal_fusion: + decoder_output_all = None + if self.forecast_strategy != 'sample': all_predictions = [] predicted_target = past_targets[:, [-1]] past_targets = past_targets[:, :-1] - if self.has_temporal_fusion: - decoder_output_all = None for idx_pred in range(self.n_prediction_steps): predicted_target = predicted_target.cpu() if self.decoder_lagged_input: @@ -633,6 +634,13 @@ def forward(self, is_hidden_states=self.encoder.encoder_has_hidden_states, repeats=self.num_samples) + intermediate_values = self.repeat_intermediate_values([encoder_output, encoder_lengths], + is_hidden_states=[False, False], + repeats=self.num_samples) + + encoder_output = intermediate_values[0] + encoder_lengths = intermediate_values[1] + if self.decoder_lagged_input: max_lag_seq_length = max(self.decoder_lagged_value) + 1 else: @@ -650,6 +658,12 @@ def forward(self, repeats=self.num_samples, dim=0 ) if future_features is not None else None + if self.network_structure.variable_selection: + self.variable_selector.cached_static_contex = self.repeat_intermediate_values( + [self.variable_selector.cached_static_contex], + is_hidden_states=[False], + repeats=self.num_samples)[0] + for idx_pred in range(self.n_prediction_steps): if self.decoder_lagged_input: x_future = torch.cat([repeated_past_target, repeated_predicted_target.cpu()], dim=1) @@ -658,8 +672,9 @@ def forward(self, x_future = repeated_predicted_target[:, [-1]] if 
self.network_structure.variable_selection: - x_future = self.decoder_select_variable(future_targets=x_future[:, -1:], - future_features=repeated_time_feat[:, [idx_pred]]) + x_future = self.decoder_select_variable( + future_targets=x_future[:, -1:], + future_features=None if repeated_time_feat is None else repeated_time_feat[:, [idx_pred]]) else: x_future = x_future if repeated_time_feat is None else torch.cat( [repeated_time_feat[:, [idx_pred], :], x_future], dim=-1) @@ -678,7 +693,8 @@ def forward(self, decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output_all, encoder_lengths=encoder_lengths, - static_embedding=x_static + decoder_length=idx_pred + 1, + static_embedding=x_static, )[:, -1:] net_output = self.head(decoder_output) @@ -887,13 +903,15 @@ def forward(self, cache_intermediate_state=True, ) - self.repeat_intermediate_values(self.encoder.cached_intermediate_state, - is_hidden_states=self.encoder.encoder_has_hidden_states, - repeats=self.num_samples) + self.encoder.cached_intermediate_state = self.repeat_intermediate_values( + self.encoder.cached_intermediate_state, + is_hidden_states=self.encoder.encoder_has_hidden_states, + repeats=self.num_samples) if self.network_structure.variable_selection: - self.repeat_intermediate_values([self.variable_selector.cached_static_contex], - is_hidden_states=[False], - repeats=self.num_samples) + self.variable_selector.cached_static_contex = self.repeat_intermediate_values( + [self.variable_selector.cached_static_contex], + is_hidden_states=[False], + repeats=self.num_samples)[0] if self.encoder_lagged_input: max_lag_seq_length = max(max(self.encoder_lagged_value), self.window_size) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 5515763d5..1d766807d 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -61,7 +61,7 @@ def __init__(self, output_size=d_model, dropout=dropout, context_size=n_encoder_output_first, - residual=True, + residual=False, ) self.enrich_with_static = True if not hasattr(self, 'enrichment'): @@ -107,6 +107,7 @@ def forward(self, """ if self.decoder_proj_layer is not None: decoder_output = self.decoder_proj_layer(decoder_output) + network_output = torch.cat([encoder_output, decoder_output], dim=1) if self.enrich_with_static: @@ -120,15 +121,20 @@ def forward(self, # Attention encoder_lengths = torch.where(encoder_lengths < self.window_size, encoder_lengths, self.window_size) encoder_lengths = encoder_lengths.to(self.device) + mask = self.get_attention_mask(encoder_lengths=encoder_lengths, decoder_length=decoder_length) + if mask.shape[-1] < attn_input.shape[1]: + # in case that none of the samples has length greater than window_size + mask = torch.cat([ + mask.new_full((*mask.shape[:-1], attn_input.shape[1] - mask.shape[-1]), True), + mask + ], dim=-1) attn_output, attn_output_weights = self.attention_fusion( q=attn_input[:, -decoder_length:], # query only for predictions k=attn_input, v=attn_input, - mask=self.get_attention_mask( - encoder_lengths=encoder_lengths, decoder_length=decoder_length - ), - ) + mask=mask) + # skip connection over attention attn_output = self.post_attn_gate_norm(attn_output, attn_input[:, -decoder_length:]) output = self.pos_wise_ff(attn_output) @@ -201,6 +207,7 @@ def 
__init__(self, ) self.static_input_sizes = static_input_sizes if dataset_properties['uni_variant']: + """ # variable selection for encoder and decoder encoder_input_sizes = { 'past_targets': dataset_properties['input_shape'][-1], @@ -211,6 +218,23 @@ def __init__(self, } if auto_regressive: decoder_input_sizes.update({'future_prediction': dataset_properties['output_shape'][-1]}) + """ + pre_scalar = { + 'past_targets': nn.Linear(dataset_properties['input_shape'][-1], self.hidden_size), + 'features': nn.Linear(1, self.hidden_size), + } + encoder_input_sizes = { + 'past_targets': self.hidden_size, + 'features': self.hidden_size + } + decoder_input_sizes = { + 'features': self.hidden_size + } + if auto_regressive: + pre_scalar.update({'future_prediction': nn.Linear(dataset_properties['output_shape'][-1], + self.hidden_size)}) + decoder_input_sizes.update({'future_prediction': self.hidden_size}) + self.pre_scalars = {nn.ModuleDict(pre_scalar)} else: # TODO pass @@ -245,6 +269,7 @@ def __init__(self, single_variable_grns={} if not network_structure.share_single_variable_networks else self.shared_single_variable_grns, + prescalers=self.pre_scalars, ) self.decoder_variable_selection = VariableSelectionNetwork( @@ -256,6 +281,7 @@ def __init__(self, single_variable_grns={} if not network_structure.share_single_variable_networks else self.shared_single_variable_grns, + prescalers=self.pre_scalars, ) self.static_context_variable_selection = GatedResidualNetwork( @@ -289,7 +315,7 @@ def device(self, device: torch.device): self._device = device def forward(self, - x_past: Optional[Dict[str,torch.Tensor]], + x_past: Optional[Dict[str, torch.Tensor]], x_future: Optional[Dict[str, torch.Tensor]], x_static: Optional[Dict[str, torch.Tensor]] = None, length_past: int = 0, @@ -307,7 +333,8 @@ def forward(self, if self.static_input_sizes > 0: static_embedding, _ = self.static_variable_selection(x_static) else: - model_dtype = next(iter(x_past.values())).dtype if length_past > 0 else next(iter(x_future.values())).dtype + model_dtype = next(iter(x_past.values())).dtype if length_past > 0 else next( + iter(x_future.values())).dtype static_embedding = torch.zeros( (batch_size, self.hidden_size), dtype=model_dtype, device=self.device ) @@ -315,7 +342,7 @@ def forward(self, static_context_variable_selection = self.static_context_variable_selection(static_embedding)[:, None] static_context_initial_hidden = tuple(init_hidden(static_embedding) for init_hidden in - self.static_context_initial_hidden) + self.static_context_initial_hidden) if cache_static_contex: self.cached_static_contex = static_context_variable_selection self.cached_static_embedding = static_embedding @@ -370,9 +397,9 @@ def __init__(self, encoder[f'skip_connection_{block_idx}'] = AddLayer(input_size, skip_size) elif network_structure.skip_connection_type == 'gate_add_norm': encoder[f'skip_connection_{block_idx}'] = GateAddNorm(input_size, - hidden_size=input_size, - skip_size=skip_size, - dropout=network_structure.grn_dropout_rate) + hidden_size=input_size, + skip_size=skip_size, + dropout=network_structure.grn_dropout_rate) if block_id in decoder_info: if decoder_info[block_id].decoder_properties.recurrent: if decoder_info[block_id].decoder_properties.has_hidden_states: @@ -424,9 +451,10 @@ def forward(self, fx, hx = encoder_i(x, output_seq=output_seq_i, hx=hx) else: if self.encoder_num_hidden_states[i] == 1: - fx, hx = encoder_i(x, output_seq=output_seq_i, hx=hx[0].expand((rnn_num_layers, -1, -1))) + fx, hx = encoder_i(x, output_seq=output_seq_i, + 
hx=hx[0].expand((rnn_num_layers, -1, -1)).contiguous()) else: - hx = tuple(hx_i.expand(rnn_num_layers, -1, -1) for hx_i in hx) + hx = tuple(hx_i.expand(rnn_num_layers, -1, -1).contiguous() for hx_i in hx) fx, hx = encoder_i(x, output_seq=output_seq_i, hx=hx) else: if incremental_update: @@ -465,7 +493,7 @@ def forward(self, if self.has_temporal_fusion: if incremental_update: - self.cached_intermediate_state[i + 1] = torch.cat([self.cached_intermediate_state[i+1], x], dim=1) + self.cached_intermediate_state[i + 1] = torch.cat([self.cached_intermediate_state[i + 1], x], dim=1) else: self.cached_intermediate_state[i + 1] = x return encoder2decoder, x diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 27deb82ac..3163337a5 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -91,7 +91,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: if auto_regressive: if self.decoder_properties().lagged_input and hasattr(self, 'lagged_value'): future_in_features += len(self.lagged_value) * output_shape[-1] - else: + elif self.decoder_properties().recurrent: future_in_features += output_shape[-1] future_variable_input = (self.n_prediction_heads, future_in_features) else: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index 091bf17bd..a508428ec 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -79,6 +79,17 @@ def get_hyperparameter_search_space( value_range=(True, False), default_value=False ), + variable_selection_use_dropout: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="variable_selection_use_dropout", + value_range=(True, False), + default_value=False, + ), + variable_selection_dropout_rate: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="variable_selection_dropout_rate", + value_range=(0.0, 0.8), + default_value=0.1, + ), + share_single_variable_networks: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="share_single_variable_networks", value_range=(True, False), @@ -120,14 +131,12 @@ def get_hyperparameter_search_space( variable_selection (HyperparameterSearchSpace): if variable selection is applied, if True, then the first block will be attached with a variable selection block while the following will be enriched with static features. 
- share_single_variable_networks( HyperparameterSearchSpace): if single variable networks are shared between + variable_selection_use_dropout (HyperparameterSearchSpace): if variable selection network uses dropout + variable_selection_dropout_rate (HyperparameterSearchSpace): dropout rate of variable selection network + share_single_variable_networks (HyperparameterSearchSpace): if single variable networks are shared between encoder and decoder skip_connection: HyperparameterSearchSpace: if skip connection is applied use_temporal_fusion (HyperparameterSearchSpace): if temporal fusion layer is applied - tf_attention_n_head_log (HyperparameterSearchSpace): log value of tf attention dims - tf_attention_d_model_log (HyperparameterSearchSpace): log value of tf attention d model - tf_use_dropout (HyperparameterSearchSpace): if tf uses dropout - tf_dropout_rate (HyperparameterSearchSpace): dropout rate of tf layer skip_connection_type (HyperparameterSearchSpace): skip connection type, it could be directly added or a grn network ( Lim et al, Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting: @@ -175,31 +184,42 @@ def get_hyperparameter_search_space( hp_network_structures = [num_blocks, decoder_auto_regressive, variable_selection, skip_connection] cond_skip_connections = [] + + if True in skip_connection.choices: skip_connection_type = get_hyperparameter(skip_connection_type, CategoricalHyperparameter) hp_network_structures.append(skip_connection_type) cond_skip_connections.append(EqualsCondition(skip_connection_type, skip_connection, True)) - if 'grn' in skip_connection_type.choices: + if 'gate_add_norm' in skip_connection_type.choices: grn_use_dropout = get_hyperparameter(grn_use_dropout, CategoricalHyperparameter) hp_network_structures.append(grn_use_dropout) if True in variable_selection.choices: cond_skip_connections.append( - OrConjunction(EqualsCondition(grn_use_dropout, skip_connection_type, "grn"), - EqualsCondition(grn_dropout_rate, variable_selection, True)) + EqualsCondition(grn_use_dropout, skip_connection_type, "gate_add_norm") ) else: - cond_skip_connections.append(EqualsCondition(grn_use_dropout, skip_connection_type, "grn")) + cond_skip_connections.append(EqualsCondition(grn_use_dropout, skip_connection_type, "gate_add_norm")) if True in grn_use_dropout.choices: grn_dropout_rate = get_hyperparameter(grn_dropout_rate, UniformFloatHyperparameter) hp_network_structures.append(grn_dropout_rate) cond_skip_connections.append(EqualsCondition(grn_dropout_rate, grn_use_dropout, True)) - elif True in variable_selection.choices: - cond_skip_connections.append(EqualsCondition(grn_dropout_rate, variable_selection, True)) - cs.add_hyperparameters(hp_network_structures) if cond_skip_connections: cs.add_conditions(cond_skip_connections) + if True in variable_selection.choices: + variable_selection_use_dropout = get_hyperparameter(variable_selection_use_dropout, + CategoricalHyperparameter) + variable_selection_dropout_rate = get_hyperparameter(variable_selection_dropout_rate, + UniformFloatHyperparameter) + cs.add_hyperparameters([variable_selection_use_dropout, variable_selection_dropout_rate]) + + cond_vs_dropout = EqualsCondition(variable_selection_use_dropout, variable_selection, True) + cond_vs_dropoutrate = EqualsCondition(variable_selection_dropout_rate, variable_selection_use_dropout, True) + cs.add_conditions([cond_vs_dropout, cond_vs_dropoutrate]) + + + if static_features_shape + future_feature_shapes[-1] == 0: if False in variable_selection.choices and 
False in decoder_auto_regressive.choices: if variable_selection.num_choices == 1 and decoder_auto_regressive.num_choices == 1: @@ -417,6 +437,7 @@ def get_hyperparameter_search_space( if True in skip_connection.choices: forbidden_mlp_skip = [] forbidden_skip = ForbiddenEqualsClause(skip_connection, True) + forbidden_temporal_fusion = ForbiddenEqualsClause(use_temporal_fusion, True) for i in range(1, max_num_blocks + 1): hp_mlp_has_local_layer = f"block_{i}:MLPDecoder:has_local_layer" if hp_mlp_has_local_layer in cs: @@ -425,6 +446,10 @@ def get_hyperparameter_search_space( ForbiddenEqualsClause(hp_mlp_has_local_layer, False), forbidden_skip )) + forbidden_mlp_skip.append(ForbiddenAndConjunction( + ForbiddenEqualsClause(hp_mlp_has_local_layer, False), + forbidden_temporal_fusion + )) cs.add_forbidden_clauses(forbidden_mlp_skip) return cs diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 0c6bb8822..05f375b21 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -222,6 +222,19 @@ def _get_hyperparameter_search_space(self, forbidden_hp_dist = ForbiddenAndConjunction(forbidden_hp_dist, forbidden_hp_regression_loss) forbidden_losses_all.append(forbidden_hp_dist) + decoder_auto_regressive = cs.get_hyperparameter("network_backbone:seq_encoder:decoder_auto_regressive") + forecast_strategy = cs.get_hyperparameter("loss:DistributionLoss:forecast_strategy") + use_tf = cs.get_hyperparameter("network_backbone:seq_encoder:use_temporal_fusion") + + if True in decoder_auto_regressive.choices and 'sample' in forecast_strategy.choices and True in use_tf.choices: + cs.add_forbidden_clause( + ForbiddenAndConjunction( + ForbiddenEqualsClause(decoder_auto_regressive, True), + ForbiddenEqualsClause(forecast_strategy, 'sample'), + ForbiddenEqualsClause(use_tf, True) + ) + ) + network_flat_encoder_hp = cs.get_hyperparameter('network_backbone:flat_encoder:__choice__') if 'MLPEncoder' in network_flat_encoder_hp.choices: @@ -244,6 +257,8 @@ def _get_hyperparameter_search_space(self, cs.add_forbidden_clauses(forbidden_losses_all) + + # NBEATS network_encoder_hp = cs.get_hyperparameter("network_backbone:__choice__") forbidden_NBEATS = [] From 37501ef859b5c3c91f91017254259644dfa3b2db Mon Sep 17 00:00:00 2001 From: dengdifan <33290713+dengdifan@users.noreply.github.com> Date: Wed, 16 Mar 2022 19:14:28 +0100 Subject: [PATCH 193/347] Tft forecating with features (#6) * time feature transform * tft with time-variing features * transform features allowed for all architecture * repair mask for temporal fusion layer * maint --- autoPyTorch/api/time_series_forecasting.py | 7 +- .../configs/forecasting_init_cfgs.json | 89 ++++-- autoPyTorch/constants_forecasting.py | 18 +- .../data/time_series_forecasting_validator.py | 57 +++- autoPyTorch/datasets/time_series_dataset.py | 220 ++++++++++++-- .../setup/network/forecasting_architecture.py | 280 ++++++++++-------- .../setup/network/forecasting_network.py | 23 +- .../forecasting_backbone/cells.py | 117 +++++--- .../forecasting_decoder/MLPDecoder.py | 2 +- .../base_forecasting_decoder.py | 11 +- .../base_forecasting_encoder.py | 15 +- .../time_series_forecasting_data_loader.py | 113 +++---- .../training/data_loader/time_series_util.py | 17 +- .../forecasting_base_trainer.py | 47 ++- .../pipeline/time_series_forecasting.py | 16 +- 15 files changed, 712 insertions(+), 320 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py 
b/autoPyTorch/api/time_series_forecasting.py index 937e0e9df..236b873a2 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -120,6 +120,8 @@ def search( target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, n_prediction_steps: int = 1, freq: Optional[Union[str, int, List[int]]] = None, + start_times_train: List[pd.DatetimeIndex] = [], + start_times_test: Optional[List[pd.DatetimeIndex]] = None, dataset_name: Optional[str] = None, budget_type: str = 'epochs', min_budget: Union[int, str] = 5, @@ -266,12 +268,15 @@ def search( # Fit a input validator to check the provided data # Also, an encoder is fit to both train and test data, # to prevent unseen categories during inference - self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + self.InputValidator.fit(X_train=X_train, y_train=y_train, start_times_train=start_times_train, + X_test=X_test, y_test=y_test, start_times_test=start_times_test) self.dataset = TimeSeriesForecastingDataset( X=X_train, Y=y_train, X_test=X_test, Y_test=y_test, freq=freq, + start_times_train=start_times_train, + start_times_test=start_times_test, validator=self.InputValidator, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index d7b97858d..0f8680d1d 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -4,6 +4,7 @@ "data_loader:backcast": false, "data_loader:sample_strategy": "SeqUniform", "data_loader:num_batches_per_epoch": 50, + "data_loader:transform_time_features": false, "lr_scheduler:__choice__": "ReduceLROnPlateau", "lr_scheduler:ReduceLROnPlateau:mode": "max", "lr_scheduler:ReduceLROnPlateau:factor": 0.5, @@ -130,35 +131,42 @@ }, "Seq2Seq-Transformer2Transformer": { "loss:__choice__": "DistributionLoss", + "data_loader:transform_time_features": true, "loss:DistributionLoss:dist_cls": "studentT", "loss:DistributionLoss:forecast_strategy": "sample", "loss:DistributionLoss:aggregation": "median", "loss:DistributionLoss:num_samples": 100, - "network_backbone:__choice__": "TransformerEncoder", - "network_backbone:TransformerEncoder:d_model_log": 5, - "network_backbone:TransformerEncoder:activation": "gelu", - "network_backbone:TransformerEncoder:num_layers": 1, - "network_backbone:TransformerEncoder:decoder_type": "TransformerDecoder", - "network_backbone:TransformerEncoder:use_dropout": true, - "network_backbone:TransformerEncoder:use_positional_encoder": true, - "network_backbone:TransformerEncoder:dropout_positional_encoder": 0.1, - "network_backbone:TransformerEncoder:d_feed_forward_log": 7, - "network_backbone:TransformerEncoder:n_head_log": 3, - "network_backbone:TransformerEncoder:layer_norm_eps": 1e-05, - "network_backbone:TransformerEncoder:dropout": 0.1, - "network_backbone:TransformerEncoder:use_layer_norm_output": true, - "network_backbone:TransformerEncoder:layer_norm_eps_output": 1e-05, - "network_backbone:TransformerDecoder:activation": "gelu", - "network_backbone:TransformerDecoder:num_layers": 1, - "network_backbone:TransformerDecoder:use_dropout": true, - "network_backbone:TransformerDecoder:use_positional_decoder": true, - "network_backbone:TransformerDecoder:dropout_positional_decoder": 0.1, - "network_backbone:TransformerDecoder:d_feed_forward_log": 7, - "network_backbone:TransformerDecoder:n_head_log": 3, 
- "network_backbone:TransformerDecoder:layer_norm_eps": 1e-05, - "network_backbone:TransformerDecoder:dropout": 0.1, - "network_backbone:TransformerDecoder:use_layer_norm_output": true, - "network_backbone:TransformerDecoder:layer_norm_eps_output": 1e-05 + "network_backbone:__choice__": "seq_encoder", + "network_backbone:seq_encoder:skip_connection": false, + "network_backbone:seq_encoder:num_blocks": 1, + "network_backbone:seq_encoder:use_temporal_fusion": false, + "network_backbone:seq_encoder:variable_selection": false, + "network_backbone:seq_encoder:decoder_auto_regressive": true, + "network_backbone:seq_encoder:block_1:__choice__": "TransformerEncoder", + "network_backbone:seq_encoder:block_1:TransformerEncoder:d_model_log": 5, + "network_backbone:seq_encoder:block_1:TransformerEncoder:activation": "gelu", + "network_backbone:seq_encoder:block_1:TransformerEncoder:num_layers": 1, + "network_backbone:seq_encoder:block_1:TransformerEncoder:decoder_type": "TransformerDecoder", + "network_backbone:seq_encoder:block_1:TransformerEncoder:use_dropout": true, + "network_backbone:seq_encoder:block_1:TransformerEncoder:use_positional_encoder": true, + "network_backbone:seq_encoder:block_1:TransformerEncoder:dropout_positional_encoder": 0.1, + "network_backbone:seq_encoder:block_1:TransformerEncoder:d_feed_forward_log": 7, + "network_backbone:seq_encoder:block_1:TransformerEncoder:n_head_log": 3, + "network_backbone:seq_encoder:block_1:TransformerEncoder:layer_norm_eps": 1e-05, + "network_backbone:seq_encoder:block_1:TransformerEncoder:dropout": 0.1, + "network_backbone:seq_encoder:block_1:TransformerEncoder:use_layer_norm_output": true, + "network_backbone:seq_encoder:block_1:TransformerEncoder:layer_norm_eps_output": 1e-05, + "network_backbone:seq_encoder:block_1:TransformerDecoder:activation": "gelu", + "network_backbone:seq_encoder:block_1:TransformerDecoder:num_layers": 1, + "network_backbone:seq_encoder:block_1:TransformerDecoder:use_dropout": true, + "network_backbone:seq_encoder:block_1:TransformerDecoder:use_positional_decoder": true, + "network_backbone:seq_encoder:block_1:TransformerDecoder:dropout_positional_decoder": 0.1, + "network_backbone:seq_encoder:block_1:TransformerDecoder:d_feed_forward_log": 7, + "network_backbone:seq_encoder:block_1:TransformerDecoder:n_head_log": 3, + "network_backbone:seq_encoder:block_1:TransformerDecoder:layer_norm_eps": 1e-05, + "network_backbone:seq_encoder:block_1:TransformerDecoder:dropout": 0.1, + "network_backbone:seq_encoder:block_1:TransformerDecoder:use_layer_norm_output": true, + "network_backbone:seq_encoder:block_1:TransformerDecoder:layer_norm_eps_output": 1e-05 }, "NBEATS-I": { "target_scaler:__choice__": "TargetNoScaler", @@ -206,6 +214,37 @@ "network_backbone:flat_encoder:NBEATSDecoder:weight_sharing_g": false, "network_backbone:flat_encoder:NBEATSDecoder:expansion_coefficient_length_g": 32, "network_backbone:flat_encoder:NBEATSDecoder:dropout_g": 0.1 + }, + "TemoporalFusionTransformer": { + "loss:__choice__": "QuantileLoss", + "target_scaler:__choice__": "TargetStandardScaler", + "data_loader:transform_time_features": true, + "loss:QuantileLoss:lower_quantile": 0.1, + "loss:QuantileLoss:upper_quantile": 0.9, + "network_backbone:__choice__": "seq_encoder", + "network_backbone:seq_encoder:skip_connection": true, + "network_backbone:seq_encoder:num_blocks": 1, + "network_backbone:seq_encoder:variable_selection": true, + "network_backbone:seq_encoder:share_single_variable_networks": false, + 
"network_backbone:seq_encoder:skip_connection_type": "gate_add_norm", + "network_backbone:seq_encoder:variable_selection_use_dropout": true, + "network_backbone:seq_encoder:variable_selection_dropout_rate": 0.1, + "network_backbone:seq_encoder:grn_use_dropout": true, + "network_backbone:seq_encoder:grn_dropout_rate": 0.1, + "network_backbone:seq_encoder:block_1:__choice__": "RNNEncoder", + "network_backbone:seq_encoder:decoder_auto_regressive": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:cell_type": "lstm", + "network_backbone:seq_encoder:block_1:RNNEncoder:num_layers": 1, + "network_backbone:seq_encoder:block_1:RNNEncoder:hidden_size": 32, + "network_backbone:seq_encoder:block_1:RNNEncoder:bidirectional": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:use_dropout": false, + "network_backbone:seq_encoder:block_1:RNNEncoder:decoder_type": "RNNDecoder", + "network_backbone:seq_encoder:block_1:RNNDecoder:decoder_type": "RNNDecoder", + "network_backbone:seq_encoder:use_temporal_fusion": true, + "network_backbone:seq_encoder:temporal_fusion:attention_d_model_log": 5, + "network_backbone:seq_encoder:temporal_fusion:attention_n_head_log": 2, + "network_backbone:seq_encoder:temporal_fusion:use_dropout": true, + "network_backbone:seq_encoder:temporal_fusion:dropout_rate": 0.1 } } } \ No newline at end of file diff --git a/autoPyTorch/constants_forecasting.py b/autoPyTorch/constants_forecasting.py index 3b5a355ca..b5df96d3c 100644 --- a/autoPyTorch/constants_forecasting.py +++ b/autoPyTorch/constants_forecasting.py @@ -5,15 +5,15 @@ FORECASTING_BUDGET_TYPE = ['resolution', 'num_seq', 'num_sample_per_seq'] SEASONALITY_MAP = { - "minutely": [1440, 10080, 525960], - "10_minutes": [144, 1008, 52596], - "half_hourly": [48, 336, 17532], - "hourly": [24, 168, 8766], - "daily": 7, - "weekly": 365.25 / 7, - "monthly": 12, - "quarterly": 4, - "yearly": 1 + "1min": [1440, 10080, 525960], + "10min": [144, 1008, 52596], + "30min": [48, 336, 17532], + "1H": [24, 168, 8766], + "1D": 7, + "1W": 365.25 / 7, + "1M": 12, + "1Q": 4, + "1Y": 1 } MAX_WINDOW_SIZE_BASE = 500 diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 9bf89c9a8..19c1ead0f 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -2,9 +2,10 @@ # -*- encoding: utf-8 -*- import logging -from typing import Optional, Tuple, List, Union +import warnings +from typing import Optional, Tuple, List, Union, Dict import numpy as np - +import pandas as pd from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError @@ -23,6 +24,10 @@ def __init__(self, self._is_uni_variant = False self.known_future_features = None self.n_prediction_steps = 1 + self.start_times_train = None + self.start_times_test = None + self.feature_shapes = {} + self.feature_names = [] """ A validator designed for a time series forecasting dataset. 
@@ -35,10 +40,33 @@ def fit( y_train: SUPPORTED_TARGET_TYPES, X_test: Optional[SUPPORTED_FEAT_TYPES] = None, y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + start_times_train: Optional[List[pd.DatetimeIndex]] = None, + start_times_test: Optional[List[pd.DatetimeIndex]] = None, + freq: str = '1Y', n_prediction_steps: int = 1, known_future_features: Optional[List[Union[int, str]]] = None, + use_time_features: bool = False ) -> BaseEstimator: self.n_prediction_steps = n_prediction_steps + + if y_test is not None and bool(start_times_test) != bool(start_times_train): + warnings.warn('One of start_times_test or start_times_train is missing! This might result in the ' + 'risk of not proper evaluated ') + + if start_times_train is None: + start_times_train = [pd.DatetimeIndex(pd.to_datetime(['1900-01-01']), freq=freq)] * len(y_train) + else: + assert len(start_times_train) == len(y_train), 'start_times_train must have the same length as y_train!' + + if y_test is not None: + if start_times_test is None: + start_times_test = [pd.DatetimeIndex(pd.to_datetime(['1900-01-01']), freq=freq)] * len(y_test) + else: + assert len(start_times_train) == len(y_train), 'start_times_train must have the same length as y_train!' + + self.start_times_train = start_times_train + self.start_times_test = start_times_test + if X_train is None: self._is_uni_variant = True if self._is_uni_variant: @@ -64,13 +92,22 @@ def fit( " {} for features and {} for targets".format(len(X_test), len(y_test), )) # TODO write a feature input validator to check X_test for known_future_features super().fit(X_train[0], y_train[0], X_test[0], y_test[0]) - else: - super().fit(X_train[0], y_train[0]) + self.feature_validator.fit(X_train[0], None if X_test is None else X_test[0]) + self.target_validator.fit(y_train[0], None if y_test is None else y_test[0]) + self._is_fitted = True self.check_input_shapes(X_train, y_train, is_training=True) if X_test is not None: self.check_input_shapes(X_test, y_test, is_training=False) + if hasattr(X_train[0], 'columns'): + features = X_train[0].columns.values.tolist() + else: + features = list(map(str, range(len(X_train[0])))) + for feature in features: + self.feature_names.append(feature) + self.feature_shapes[feature] = 1 + return self @staticmethod @@ -125,20 +162,23 @@ def transform( start_idx = 0 + group_ids = np.arange(len(sequence_lengths)).repeat(sequence_lengths) + if self._is_uni_variant: - y_flat = np.empty([num_data, num_targets]) + y_flat = pd.DataFrame(np.empty([num_data, num_targets]), index=group_ids) for seq_idx, seq_length in enumerate(sequence_lengths): end_idx = start_idx + seq_length y_flat[start_idx: end_idx] = np.array(y[seq_idx]).reshape([-1, num_targets]) start_idx = end_idx - y_transformed = self.target_validator.transform(y_flat) # type:np.ndarray + y_transformed = self.target_validator.transform(y_flat) if y_transformed.ndim == 1: y_transformed = np.expand_dims(y_transformed, -1) return np.asarray([]), y_transformed, sequence_lengths # a matrix that is concatenated by all the time series sequences - X_flat = np.empty([num_data, num_features]) - y_flat = np.empty([num_data, num_targets]) + + X_flat = pd.DataFrame(np.empty([num_data, num_features]), index=group_ids) + y_flat = pd.DataFrame(np.empty([num_data, num_targets]), index=group_ids) start_idx = 0 for seq_idx, seq_length in enumerate(sequence_lengths): @@ -152,4 +192,3 @@ def transform( if y_transformed.ndim == 1: y_transformed = np.expand_dims(y_transformed, -1) return X_transformed, y_transformed, sequence_lengths - 
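In the transform() hunk above, sequences of different lengths are flattened into a single frame whose index repeats each sequence id once per time step (group_ids). A small self-contained sketch of that flattening, with made-up sequences:

    import numpy as np
    import pandas as pd

    sequences = [np.array([1., 2., 3.]), np.array([4., 5.])]               # two univariate series
    sequence_lengths = [len(seq) for seq in sequences]                     # [3, 2]
    group_ids = np.arange(len(sequence_lengths)).repeat(sequence_lengths)  # [0 0 0 1 1]

    y_flat = pd.DataFrame(np.concatenate(sequences).reshape(-1, 1), index=group_ids)
    print(y_flat.loc[0])   # the rows that belong to the first sequence
    print(y_flat.loc[1])   # the rows that belong to the second sequence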
diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index a46dfe615..ea5282376 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -1,5 +1,5 @@ import os -from typing import Any, Dict, List, Optional, Tuple, Union, cast +from typing import Any, Dict, List, Optional, Tuple, Union, cast, Set import uuid import bisect import copy @@ -31,6 +31,11 @@ ) from gluonts.time_feature.lag import get_lags_for_frequency +from gluonts.time_feature import ( + Constant, + TimeFeature, + time_features_from_frequency_str, +) from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator @@ -49,16 +54,22 @@ class TimeSeriesSequence(Dataset): def __init__(self, X: Optional[Union[np.ndarray, pd.DataFrame]], Y: Union[np.ndarray, pd.Series], + start_time_train: Optional[pd.DatetimeIndex] = None, + freq: str = '1Y', + time_feature_transform: List[TimeFeature] = [], X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, + start_time_test: Optional[pd.DatetimeIndex] = None, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, static_features: Optional[np.ndarray] = None, - n_prediction_steps: int = 1, + n_prediction_steps: int = 0, sp: int = 1, - known_future_features: Optional[List[Union[str, int]]] = None, + known_future_features: Optional[Tuple[str]] = None, only_has_past_targets: bool = False, compute_mase_coefficient_value: bool = True, + time_features=None, + is_test_set=False, ): """ A dataset representing a time series sequence. @@ -72,15 +83,22 @@ def __init__(self, self.X = X self.Y = Y + if start_time_train is None: + start_time_train = pd.DatetimeIndex(pd.to_datetime(['1900-01-01']), freq=freq) + self.start_time_train = start_time_train self.X_val = None self.Y_val = None self.X_test = X_test self.Y_tet = Y_test + self.start_time_test = start_time_test + self.time_feature_transform = time_feature_transform self.static_features = static_features + self.freq = freq + # We also need to be able to transform the data, be it for pre-processing # or for augmentation self.train_transform = train_transforms @@ -98,11 +116,17 @@ def __init__(self, self.only_has_past_targets = only_has_past_targets self.known_future_features = known_future_features + self.transform_time_features = False + self._cached_time_features: Optional[np.ndarray] = time_features + self.is_test_set = is_test_set + def __getitem__(self, index: int, train: bool = True) \ -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, torch.Tensor]]]: """ get a subsequent of time series data, unlike vanilla tabular dataset, we obtain all the previous sequences - until the given index, this allows us to do further transformation when the + until the given index, this allows us to do further transformation. 
+ (When fed to the neural network, the data is arranged as follows: + [past_targets, time_features, X_features]) Args: index (int): what element to yield from all the train/test tensors @@ -113,25 +137,48 @@ def __getitem__(self, index: int, train: bool = True) \ """ if index < 0: index = self.__len__() + index + if self.X is not None: if hasattr(self.X, 'loc'): past_features = self.X.iloc[:index + 1] else: past_features = self.X[:index + 1] - if self.train_transform is not None and train: - past_features = self.train_transform(past_features) - elif self.val_transform is not None and not train: - past_features = self.val_transform(past_features) - - if self.known_future_features is not None: - future_features = self.X[index + 1: index + self.n_prediction_steps + 1, self.known_future_features] + if self.known_future_features: + future_features = self.X.iloc[index + 1: index + self.n_prediction_steps + 1, + self.known_future_feature_index] else: future_features = None else: past_features = None future_features = None + if self.transform_time_features: + if self.time_feature_transform: + self.compute_time_features() + + if past_features: + past_features = np.hstack([self._cached_time_features[:index + 1], past_features]) + else: + past_features = self._cached_time_features[:index + 1] + if future_features: + future_features = np.hstack([self._cached_time_features[ + index + 1:index + self.n_prediction_steps +1], past_features + ]) + else: + future_features = self._cached_time_features[index + 1:index + self.n_prediction_steps + 1] + if future_features is not None and future_features.shape[0] == 0: + future_features = None + + if self.train_transform is not None and train and past_features is not None: + past_features = self.train_transform(past_features) + if future_features is not None: + future_features = self.train_transform(future_features) + elif self.val_transform is not None and not train and past_features is not None: + past_features = self.val_transform(past_features) + if future_features is not None: + future_features = self.val_transform(future_features) + # In case of prediction, the targets are not provided targets = self.Y if self.only_has_past_targets: @@ -140,6 +187,12 @@ def __getitem__(self, index: int, train: bool = True) \ targets_future = targets[index + 1: index + self.n_prediction_steps + 1] targets_future = torch.from_numpy(targets_future) + if isinstance(past_features, np.ndarray): + past_features = torch.from_numpy(past_features) + + if isinstance(future_features, np.ndarray): + future_features = torch.from_numpy(future_features) + past_target = targets[:index + 1] past_target = torch.from_numpy(past_target) @@ -149,11 +202,35 @@ def __getitem__(self, index: int, train: bool = True) \ "static_features": self.static_features, "mase_coefficient": self.mase_coefficient, 'encoder_lengths': past_target.shape[0], - 'decoder_lengths': None if targets_future is None else targets_future.shape[0] }, targets_future + 'decoder_lengths': None if targets_future is None else targets_future.shape[0]}, targets_future def __len__(self) -> int: return self.Y.shape[0] if self.only_has_past_targets else self.Y.shape[0] - self.n_prediction_steps + def compute_time_features(self,): + if self._cached_time_features is None: + periods = self.Y.shape[0] + if self.is_test_set: + periods += self.n_prediction_steps + + date_info = pd.date_range(start=self.start_time_train, + periods=periods, + freq=self.freq) + + self._cached_time_features = np.vstack( + [transform(date_info).to_numpy(float) for 
transform in self.time_feature_transform] + ).T + else: + if self.is_test_set: + if self._cached_time_features.shape[0] == self.Y.shape[0]: + date_info = pd.date_range(start=self.start_time_train, + periods=self.n_prediction_steps + self.Y.shape[0], + freq=self.freq) + time_feature_future = np.vstack( + [transform(date_info[-self.n_prediction_steps:]).to_numpy(float) for transform in self.time_feature_transform] + ).T + self._cached_time_features = np.concatenate([self._cached_time_features, time_feature_future]) + def update_transform(self, transform: Optional[torchvision.transforms.Compose], train: bool = True, ) -> 'BaseDataset': @@ -190,8 +267,16 @@ def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": X = self.X[:index + 1 + self.n_prediction_steps] else: X = None - return TimeSeriesSequence(X, - self.Y[:index + 1], + if self._cached_time_features: + cached_time_feautres = None + else: + cached_time_feautres = self._cached_time_features[:index + 1 + self.n_prediction_steps] + + return TimeSeriesSequence(X=X, + Y=self.Y[:index + 1], + start_time_train=self.start_time_train, + freq=self.freq, + time_feature_transform=self.time_feature_transform, train_transforms=self.train_transform, val_transforms=self.val_transform, n_prediction_steps=self.n_prediction_steps, @@ -199,7 +284,8 @@ def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": known_future_features=self.known_future_features, sp=self.sp, only_has_past_targets=True, - compute_mase_coefficient_value=False) + compute_mase_coefficient_value=False, + time_features=cached_time_feautres) def get_test_target(self, test_idx: int): if self.only_has_past_targets: @@ -209,6 +295,12 @@ def get_test_target(self, test_idx: int): Y_future = self.Y[test_idx + 1: test_idx + self.n_prediction_steps + 1] return Y_future + def update_attribute(self, **kwargs): + for key, value in kwargs.items(): + if not hasattr(self, key): + raise ValueError('Trying to update invalid attribute for TimeSeriesSequence!!!') + setattr(self, key, value) + class TimeSeriesForecastingDataset(BaseDataset, ConcatDataset): datasets: List[TimeSeriesSequence] @@ -219,7 +311,10 @@ def __init__(self, Y: Union[np.ndarray, pd.Series], X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, - known_future_features: Optional[Union[Tuple[int], int]] = None, + start_times_train: Optional[List[pd.DatetimeIndex]] = None, + start_times_test: Optional[List[pd.DatetimeIndex]] = None, + known_future_features: Optional[Tuple[str]] = None, + time_feature_transform: Optional[List[TimeFeature]] = None, freq: Optional[Union[str, int, List[int]]] = None, resampling_strategy: Optional[Union[ CrossValTypes, HoldoutValTypes]] = HoldoutValTypes.time_series_hold_out_validation, @@ -299,6 +394,7 @@ def __init__(self, if not self.validator._is_fitted: self.validator.fit(X_train=X, y_train=Y, X_test=X_test, y_test=Y_test, + start_times_train=start_times_train, start_times_test=start_times_test, n_prediction_steps=n_prediction_steps) self.is_uni_variant = self.validator._is_uni_variant @@ -311,6 +407,25 @@ def __init__(self, self.categories = self.validator.feature_validator.categories + self.feature_shapes = self.validator.feature_shapes + self.feature_names = tuple(self.validator.feature_names) + + self.start_times_train = self.validator.start_times_train + self.start_times_test = self.validator.start_times_test + + self._transform_time_feature = False + if not time_feature_transform: + time_feature_transform = 
time_features_from_frequency_str(self.freq) + if not time_feature_transform: + # If time features are empty (as for yearly data), we add a + # constant feature of 0 + time_feature_transform = [Constant()] + + self.time_feature_transform = time_feature_transform + self.time_feature_names = tuple([f'time_feature_{t.__class__.__name__}' for t in self.time_feature_transform]) + + # Time features are lazily generated, we do not count them as either numerical_columns or categorical columns + X, Y, sequence_lengths = self.validator.transform(X, Y) if X_test is not None: X_test, Y_test, self.sequence_lengths_tests = self.validator.transform(X_test, Y_test) @@ -320,6 +435,7 @@ def __init__(self, self.shuffle = shuffle self.random_state = np.random.RandomState(seed=seed) + # check if dataset could be split with cross validation minimal_seq_length = np.min(sequence_lengths) - n_prediction_steps if isinstance(resampling_strategy, CrossValTypes): num_splits = DEFAULT_RESAMPLING_PARAMETERS[resampling_strategy].get( @@ -408,8 +524,13 @@ def __init__(self, self.num_sequences = len(Y) self.sequence_lengths_train = np.asarray(sequence_lengths) - n_prediction_steps + if known_future_features is None: + known_future_features = tuple() + # initialize datasets - sequences_kwargs = {"train_transforms": self.train_transform, + sequences_kwargs = {"freq": self.freq, + "time_feature_transform": self.time_feature_transform, + "train_transforms": self.train_transform, "val_transforms": self.val_transform, "n_prediction_steps": n_prediction_steps, "sp": self.seasonality, @@ -419,10 +540,13 @@ def __init__(self, self.y_train_mean = [0] * len(self.sequence_lengths_train) self.y_train_std = [1] * len(self.sequence_lengths_train) - sequence_datasets, train_tensors, test_tensors = self.make_sequences_datasets(X=X, Y=Y, - X_test=X_test, Y_test=Y_test, - normalize_y=normalize_y, - **sequences_kwargs) + sequence_datasets, train_tensors, test_tensors = self.make_sequences_datasets( + X=X, Y=Y, + X_test=X_test, Y_test=Y_test, + start_times_train=self.start_times_train, + start_times_test=self.start_times_test, + normalize_y=normalize_y, + **sequences_kwargs) self.normalize_y = normalize_y @@ -548,8 +672,10 @@ def get_test_target(self, test_indices: np.ndarray) -> np.ndarray: def make_sequences_datasets(self, X: np.ndarray, Y: np.ndarray, + start_times_train: List[pd.DatetimeIndex], X_test: Optional[np.ndarray] = None, Y_test: Optional[np.ndarray] = None, + start_times_test: Optional[List[pd.DatetimeIndex]] = None, normalize_y: bool = True, **sequences_kwargs: Optional[Dict]) -> \ Tuple[List[TimeSeriesSequence], Tuple[List, List], Tuple[List, List]]: @@ -588,10 +714,10 @@ def make_sequences_datasets(self, idx_start_test = 0 seq_length_train_flat = self.sequence_lengths_train + self.n_prediction_steps + group_ids = np.arange(len(seq_length_train_flat)).repeat(seq_length_train_flat) for seq_idx, seq_length_train in enumerate(seq_length_train_flat): idx_end_train = idx_start_train + seq_length_train - X_seq = X[idx_start_train: idx_end_train] Y_seq = Y[idx_start_train: idx_end_train] @@ -614,22 +740,24 @@ def make_sequences_datasets(self, Y_test_seq_std = np.std(Y_test_seq) Y_seq = (Y_seq - Y_test_seq_mean) / Y_test_seq_std - Y_test[idx_start_test: idx_end_test] = Y_seq else: X_test_seq = None Y_test_seq = None - if not X_seq: + if X_seq.size == 0: X_seq = None X_test_seq = None - sequence = TimeSeriesSequence(X=X_seq, - Y=Y_seq, - X_test=X_test_seq, - Y_test=Y_test_seq, - **sequences_kwargs) + sequence = TimeSeriesSequence( + 
X=X_seq, + Y=Y_seq, + start_time_train=start_times_train[seq_idx], + X_test=X_test_seq, + Y_test=Y_test_seq, + start_time_test=None if start_times_test is None else start_times_test[seq_idx], + **sequences_kwargs) sequence_datasets.append(sequence) idx_start_train = idx_end_train @@ -657,7 +785,9 @@ def replace_data(self, X_train: BaseDatasetInputType, X_test: Optional[BaseDatas self.update_tensros_seqs(X_test, self.sequence_lengths_tests, is_train=False) return self - def update_tensros_seqs(self, X, sequence_lengths, is_train=True): + def update_tensros_seqs(self, X: np.ndarray, sequence_lengths, is_train=True): + if X.size == 0: + return idx_start = 0 if is_train: for seq, seq_length in zip(self.datasets, sequence_lengths): @@ -696,6 +826,15 @@ def update_transform(self, transform: Optional[torchvision.transforms.Compose], seq = seq.update_transform(transform, train) return self + @property + def transform_time_features(self): + return self._transform_time_features + + @transform_time_features.setter + def transform_time_features(self, value: bool): + for seq in self.datasets: + seq.transform_time_features = value + def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]]]: """ Creates a set of splits based on a resampling strategy provided, apart from the @@ -760,9 +899,14 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> 'freq': self.freq, 'sequence_lengths_train': self.sequence_lengths_train, 'seq_length_max': self.seq_length_max, - 'input_shape':self.input_shape, + 'input_shape': self.input_shape, 'lagged_value': self.lagged_value, + 'feature_names': self.feature_names, + 'feature_shapes': self.feature_shapes, + 'known_future_features': self.known_future_features, 'static_features': self.static_features, + 'time_feature_transform': self.time_feature_transform, + 'time_feature_names': self.time_feature_names, 'future_feature_shapes': self.future_feature_shapes, 'uni_variant': self.is_uni_variant}) return dataset_properties @@ -896,12 +1040,24 @@ def create_refit_split( splits[idx_split][idx_seq] = idx_start + split[idx_split] idx_start += self.sequence_lengths_train[idx_seq] - train_indices = np.hstack([sp for sp in splits[0]]) test_indices = np.hstack([sp for sp in splits[1]]) return train_indices, test_indices + def create_refit_set(self) -> "TimeSeriesForecastingDataset": + refit_set: TimeSeriesForecastingDataset = copy.deepcopy(self) + refit_set.resampling_strategy = None + refit_set.splits = refit_set.get_splits_from_resampling_strategy() + return refit_set + + def generatet_test_seqs(self) -> List[TimeSeriesSequence]: + test_sets = copy.deepcopy(self.datasets) + for test_seq in test_sets: + test_seq.is_test_set = True + test_seq.only_has_past_targets = True + return test_sets + def _check_time_series_forecasting_inputs(train: np.ndarray, val: Optional[np.ndarray] = None) -> None: diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 0a426ecf6..9c2a971a3 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -30,12 +30,6 @@ DecoderBlockInfo ) -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import AddLayer -from pytorch_forecasting.models.temporal_fusion_transformer.sub_modules import ( - TimeDistributed, TimeDistributedInterpolation, 
GatedLinearUnit, ResampleNorm, AddNorm, GateAddNorm, - GatedResidualNetwork, VariableSelectionNetwork, InterpretableMultiHeadAttention -) - class TransformedDistribution_(TransformedDistribution): """ @@ -169,6 +163,10 @@ def __init__(self, target_scaler: BaseTargetScaler, dataset_properties: Dict, auto_regressive: bool, + feature_names: Optional[Tuple[str]] = (), + known_future_features: Optional[Tuple[str]] = (), + feature_shapes: Optional[Dict[str, int]] = (), + time_feature_names: Optional[Tuple[str]] = (), output_type: str = 'regression', forecast_strategy: Optional[str] = 'mean', num_samples: Optional[int] = 100, @@ -206,7 +204,12 @@ def __init__(self, self.variable_selector = VariableSelector(network_structure=network_structure, dataset_properties=dataset_properties, network_encoder=network_encoder, - auto_regressive=auto_regressive) + auto_regressive=auto_regressive, + feature_names=feature_names, + known_future_features=known_future_features, + feature_shapes=feature_shapes, + time_feature_names=time_feature_names, + ) self.lazy_modules.append(self.variable_selector) has_temporal_fusion = network_structure.use_temporal_fusion self.encoder = StackedEncoder(network_structure=network_structure, @@ -372,19 +375,41 @@ def pre_processing(self, if self.network_structure.variable_selection: batch_size = x_past.shape[0] if length_past > 0: - if past_features is None: - length_past = x_past.shape[1] - x_past = {'past_targets': x_past.to(device=self.device), - 'features': torch.zeros((batch_size, length_past, 1), - dtype=past_targets.dtype, device=self.device)} + if past_features is not None: + past_features = past_features[:, -self.window_size:].to(self.device) + x_past = {'past_targets': x_past.to(device=self.device)} + + if past_features is not None: + for feature_name in self.variable_selector.feature_names: + tensor_idx = self.variable_selector.feature_names2tensor_idx[feature_name] + x_past[feature_name] = past_features[:, :, tensor_idx[0]: tensor_idx[1]] + + if hasattr(self.variable_selector, 'placeholder_features'): + for placehold in self.variable_selector.placeholder_features: + x_past[placehold] = torch.zeros((batch_size, length_past, 1), + dtype=past_targets.dtype, + device=self.device) else: x_past = None if length_future > 0: - if future_features is None: - x_future = {'features': torch.zeros((batch_size, length_future, 1), - dtype=past_targets.dtype, device=self.device)} + if future_features is not None: + future_features = future_features.to(self.device) + x_future = {} + if hasattr(self.variable_selector, 'placeholder_features'): + for placehold in self.variable_selector.placeholder_features: + x_future[placehold] = torch.zeros((batch_size, + length_future, 1), + dtype=past_targets.dtype, + device=self.device) + else: + x_future = {} + if future_features is not None: + for feature_name in self.variable_selector.known_future_features: + tensor_idx = self.variable_selector.future_feature_name2tensor_idx[feature_name] + x_future[feature_name] = future_features[:, :, tensor_idx[0]: tensor_idx[1]] else: x_future = None + x_past, x_future, x_static, static_context_initial_hidden = self.variable_selector( x_past=x_past, x_future=x_future, @@ -394,13 +419,19 @@ def pre_processing(self, length_future=length_future, **variable_selector_kwargs ) + return x_past, x_future, x_static, loc, scale, static_context_initial_hidden else: if past_features is not None: - x_past = torch.cat([past_features, x_past], dim=1) + past_features = past_features[:, -self.window_size:] + x_past = 
torch.cat([x_past, past_features], dim=-1) x_past = x_past.to(device=self.device) - x_past = self.embedding(x_past) + if future_features is not None: + future_features = future_features.to(self.device) + if static_features is not None: + static_features = static_features.to(self.device) + x_past = self.embedding(x_past) # TODO embedding for future features! return x_past, future_features, static_features, loc, scale, None def forward(self, @@ -417,7 +448,7 @@ def forward(self, past_features=past_features, future_features=future_features, static_features=static_features, - length_past=self.window_size, + length_past=min(self.window_size, past_targets.shape[1]), length_future=self.n_prediction_steps ) @@ -470,7 +501,11 @@ def predict(self, static_features: Optional[torch.Tensor] = None, encoder_lengths: Optional[torch.LongTensor] = None, ): - net_output = self(past_targets, past_features, encoder_lengths=encoder_lengths) + net_output = self(past_targets=past_targets, + past_features=past_features, + future_features=future_features, + static_features=static_features, + encoder_lengths=encoder_lengths) return self.pred_from_net_output(net_output) @@ -493,11 +528,20 @@ def __init__(self, **kwargs): def decoder_select_variable(self, future_targets: torch.tensor, future_features: Optional[torch.Tensor]): batch_size = future_targets.shape[0] length_future = future_targets.shape[1] - if future_features is None: - x_future = { - 'future_prediction': future_targets.to(self.device), - 'features': torch.zeros((batch_size, length_future, 1), - dtype=future_targets.dtype, device=self.device)} + future_targets = future_targets.to(self.device) + if future_features is not None: + future_features = future_features.to(self.device) + x_future = {} + if hasattr(self.variable_selector, 'placeholder_features'): + for placeholder in self.variable_selector.placeholder_features: + x_future[placeholder] = torch.zeros((batch_size, + length_future, 1), + dtype=future_targets.dtype, + device=self.device) + for feature_name in self.variable_selector.known_future_features: + tensor_idx = self.variable_selector.future_feature_name2tensor_idx[feature_name] + x_future[feature_name] = future_features[:, :, tensor_idx[0]: tensor_idx[1]] + x_future['future_prediction'] = future_targets _, x_future, _, _ = self.variable_selector(x_past=None, x_future=x_future, x_static=None, @@ -521,7 +565,7 @@ def forward(self, past_features=past_features, future_features=future_features, static_features=static_features, - length_past=self.window_size, + length_past=min(self.window_size, past_targets.shape[1]), length_future=0, variable_selector_kwargs={'cache_static_contex': True} ) @@ -542,7 +586,7 @@ def forward(self, if self.network_structure.variable_selection: x_future = self.decoder_select_variable(future_targets, future_features) else: - x_future = future_targets if future_features is None else torch.cat([future_features, future_targets], + x_future = future_targets if future_features is None else torch.cat([future_targets, future_features], dim=-1) x_future = x_future.to(self.device) @@ -582,16 +626,17 @@ def forward(self, else: x_future = predicted_target[:, [-1]] - x_future = x_future.to(self.device) - if self.network_structure.variable_selection: x_future = self.decoder_select_variable( future_targets=predicted_target[:, -1:].to(self.device), future_features=future_features[:, [idx_pred]] if future_features is not None else None ) else: - x_future = x_future if future_features is None else torch.cat([future_features, 
future_targets], + x_future = x_future if future_features is None else torch.cat([x_future, + future_features[:, [idx_pred]]], dim=-1) + x_future = x_future.to(self.device) + decoder_output = self.decoder(x_future, encoder_output=encoder2decoder, cache_intermediate_state=True, @@ -677,7 +722,7 @@ def forward(self, future_features=None if repeated_time_feat is None else repeated_time_feat[:, [idx_pred]]) else: x_future = x_future if repeated_time_feat is None else torch.cat( - [repeated_time_feat[:, [idx_pred], :], x_future], dim=-1) + [x_future, repeated_time_feat[:, [idx_pred], :]], dim=-1) x_future = x_future.to(self.device) @@ -732,7 +777,7 @@ def predict(self, return net_output -class ForecastingDeepARNet(ForecastingNet): +class ForecastingDeepARNet(ForecastingSeq2SeqNet): future_target_required = True def __init__(self, @@ -755,23 +800,31 @@ def train(self, mode: bool = True) -> nn.Module: self.only_generate_future_dist = False return super().train(mode=mode) - def decoder_select_variable(self, future_targets: torch.tensor, future_features: Optional[torch.Tensor]): - batch_size = future_targets.shape[0] - length_future = future_targets.shape[1] - if future_features is None: - x_future = { - 'future_prediction': future_targets.to(self.device), - 'features': torch.zeros((batch_size, length_future, 1), - dtype=future_targets.dtype, device=self.device)} - _, x_future, _, _ = self.variable_selector(x_past=None, - x_future=x_future, - x_static=None, - length_past=0, - length_future=length_future, - batch_size=batch_size, - use_cached_static_contex=True - ) - return x_future + def encoder_select_variable(self, past_targets: torch.tensor, past_features: Optional[torch.Tensor], + length_past: int, + **variable_selector_kwargs): + batch_size = past_targets.shape[0] + past_targets = past_targets.to(self.device) + if past_features is not None: + past_features = past_features.to(self.device) + x_past = [] + if hasattr(self.variable_selector, 'placeholder_features'): + for placehold in self.variable_selector.placeholder_features: + x_past[placehold] = torch.zeros((batch_size, length_past, 1), + dtype=past_targets.dtype, + device=self.device) + for feature_name in self.variable_selector.feature_names: + tensor_idx = self.variable_selector.future_feature_name2tensor_idx[feature_name] + x_past[feature_name] = past_features[:, :, tensor_idx[0]: tensor_idx[1]] + x_past, _, _, _ = self.variable_selector(x_past=x_past, + x_future=None, + x_static=None, + length_past=length_past, + length_future=0, + batch_size=batch_size, + **variable_selector_kwargs, + ) + return x_past def forward(self, past_targets: torch.Tensor, @@ -802,26 +855,20 @@ def forward(self, targets_all = torch.cat([past_targets, future_targets[:, :-1]], dim=1) if self.network_structure.variable_selection: - batch_size = past_targets.shape[0] - length_past = self.window_size + self.n_prediction_steps - if past_features is None: - if past_features is None: - x_past = {'past_targets': targets_all.to(device=self.device), - 'features': torch.zeros((batch_size, length_past, 1), - dtype=targets_all.dtype, device=self.device)} - - x_input, _, _, static_context_initial_hidden = self.variable_selector(x_past=x_past, - x_future=None, - x_static=static_features, - length_past=length_past, - length_future=0, - batch_size=batch_size, - ) + if past_features is not None: + past_features = past_features[:-self.window_size:] + features_all = torch.cat([past_features[:, 1:], future_features], dim=1) + else: + features_all = None + length_past = 
min(self.window_size, past_targets.shape[1]) + self.n_prediction_steps + x_input = self.encoder_select_variable(targets_all, past_features=features_all, length_past=length_past) else: x_input = targets_all if past_features is not None: + past_features = past_features[:, -self.window_size:] features_all = torch.cat([past_features[:, 1:], future_features], dim=1) - x_input = torch.cat([features_all, targets_all], dim=-1) + x_input = torch.cat([targets_all, features_all], dim=-1) + x_input = x_input.to(self.device) x_input = self.embedding(x_input) @@ -836,7 +883,7 @@ def forward(self, if self.only_generate_future_dist: # DeepAR only receives the output of the last encoder - encoder2decoder = encoder2decoder[-1][:, -self.n_prediction_steps:] + encoder2decoder = [encoder2decoder[-1][:, -self.n_prediction_steps:]] net_output = self.head(self.decoder(x_future=None, encoder_output=encoder2decoder)) # DeepAR does not allow tf layers return self.rescale_output(net_output, loc, scale, self.device) @@ -857,22 +904,19 @@ def forward(self, x_past = past_targets if self.network_structure.variable_selection: - batch_size = past_targets.shape[0] - length_past = self.window_size - if past_features is None: - if past_features is None: - x_past = {'past_targets': past_targets.to(device=self.device), - 'features': torch.zeros((batch_size, length_past, 1), - dtype=past_targets.dtype, device=self.device)} - - x_past, _, _, static_context_initial_hidden = self.variable_selector(x_past=x_past, - x_future=None, - x_static=static_features, - length_past=length_past, - length_future=0, - batch_size=batch_size, - cache_static_contex=True - ) + if past_features is not None: + past_features = past_features[:-self.window_size:] + features_all = torch.cat([past_features[:, 1:], future_features[:, :1]], dim=1) + else: + features_all = None + length_past = min(self.window_size, past_targets.shape[1]) + variable_selector_kwargs = dict(cache_static_contex=True, + use_cached_static_contex=False) + x_past = self.encoder_select_variable(x_past, + past_features=features_all, + length_past=length_past, + **variable_selector_kwargs) + else: if past_features is not None: # features is one step ahead of target @@ -884,7 +928,7 @@ def forward(self, features_all = future_features else: features_all = None - x_past = x_past if features_all is None else torch.cat([features_all[:, :self.window_size], x_past], + x_past = x_past if features_all is None else torch.cat([x_past, features_all[:, :self.window_size]], dim=-1) x_past = x_past.to(self.device) @@ -917,28 +961,6 @@ def forward(self, max_lag_seq_length = max(max(self.encoder_lagged_value), self.window_size) else: max_lag_seq_length = self.window_size - # TODO considering padding targets here instead of inside get_lagged function - repeated_past_target = past_targets[:, -max_lag_seq_length:, ].repeat_interleave( - repeats=self.num_samples, - dim=0).squeeze(1) - - repeated_static_feat = static_features.repeat_interleave( - repeats=self.num_samples, dim=0 - ).unsqueeze(dim=1) if static_features is not None else None - - if features_all is not None: - if not self.encoder_has_hidden_states: - # both feature_past and feature_future must exist or not, otherwise deepAR is disabled due to - # data properties!!! 
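The DeepAR prediction path in this hunk draws num_samples Monte-Carlo trajectories per series by duplicating every batch entry with repeat_interleave. A short sketch of what that duplication does to the tensor layout (all shapes made up):

    import torch

    batch_size, history, num_samples = 2, 5, 3
    past_targets = torch.arange(batch_size * history, dtype=torch.float).reshape(batch_size, history, 1)

    # every series is duplicated num_samples times so that independent sample
    # paths can be rolled out in parallel in a single forward pass
    repeated = past_targets.repeat_interleave(repeats=num_samples, dim=0)
    print(repeated.shape)                                   # torch.Size([6, 5, 1])
    print(torch.equal(repeated[0], repeated[2]))            # True: copies of series 0
    print(torch.equal(repeated[3], past_targets[1]))        # True: first copy of series 1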
- time_feature = features_all - else: - time_feature = future_features[:, 1:] if self.n_prediction_steps > 1 else None - else: - time_feature = None - - repeated_time_feat = time_feature.repeat_interleave( - repeats=self.num_samples, dim=0 - ) if future_features is not None else None net_output = self.head(self.decoder(x_future=None, encoder_output=encoder2decoder)) @@ -950,6 +972,25 @@ def forward(self, all_samples.append(next_sample) + # TODO considering padding targets here instead of inside get_lagged function + if self.n_prediction_steps > 1: + repeated_past_target = past_targets[:, -max_lag_seq_length:, ].repeat_interleave( + repeats=self.num_samples, + dim=0).squeeze(1) + + repeated_static_feat = static_features.repeat_interleave( + repeats=self.num_samples, dim=0 + ).unsqueeze(dim=1) if static_features is not None else None + + if future_features is not None: + time_feature = future_features[:, 1:] + else: + time_feature = None + + repeated_time_feat = time_feature.repeat_interleave( + repeats=self.num_samples, dim=0 + ) if time_feature is not None else None + for k in range(1, self.n_prediction_steps): if self.encoder_lagged_input: repeated_past_target = torch.cat([repeated_past_target, all_samples[-1]], dim=1) @@ -957,25 +998,20 @@ def forward(self, else: x_next = next_sample - x_next = x_next.to(self.device) - if self.network_structure.variable_selection: - batch_size = past_targets.shape[0] - if past_features is None: - if past_features is None: - x_next = {'past_targets': x_next, - 'features': torch.zeros((batch_size, 1, 1), - dtype=x_next.dtype, device=self.device)} - - x_next, _, _, _ = self.variable_selector(x_past=x_next, - x_future=None, - x_static=static_features, - length_past=1, - length_future=0, - batch_size=batch_size, - cache_static_contex=False, - use_cached_static_contex=True, - ) + length_past = 1 + variable_selector_kwargs = dict(use_cached_static_contex=True) + if repeated_time_feat is not None: + feature_next = repeated_time_feat[:, [k - 1]] + else: + feature_next = None + x_next = self.encoder_select_variable(x_next, past_features=feature_next, length_past=1, + **variable_selector_kwargs) + + else: + if repeated_time_feat is not None: + x_next = torch.cat([x_next, repeated_time_feat[:, [k - 1]]], dim=-1) + x_next = x_next.to(self.device) encoder2decoder, _ = self.encoder(encoder_input=x_next, additional_input=[None] * self.network_structure.num_blocks, output_seq=False, cache_intermediate_state=True, diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 79d5c8f6c..81c928899 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -1,5 +1,5 @@ from collections import OrderedDict -from typing import Any, Dict, Optional, Union, Tuple, List +from typing import Any, Dict, Optional, Union, Tuple, List, Iterable from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter @@ -56,6 +56,10 @@ def _required_fit_requirements(self): FitRequirement("auto_regressive", (bool,), user_defined=False, dataset_property=False), FitRequirement("target_scaler", (BaseTargetScaler,), user_defined=False, dataset_property=False), FitRequirement("net_output_type", (str,), user_defined=False, dataset_property=False), + FitRequirement("feature_names", (Iterable,), user_defined=False, 
dataset_property=True), + FitRequirement("feature_shapes", (Iterable,), user_defined=False, dataset_property=True), + FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), + FitRequirement('time_feature_names', (Iterable,), user_defined=False, dataset_property=True) ] def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: @@ -68,6 +72,16 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: network_decoder = X['network_decoder'] net_output_type = X['net_output_type'] + + feature_names = X['dataset_properties']['feature_names'] + feature_shapes = X['dataset_properties']['feature_shapes'] + transform_time_features = X['transform_time_features'] + known_future_features = X['dataset_properties']['known_future_features'] + if transform_time_features: + time_feature_names = X['dataset_properties']['time_feature_names'] + else: + time_feature_names = () + network_init_kwargs = dict(network_structure=network_structure, network_embedding=X['network_embedding'], network_encoder=network_encoder, @@ -78,7 +92,12 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: window_size=X['window_size'], dataset_properties=X['dataset_properties'], target_scaler=X['target_scaler'], - output_type=net_output_type,) + output_type=net_output_type, + feature_names=feature_names, + feature_shapes=feature_shapes, + known_future_features=known_future_features, + time_feature_names=time_feature_names, + ) if net_output_type == 'distribution': dist_forecasting_strategy = X['dist_forecasting_strategy'] # type: DisForecastingStrategy diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 1d766807d..76713e5fa 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -1,6 +1,6 @@ from pytorch_forecasting.utils import create_mask -from typing import Any, Dict, Optional, List, Tuple +from typing import Any, Dict, Optional, List, Tuple, Set import torch from torch import nn @@ -119,8 +119,11 @@ def forward(self, attn_input = self.enrichment(network_output) # Attention + encoder_out_length = encoder_output.shape[1] encoder_lengths = torch.where(encoder_lengths < self.window_size, encoder_lengths, self.window_size) + encoder_lengths = torch.where(encoder_lengths > encoder_out_length, encoder_out_length, encoder_lengths) encoder_lengths = encoder_lengths.to(self.device) + mask = self.get_attention_mask(encoder_lengths=encoder_lengths, decoder_length=decoder_length) if mask.shape[-1] < attn_input.shape[1]: # in case that none of the samples has length greater than window_size @@ -172,6 +175,10 @@ def get_attention_mask(self, encoder_lengths: torch.LongTensor, decoder_length: decoder_mask = attend_step >= predict_step # do not attend to steps where data is padded encoder_mask = create_mask(encoder_lengths.max(), encoder_lengths) + + # this is the result of our padding strategy: we pad values at the start of the tensors + encoder_mask = torch.flip(encoder_mask, dims=[1]) + # combine masks along attended time - first encoder and then decoder mask = torch.cat( ( @@ -188,13 +195,85 @@ def __init__(self, network_structure: NetworkStructure, dataset_properties: Dict, network_encoder: Dict[str, EncoderBlockInfo], - auto_regressive: bool = False + 
auto_regressive: bool = False, + feature_names: Tuple[str] = (), + known_future_features: Tuple[str] = tuple(), + feature_shapes: Dict[str, int] = {}, + time_feature_names: Tuple[str] = (), ): super().__init__() first_encoder_output_shape = network_encoder['block_1'].encoder_output_shape[-1] static_input_sizes = dataset_properties['static_features_shape'] self.hidden_size = first_encoder_output_shape + assert set(feature_names) == set(feature_shapes.keys()), "feature_names and feature_shapes must have " \ + "the same variable names" + pre_scalar = {'past_targets': nn.Linear(dataset_properties['output_shape'][-1], self.hidden_size)} + encoder_input_sizes = {'past_targets': self.hidden_size} + decoder_input_sizes = {} + feature_names2tensor_idx = {} + future_feature_name2tensor_idx = {} + idx_tracker = 0 + idx_tracker_future = 0 + if time_feature_names: + for name in time_feature_names: + feature_names2tensor_idx[name] = [idx_tracker, idx_tracker+1] + future_feature_name2tensor_idx[name] = [idx_tracker_future, idx_tracker_future + 1] + idx_tracker += 1 + idx_tracker_future += 1 + pre_scalar[name] = nn.Linear(1, self.hidden_size) + encoder_input_sizes[name] = self.hidden_size + decoder_input_sizes[name] = self.hidden_size + + if feature_names: + for name in feature_names: + feature_shape = feature_shapes[name] + feature_names2tensor_idx[name] = [idx_tracker, idx_tracker + feature_shape] + idx_tracker += feature_shape + pre_scalar[name] = nn.Linear(feature_shape, self.hidden_size) + encoder_input_sizes[name] = self.hidden_size + if name in known_future_features: + decoder_input_sizes[name] = self.hidden_size + + for future_name in known_future_features: + feature_shape = feature_shapes[future_name] + future_feature_name2tensor_idx[future_name] = [idx_tracker_future, idx_tracker_future + feature_shape] + idx_tracker_future += feature_shape + + feature_names = time_feature_names + feature_names + known_future_features = time_feature_names + known_future_features + # if not feature_names or not(known_future_features or time_feature_names): + # Ensure that at least one feature is applied + placeholder_features = 'placeholder_features' + i = 0 + self.placeholder_features = [] + for j in range(1): + while placeholder_features in feature_names or placeholder_features in self.placeholder_features: + i += 1 + placeholder_features = f'placeholder_features_{i}' + if i == 5000: + raise RuntimeError( + "Cannot assign name to placeholder features, please considering rename your features") + + name = placeholder_features + pre_scalar[name] = nn.Linear(1, self.hidden_size) + encoder_input_sizes[name] = self.hidden_size + decoder_input_sizes[name] = self.hidden_size + self.placeholder_features.append(placeholder_features) + + # self.placeholder_features = [placeholder_features] + + self.feature_names = feature_names + self.feature_names2tensor_idx = feature_names2tensor_idx + self.future_feature_name2tensor_idx = future_feature_name2tensor_idx + self.known_future_features = known_future_features + + if auto_regressive: + pre_scalar.update({'future_prediction': nn.Linear(dataset_properties['output_shape'][-1], + self.hidden_size)}) + decoder_input_sizes.update({'future_prediction': self.hidden_size}) + self.pre_scalars = {nn.ModuleDict(pre_scalar)} + self._device = torch.device('cpu') if not dataset_properties['uni_variant']: @@ -206,38 +285,6 @@ def __init__(self, dropout=network_structure.grn_dropout_rate, ) self.static_input_sizes = static_input_sizes - if dataset_properties['uni_variant']: - """ - # 
variable selection for encoder and decoder - encoder_input_sizes = { - 'past_targets': dataset_properties['input_shape'][-1], - 'features': 1 - } - decoder_input_sizes = { - 'features': 1 - } - if auto_regressive: - decoder_input_sizes.update({'future_prediction': dataset_properties['output_shape'][-1]}) - """ - pre_scalar = { - 'past_targets': nn.Linear(dataset_properties['input_shape'][-1], self.hidden_size), - 'features': nn.Linear(1, self.hidden_size), - } - encoder_input_sizes = { - 'past_targets': self.hidden_size, - 'features': self.hidden_size - } - decoder_input_sizes = { - 'features': self.hidden_size - } - if auto_regressive: - pre_scalar.update({'future_prediction': nn.Linear(dataset_properties['output_shape'][-1], - self.hidden_size)}) - decoder_input_sizes.update({'future_prediction': self.hidden_size}) - self.pre_scalars = {nn.ModuleDict(pre_scalar)} - else: - # TODO - pass self.auto_regressive = auto_regressive @@ -566,7 +613,7 @@ def forward(self, else: fx = decoder_i(x, encoder_output=encoder_output[i]) skip_id = f'skip_connection_{block_id}' - if self.skip_connection and skip_id in self.decoder: + if self.skip_connection and skip_id in self.decoder and x is not None: fx = self.decoder[skip_id](fx, x) if cache_intermediate_state: if self.decoder_has_hidden_states[i]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index 7455b5b23..3db9a8815 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -57,7 +57,7 @@ def _build_decoder(self, in_features = encoder_output_shape[-1] num_decoder_output_features = in_features has_local_layer = 'units_local_layer' in self.config - if not has_local_layer: + if not has_local_layer and not self.auto_regressive: in_features += int(np.prod(future_variable_input)) if 'num_layers' in self.config and self.config["num_layers"] > 0: for i in range(1, self.config["num_layers"] + 1): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 3163337a5..fe2e5b917 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -42,7 +42,9 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement('future_feature_shapes', (Tuple,), user_defined=False, dataset_property=True), FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), FitRequirement('network_encoder', (OrderedDict,), user_defined=False, dataset_property=False), - FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False) + FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False), + FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), + FitRequirement('time_feature_transform', (Iterable,), user_defined=False, dataset_property=True) ] 
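The VariableSelector changes earlier in this patch keep a mapping from every time feature and ordinary feature name to the column slice it occupies in the concatenated feature tensor, so each named variable can be routed through its own pre-scaler. A standalone sketch of building and using such a mapping (the feature names and widths below are made up):

    import torch

    time_feature_names = ('time_feature_MonthOfYear',)        # one column per time feature
    feature_shapes = {'price': 1, 'weekday_onehot': 7}         # ordinary features and their widths

    name2idx, offset = {}, 0
    for name in time_feature_names:                            # time features come first
        name2idx[name] = (offset, offset + 1)
        offset += 1
    for name, width in feature_shapes.items():                 # then the remaining features
        name2idx[name] = (offset, offset + width)
        offset += width

    past_features = torch.randn(4, 20, offset)                 # [batch, time, feature] tensor
    per_variable = {name: past_features[:, :, lo:hi] for name, (lo, hi) in name2idx.items()}
    print({name: tuple(t.shape) for name, t in per_variable.items()})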
@property @@ -78,9 +80,15 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: network_structure = X['network_structure'] variable_selection = network_structure.variable_selection + if 'n_decoder_output_features' not in X: future_feature_shapes = X['dataset_properties']['future_feature_shapes'] + if X['transform_time_features']: + n_time_feature_transform = len(X['dataset_properties']['time_feature_transform']) + else: + n_time_feature_transform = 0 + if self.block_number == network_structure.num_blocks: self.is_last_decoder = True @@ -93,6 +101,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: future_in_features += len(self.lagged_value) * output_shape[-1] elif self.decoder_properties().recurrent: future_in_features += output_shape[-1] + future_in_features += n_time_feature_transform future_variable_input = (self.n_prediction_heads, future_in_features) else: future_variable_input = (self.n_prediction_heads, X['n_decoder_output_features']) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index 394a88f94..f75f9930e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -53,7 +53,9 @@ def _required_fit_arguments(self) -> List[FitRequirement]: FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('static_features_shape', (int,), user_defined=True, dataset_property=True), - FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False) + FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False), + FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), + FitRequirement('time_feature_transform', (Iterable,), user_defined=False, dataset_property=True) ] def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: @@ -73,19 +75,22 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: X_train = X_train[:1, np.newaxis, ...] 
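Both the decoder hunk above and the encoder hunk below widen their input by one column per gluonts TimeFeature when transform_time_features is active. A small sketch of generating those features from a frequency string (this assumes gluonts is installed; depending on the gluonts version the transforms return numpy arrays or pandas objects, so the result is coerced explicitly):

    import numpy as np
    import pandas as pd
    from gluonts.time_feature import time_features_from_frequency_str

    freq = '1H'
    transforms = time_features_from_frequency_str(freq)        # e.g. hour-of-day, day-of-week, ...
    n_time_feature_transform = len(transforms)                  # extra input columns per time step

    dates = pd.date_range(start='1900-01-01', periods=24, freq=freq)
    time_features = np.vstack([np.asarray(t(dates), dtype=float) for t in transforms]).T
    print(n_time_feature_transform, time_features.shape)        # (24, n_time_feature_transform)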
X_train = transforms(X_train) input_shape = np.concatenate(X_train).shape[1:] + if X['transform_time_features']: + n_time_feature_transform = len(X['dataset_properties']['time_feature_transform']) + else: + n_time_feature_transform = 0 if 'network_embedding' in X.keys(): input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) - in_features = input_shape[-1] - variable_selection = X['network_structure'].variable_selection if variable_selection: in_features = self.n_encoder_output_feature() elif self.encoder_properties().lagged_input and hasattr(self, 'lagged_value'): - in_features = len(self.lagged_value) * output_shape[-1] + input_shape[-1] + static_features_shape + in_features = len(self.lagged_value) * output_shape[-1] + \ + input_shape[-1] + static_features_shape + n_time_feature_transform else: - in_features = output_shape[-1] + input_shape[-1] + static_features_shape + in_features = output_shape[-1] + input_shape[-1] + static_features_shape + n_time_feature_transform input_shape = (X['window_size'], in_features) else: diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 9641678ea..ca9983605 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -13,7 +13,7 @@ import torchvision -from autoPyTorch.datasets.time_series_dataset import TimeSeriesSequence +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence from autoPyTorch.utils.common import ( HyperparameterSearchSpace, custom_collate_fn, @@ -47,6 +47,7 @@ def __init__(self, num_batches_per_epoch: Optional[int] = 50, n_prediction_steps: int = 1, sample_strategy='seq_uniform', + transform_time_features=False, random_state: Optional[np.random.RandomState] = None) -> None: """ initialize a dataloader @@ -79,6 +80,10 @@ def __init__(self, self.known_future_features = None self._is_uni_variant = False + self.transform_time_features = transform_time_features + self.freq = "1Y" + self.time_feature_transform = [] + def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: """ Fits a component by using an input dictionary with pre-requisites @@ -106,6 +111,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: sample_interval = X.get('sample_interval', 1) padding_value = X.get('required_padding_value', 0.0) + if sample_interval > 1: # for lower resolution, window_size should be smaller self.window_size = (self.window_size - 1) // sample_interval + 1 @@ -113,6 +119,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: max_lagged_value = max(X['dataset_properties'].get('lagged_value', [np.inf])) max_lagged_value += self.window_size + self.n_prediction_steps + self.padding_collector = PadSequenceCollector(self.window_size, sample_interval, padding_value, max_lagged_value) # this value corresponds to budget type num_sequence @@ -133,6 +140,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.val_transform, train=False, ) + datamanager.transform_time_features = self.transform_time_features if X['dataset_properties']["is_small_preprocess"]: # This parameter indicates that the data has been pre-processed for speed # Overwrite the datamanager with the pre-processes 
data @@ -143,6 +151,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self._is_uni_variant = X['dataset_properties']['uni_variant'] + self.freq = X['dataset_properties']['freq'] + self.time_feature_transform = X['dataset_properties']['time_feature_transform'] + train_dataset, val_dataset = datamanager.get_dataset_for_training(split_id=X['split_id']) train_split, test_split = datamanager.splits[X['split_id']] @@ -233,7 +244,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: return self def transform(self, X: Dict) -> Dict: - X.update({"window_size": self.window_size}) + X.update({"window_size": self.window_size, + 'transform_time_features': self.transform_time_features}) return super().transform(X) def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transforms.Compose: @@ -272,69 +284,33 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd applying the transformations meant to validation objects This is a lazy loaded test set, each time only one piece of series """ - if self._is_uni_variant: - if isinstance(X, TimeSeriesSequence): - X.update_transform(self.test_transform, train=False) - dataset = [X] - elif isinstance(X, Sequence): - dataset = [] - if isinstance(X[0], TimeSeriesSequence): - for X_seq in X: - X_seq.update_transform(self.test_transform, train=False) - dataset.append(X_seq) - else: - for X_seq in X: - seq = TimeSeriesSequence( - X=None, Y=X_seq, - # This dataset is used for loading test data in a batched format - train_transforms=self.test_transform, - val_transforms=self.test_transform, - n_prediction_steps=0, - static_features=self.static_features, - known_future_features=self.known_future_features, - only_has_past_targets=True, - ) - dataset.append(seq) - else: - raise NotImplementedError(f"Unsupported type of input: {type(y)}") + if isinstance(X, TimeSeriesSequence): + X = [X] + if isinstance(X, List): + for x_seq in X: + if not isinstance(x_seq, TimeSeriesSequence): + raise NotImplementedError('Test Set must be a TimeSeriesSequence or a' + ' list of time series objects!') + if x_seq.freq != self.freq: + # WE need to recompute the cached time features (However, this should not happen) + x_seq._cached_time_features = None + + x_seq.update_transform(self.test_transform, train=False) + x_seq.update_attribute(freq=self.freq, + transform_time_features=self.transform_time_features, + time_feature_transform=self.time_feature_transform, + static_features=self.static_features, + known_future_features=self.known_future_features, + ) + if self.transform_time_features: + x_seq.compute_time_features() + + x_seq.freq = self.freq + x_seq.update_transform(self.test_transform, train=False) else: - if y is None: - # TODO consider other circumstances! 
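# A short, self-contained illustration (the numbers are made up) of the
# window-size adjustment applied above when a series is sub-sampled by
# sample_interval: the integer formula acts like a ceiling division, so the
# reduced window still covers roughly the same span at the original resolution.
window_size, sample_interval = 10, 3
adjusted_window_size = (window_size - 1) // sample_interval + 1
print(adjusted_window_size)  # 4 -> sampled steps 0, 3, 6, 9 span the original 10-step window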
- y = X['past_targets'] - X = X['features'] - - # TODO replace with strings - if isinstance(y, (np.ndarray, torch.Tensor)): - if isinstance(y, torch.Tensor): - y = y.numpy() - X = X.numpy() - if y.ndim == 1: - y = [y] - if X.ndim == 1: - X = [X] - - if isinstance(y, Sequence): - dataset = [] - if isinstance(y[0], TimeSeriesSequence): - for y_seq in y: - y_seq.update_transform(self.test_transform, train=False) - dataset.append(y_seq) - else: - for X_seq, y_seq in zip(X, y): - seq = TimeSeriesSequence( - X=X_seq, Y=y_seq, - # This dataset is used for loading test data in a batched format - train_transforms=self.test_transform, - val_transforms=self.test_transform, - n_prediction_steps=0, - static_features=self.static_features, - known_future_features=self.known_future_features, - only_has_past_targets=True, - ) - dataset.append(seq) - else: - raise NotImplementedError(f"Unsupported type of input: {type(y)}") + raise NotImplementedError('Unsupported data type for time series data loader!') + dataset = X dataset_test = TestSequenceDataset(dataset, train=False) return torch.utils.data.DataLoader( @@ -395,7 +371,11 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, backcast_period: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='backcast_period', value_range=(1, 7), - default_value=2) + default_value=2), + transform_time_features: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter='transform_time_features', + value_range=(True, False), + default_value=False) ) -> ConfigurationSpace: """ hyperparameter search space for forecasting dataloader. Forecasting dataloader construct the window size in two @@ -418,6 +398,7 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, multiple of n_prediction_steps) backcast_period (int): activate if backcast is activate, the window size is then computed with backcast_period * n_prediction_steps + transform_time_features (bool) if time feature trasnformation is applied Returns: cs: Configuration Space @@ -456,6 +437,10 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, backcast_period_cond = EqualsCondition(backcast_period, backcast, True) cs.add_conditions([window_size_cond, backcast_period_cond]) + time_feature_transform = dataset_properties.get('time_feature_transform', []) + if time_feature_transform: + add_hyperparameter(cs, transform_time_features, CategoricalHyperparameter) + return cs def __str__(self) -> str: diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py index 08bab74f4..2c4d781b2 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py @@ -74,14 +74,14 @@ def __init__(self, window_size: int, sample_interval, target_padding_value: floa self.target_padding_value = target_padding_value self.seq_max_length = seq_max_length - def __call__(self, batch, sample_interval=1, padding_value=0.0): + def __call__(self, batch, sample_interval=1, seq_minimal_length=1, padding_value=0.0): elem = batch[0] elem_type = type(elem) if isinstance(elem, torch.Tensor): seq = pad_sequence_with_minimal_length(batch, - seq_minimal_length=self.window_size, - seq_max_length=self.seq_max_length, - batch_first=True, padding_value=padding_value) # type: torch.Tensor + seq_minimal_length=seq_minimal_length, + 
seq_max_length=self.seq_max_length, + batch_first=True, padding_value=padding_value) # type: torch.Tensor if sample_interval > 1: subseq_length = seq.shape[1] @@ -109,8 +109,13 @@ def __call__(self, batch, sample_interval=1, padding_value=0.0): return batch elif isinstance(elem, collections.abc.Mapping): # only past targets and features needs to be transformed - return {key: self([d[key] for d in batch]) if "past" not in key - else self([d[key] for d in batch], self.sample_interval, self.target_padding_value) for key in elem} + + return {key: self([d[key] for d in batch]) if "past" not in key else self([d[key] for d in batch], + self.sample_interval, + self.window_size, + self.target_padding_value) for key + in elem} + elif elem is None: return None raise TypeError(f"Unsupported data type {elem_type}") diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 53902cb46..fcc990103 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -89,6 +89,8 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, self.model.train() outputs_data = list() targets_data = list() + import time + time_start = time.time() for step, (data, targets) in enumerate(train_loader): if self.budget_tracker.is_max_time_reached(): @@ -114,6 +116,10 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, self._scheduler_step(step_interval=StepIntervalUnit.epoch, loss=loss_sum / N) + time_end = time.time() + print(f'time used epoch {epoch}: {time_end - time_start}') + print(f'loss: {loss_sum / N}') + if self.metrics_during_training: return loss_sum / N, self.compute_metrics(outputs_data, targets_data) else: @@ -145,8 +151,19 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor past_target = data['past_targets'].float() encoder_lengths = data['encoder_lengths'] + past_features = data["past_features"] + if past_features is not None: + past_features = past_features.float() + future_features = data['future_features'] + if future_features is not None: + future_features = future_features.float() + static_features = data['static_features'] + if static_features is not None: + static_features = static_features.float() + future_targets = self.cast_targets(future_targets) + if isinstance(self.criterion, MASELoss): self.criterion.set_mase_coefficient(data['mase_coefficient'].float().to(self.device)) @@ -182,7 +199,12 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor else: past_target, criterion_kwargs = self.data_preparation(past_target, future_targets.to(self.device)) - outputs = self.model(past_targets=past_target, future_targets=future_targets, encoder_lengths=encoder_lengths) + outputs = self.model(past_targets=past_target, + past_features=past_features, + future_features=future_features, + static_features=static_features, + future_targets=future_targets, + encoder_lengths=encoder_lengths) loss_func = self.criterion_preparation(**criterion_kwargs) @@ -228,6 +250,16 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, past_target = data['past_targets'].float() encoder_lengths = data['encoder_lengths'] + past_features = data["past_features"] + if past_features is not None: + 
past_features = past_features.float() + future_features = data['future_features'] + if future_features is not None: + future_features = future_features.float() + static_features = data['static_features'] + if static_features is not None: + static_features = static_features.float() + mase_coefficients.append(data['mase_coefficient']) if isinstance(self.criterion, MASELoss): self.criterion.set_mase_coefficient(data['mase_coefficient'].float().to(self.device)) @@ -239,9 +271,18 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) if isinstance(self.model, (ForecastingDeepARNet, ForecastingSeq2SeqNet)): - outputs = self.model(past_targets=past_target, future_targets=future_targets, encoder_lengths=encoder_lengths) + outputs = self.model(past_targets=past_target, + past_features=past_features, + future_targets=future_targets, + future_features=future_features, + static_features=static_features, + encoder_lengths=encoder_lengths) else: - outputs = self.model(past_targets=past_target, encoder_lengths=encoder_lengths) + outputs = self.model(past_targets=past_target, + past_features=past_features, + future_features=future_features, + static_features=static_features, + encoder_lengths=encoder_lengths) # prepare future_targets = future_targets.to(self.device) diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 05f375b21..1ba8f38eb 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -257,16 +257,14 @@ def _get_hyperparameter_search_space(self, cs.add_forbidden_clauses(forbidden_losses_all) - - # NBEATS network_encoder_hp = cs.get_hyperparameter("network_backbone:__choice__") forbidden_NBEATS = [] - encoder_non_BEATS = [choice for choice in network_encoder_hp.choices if choice != 'flat_encoder'] + encoder_non_flat = [choice for choice in network_encoder_hp.choices if choice != 'flat_encoder'] loss_non_regression = [choice for choice in hp_loss.choices if choice != 'RegressionLoss'] data_loader_backcast = cs.get_hyperparameter('data_loader:backcast') - forbidden_encoder_NBEATS = ForbiddenInClause(network_encoder_hp, encoder_non_BEATS) + forbidden_encoder_non_flat = ForbiddenInClause(network_encoder_hp, encoder_non_flat) forbidden_loss_non_regression = ForbiddenInClause(hp_loss, loss_non_regression) forbidden_backcast = ForbiddenEqualsClause(data_loader_backcast, True) @@ -278,9 +276,17 @@ def _get_hyperparameter_search_space(self, ForbiddenEqualsClause(hp_flat_encoder, 'NBEATSEncoder'), forbidden_loss_non_regression) ) + transform_time_features = "data_loader:transform_time_features" + if transform_time_features in cs: + hp_ttf = cs.get_hyperparameter(transform_time_features) + forbidden_NBEATS.append(ForbiddenAndConjunction( + ForbiddenEqualsClause(hp_flat_encoder, 'NBEATSEncoder'), + ForbiddenEqualsClause(hp_ttf, True)) + ) + forbidden_NBEATS.append(ForbiddenAndConjunction( forbidden_backcast, - forbidden_encoder_NBEATS + forbidden_encoder_non_flat )) cs.add_forbidden_clauses(forbidden_NBEATS) From 5746541d6d757f48ebd2a9196dbde2f42e177ba7 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 16 Mar 2022 21:25:46 +0100 Subject: [PATCH 194/347] fix loss computation in QuantileLoss --- .../forecasting_target_scaling/utils.py | 1 + autoPyTorch/pipeline/components/training/losses.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git 
a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py index b155f7107..81b78f284 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py @@ -22,6 +22,7 @@ def transform(self, past_targets: torch.Tensor, future_targets: Optional[torch.T if self.mode == "standard": loc = torch.mean(past_targets, dim=-2, keepdim=True) scale = torch.std(past_targets, dim=-2, keepdim=True) + scale[scale == 0.0] = 1.0 if future_targets is not None: future_targets = (future_targets - loc) / scale diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index 6d688d498..4ebafa36e 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -87,7 +87,7 @@ def forward(self, class QuantileLoss(Loss): __constants__ = ['reduction'] - def __init__(self, reduction: str = 'mean', quantiles: List[float] = [0.5]) -> None: + def __init__(self, reduction: str = 'mean', quantiles: List[float] = [0.5], loss_weights=None) -> None: super(QuantileLoss, self).__init__(reduction) self.quantiles = quantiles @@ -101,9 +101,10 @@ def forward(self, losses_all = [] for q, y_pred in zip(self.quantiles, input): diff = target_tensor - y_pred + loss_q = torch.max(q * diff, (q - 1) * diff) - losses_all.append(loss_q.unsqueeze(0)) - losses_all = torch.concat(losses_all) + losses_all.append(loss_q.unsqueeze(-1)) + losses_all = torch.mean(torch.concat(losses_all, dim=-1), dim=-1) if self.reduction == 'mean': return losses_all.mean() From b1fbece260fea7a22c1b9dcc1e8249197afcf4de Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 18 Mar 2022 18:43:45 +0100 Subject: [PATCH 195/347] fixed scaler computation --- autoPyTorch/datasets/time_series_dataset.py | 9 +- ...time_series_forecasting_train_evaluator.py | 5 - .../base_target_scaler.py | 5 +- .../forecasting_target_scaling/utils.py | 146 +++++++++++++----- .../setup/network/forecasting_architecture.py | 70 ++++++--- .../setup/network/forecasting_network.py | 4 +- .../forecasting_backbone/cells.py | 19 +-- .../training/data_loader/time_series_util.py | 6 +- .../pipeline/components/training/losses.py | 12 +- .../forecasting_base_trainer.py | 15 +- 10 files changed, 189 insertions(+), 102 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index ea5282376..8ed4fbb08 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -196,12 +196,15 @@ def __getitem__(self, index: int, train: bool = True) \ past_target = targets[:index + 1] past_target = torch.from_numpy(past_target) + # TODO combine with imputer! 
+ past_observed_values = torch.ones([past_target.shape[0], 1], dtype=torch.bool) + return {"past_targets": past_target, "past_features": past_features, "future_features": future_features, "static_features": self.static_features, "mase_coefficient": self.mase_coefficient, - 'encoder_lengths': past_target.shape[0], + 'past_observed_values': past_observed_values, 'decoder_lengths': None if targets_future is None else targets_future.shape[0]}, targets_future def __len__(self) -> int: @@ -327,7 +330,7 @@ def __init__(self, lagged_value: Optional[List[int]] = None, n_prediction_steps: int = 1, dataset_name: Optional[str] = None, - normalize_y: bool = True, + normalize_y: bool = False, static_features: Optional[np.ndarray] = None, ): """ @@ -1051,7 +1054,7 @@ def create_refit_set(self) -> "TimeSeriesForecastingDataset": refit_set.splits = refit_set.get_splits_from_resampling_strategy() return refit_set - def generatet_test_seqs(self) -> List[TimeSeriesSequence]: + def generate_test_seqs(self) -> List[TimeSeriesSequence]: test_sets = copy.deepcopy(self.datasets) for test_seq in test_sets: test_seq.is_test_set = True diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 411b0cdc4..654a4e118 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -89,11 +89,6 @@ def __init__(self, backend: Backend, queue: Queue, self.max_budget = max_budget self.min_num_test_instances = min_num_test_instances - import os - os.system("sh -c \"scontrol -d show job $SLURM_JOB_ID\"") - os.system("nvidia-smi.user") - - def fit_predict_and_loss(self) -> None: """Fit, predict and compute the loss for cross-validation and holdout""" diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py index 0e37c488e..45512296a 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py @@ -54,6 +54,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: def __call__(self, past_target: Union[np.ndarray, torch.tensor], + past_observed_values: Optional[torch.BoolTensor] = None, future_targets: Optional[Union[np.ndarray, torch.Tensor]]=None, ) -> Union[np.ndarray, torch.tensor]: @@ -64,5 +65,7 @@ def __call__(self, if len(past_target.shape) == 2: # expand batch dimension when called on a single record past_target = past_target[np.newaxis, ...] 
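# A minimal, runnable sketch of the idea behind handing past_observed_values to
# the target scaler: unobserved (padded) steps are excluded from the statistics
# and left unscaled. This is an assumption-level illustration with made-up data,
# not the exact implementation added below; shapes are assumed to be [batch, time, 1].
import torch

def masked_standard_scale(past_targets: torch.Tensor, past_observed_values: torch.Tensor):
    observed = past_observed_values.to(past_targets.dtype)
    n_obs = observed.sum(dim=1, keepdim=True)
    loc = (past_targets * observed).sum(dim=1, keepdim=True) / n_obs
    var = (((past_targets - loc) * observed) ** 2).sum(dim=1, keepdim=True) / (n_obs - 1).clamp(min=1)
    scale = torch.sqrt(var)
    scale = torch.where(scale == 0.0, torch.ones_like(scale), scale)
    scaled = torch.where(past_observed_values.bool(), (past_targets - loc) / scale, past_targets)
    return scaled, loc, scale

targets = torch.tensor([[[0.0], [0.0], [2.0], [4.0]]])         # first two steps are padding
observed = torch.tensor([[[False], [False], [True], [True]]])
scaled, loc, scale = masked_standard_scale(targets, observed)  # loc = 3.0, scale ~ 1.414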
- past_target, future_targets, loc, scale = self.scaler.transform(past_target, future_targets) + past_target, future_targets, loc, scale = self.scaler.transform(past_target, + past_observed_values, + future_targets) return past_target, future_targets, loc, scale diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py index 81b78f284..fa25ae205 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py @@ -11,54 +11,116 @@ class TargetScaler(BaseEstimator): """ To accelerate training, this scaler is only applied under trainer (after the data is loaded by dataloader) """ + def __init__(self, mode: str): self.mode = mode def fit(self, X: Dict, y: Any = None) -> "TimeSeriesScalerBatch": return self - def transform(self, past_targets: torch.Tensor, future_targets: Optional[torch.Tensor]=None) -> \ + def transform(self, + past_targets: torch.Tensor, + past_observed_values: torch.BoolTensor, + future_targets: Optional[torch.Tensor] = None) -> \ Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: - if self.mode == "standard": - loc = torch.mean(past_targets, dim=-2, keepdim=True) - scale = torch.std(past_targets, dim=-2, keepdim=True) - - scale[scale == 0.0] = 1.0 - if future_targets is not None: - future_targets = (future_targets - loc) / scale - return (past_targets - loc) / scale, future_targets, loc, scale - - elif self.mode == "min_max": - min_ = torch.min(past_targets, dim=-2, keepdim=True)[0] - max_ = torch.max(past_targets, dim=-2, keepdim=True)[0] - - diff_ = max_ - min_ - loc = min_ - 1e-10 - scale = diff_ - scale[scale == 0.0] = 1.0 - if future_targets is not None: - future_targets = (future_targets - loc) / scale - return (past_targets - loc) / scale, future_targets, loc, scale - - elif self.mode == "max_abs": - max_abs_ = torch.max(torch.abs(past_targets), dim=-2, keepdim=True)[0] - max_abs_[max_abs_ == 0.0] = 1.0 - scale = max_abs_ - if future_targets is not None: - future_targets = future_targets / scale - return past_targets / scale, future_targets, None, scale - - elif self.mode == 'mean_abs': - mean_abs = torch.mean(torch.abs(past_targets), dim=1, keepdim=True) - mean_abs[mean_abs == 0.0] = 1.0 - scale = mean_abs - if future_targets is not None: - future_targets = future_targets / scale - return past_targets / scale, future_targets, None, scale - - - elif self.mode == "none": - return past_targets, future_targets, None, None + if past_observed_values is None or torch.all(past_observed_values): + if self.mode == "standard": + loc = torch.mean(past_targets, dim=1, keepdim=True) + scale = torch.std(past_targets, dim=1, keepdim=True) + + scale[scale == 0.0] = 1.0 + if future_targets is not None: + future_targets = (future_targets - loc) / scale + return (past_targets - loc) / scale, future_targets, loc, scale + + elif self.mode == "min_max": + min_ = torch.min(past_targets, dim=1, keepdim=True)[0] + max_ = torch.max(past_targets, dim=1, keepdim=True)[0] + + diff_ = max_ - min_ + loc = min_ - 1e-10 + scale = diff_ + scale[scale == 0.0] = 1.0 + if future_targets is not None: + future_targets = (future_targets - loc) / scale + return (past_targets - loc) / scale, future_targets, loc, scale + + elif 
self.mode == "max_abs": + max_abs_ = torch.max(torch.abs(past_targets), dim=1, keepdim=True)[0] + max_abs_[max_abs_ == 0.0] = 1.0 + scale = max_abs_ + if future_targets is not None: + future_targets = future_targets / scale + return past_targets / scale, future_targets, None, scale + elif self.mode == 'mean_abs': + mean_abs = torch.mean(torch.abs(past_targets), dim=1, keepdim=True) + mean_abs[mean_abs == 0.0] = 1.0 + scale = mean_abs + if future_targets is not None: + future_targets = future_targets / scale + return past_targets / scale, future_targets, None, scale + + elif self.mode == "none": + return past_targets, future_targets, None, None + + else: + raise ValueError(f"Unknown mode {self.mode} for Forecasting scaler") else: - raise ValueError(f"Unknown mode {self.mode} for Forecasting scaler") \ No newline at end of file + valid_past_targets = past_observed_values * past_targets + valid_past_obs = torch.sum(past_observed_values, dim=1, keepdim=True) + if self.mode == "standard": + dfredom = 1 + loc = torch.sum(valid_past_targets, dim=1, keepdim=True) / valid_past_obs + scale = torch.sum(torch.square((valid_past_targets - loc) * past_observed_values), dim=1, keepdim=True) + scale /= valid_past_obs - dfredom + scale = torch.sqrt(scale) + + offset_targets = past_targets - loc + # ensure that all the targets are scaled properly + scale = torch.where(torch.logical_or(scale == 0.0, scale == torch.nan), offset_targets[:, [-1]], scale) + scale[scale == 0.0] = 1.0 + + if future_targets is not None: + future_targets = (future_targets - loc) / scale + + scaled_past_targets = torch.where(past_observed_values, offset_targets / scale, past_targets) + return scaled_past_targets, future_targets, loc, scale + + elif self.mode == "min_max": + obs_mask = ~past_observed_values + min_masked_past_targets = past_targets.masked_fill(obs_mask, value=torch.inf) + max_masked_past_targets = past_targets.masked_fill(obs_mask, value=-torch.inf) + min_ = torch.min(min_masked_past_targets, dim=1, keepdim=True)[0] + max_ = torch.max(max_masked_past_targets, dim=1, keepdim=True)[0] + + diff_ = max_ - min_ + loc = min_ - 1e-10 + scale = torch.where(diff_ == 0, past_targets[:, [-1]], diff_) + scale[scale == 0.0] = 1.0 + + if future_targets is not None: + future_targets = (future_targets - loc) / scale + return (past_targets - loc) / scale, future_targets, loc, scale + + elif self.mode == "max_abs": + max_abs_ = torch.max(torch.abs(valid_past_targets), dim=1, keepdim=True)[0] + scale = torch.where(max_abs_ == 0, past_targets[:, [-1]], max_abs_) + if future_targets is not None: + future_targets = future_targets / scale + return past_targets / scale, future_targets, None, scale + + elif self.mode == 'mean_abs': + mean_abs = torch.sum(torch.abs(valid_past_targets), dim=1, keepdim=True) / valid_past_obs + scale = torch.where(mean_abs == 0.0, valid_past_targets[:, [-1]], mean_abs) + # in case that all values in the tensor is 0 + scale[scale == 0.0] = 1.0 + if future_targets is not None: + future_targets = future_targets / scale + return past_targets / scale, future_targets, None, scale + + elif self.mode == "none": + return past_targets, future_targets, None, None + + else: + raise ValueError(f"Unknown mode {self.mode} for Forecasting scaler") diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 9c2a971a3..6d484b68c 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ 
b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -316,7 +316,7 @@ def forward(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - encoder_lengths: Optional[torch.Tensor] = None, + past_observed_values: Optional[torch.Tensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): raise NotImplementedError @@ -352,6 +352,7 @@ def repeat_intermediate_values(self, class ForecastingNet(AbstractForecastingNet): def pre_processing(self, past_targets: torch.Tensor, + past_observed_values: torch.BoolTensor, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, @@ -360,7 +361,10 @@ def pre_processing(self, variable_selector_kwargs: Dict = {}, ): if self.encoder_lagged_input: - past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler(past_targets[:, -self.window_size:]) + past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( + past_targets[:, -self.window_size:], + past_observed_values[:, -self.window_size:] + ) past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) x_past, self.cached_lag_mask_encoder = get_lagged_subsequences(past_targets, self.window_size, @@ -369,7 +373,8 @@ def pre_processing(self, else: if self.window_size < past_targets.shape[1]: past_targets = past_targets[:, -self.window_size:] - past_targets, _, loc, scale = self.target_scaler(past_targets) + past_observed_values = past_observed_values[:, -self.window_size:] + past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_values) x_past = past_targets if self.network_structure.variable_selection: @@ -440,11 +445,12 @@ def forward(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - encoder_lengths: Optional[torch.LongTensor] = None, + past_observed_values: Optional[torch.BoolTensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): x_past, x_future, x_static, loc, scale, static_context_initial_hidden = self.pre_processing( past_targets=past_targets, + past_observed_values=past_observed_values, past_features=past_features, future_features=future_features, static_features=static_features, @@ -460,11 +466,14 @@ def forward(self, if self.has_temporal_fusion: decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output, - encoder_lengths=encoder_lengths, + past_observed_values=past_observed_values, decoder_length=self.n_prediction_steps, static_embedding=x_static ) + output = self.head(decoder_output) + + return self.rescale_output(output, loc, scale, self.device) def pred_from_net_output(self, net_output): @@ -499,13 +508,13 @@ def predict(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - encoder_lengths: Optional[torch.LongTensor] = None, + past_observed_values: Optional[torch.BoolTensor] = None, ): net_output = self(past_targets=past_targets, past_features=past_features, future_features=future_features, static_features=static_features, - encoder_lengths=encoder_lengths) + past_observed_values=past_observed_values) return self.pred_from_net_output(net_output) @@ -558,10 +567,11 @@ def forward(self, past_features: Optional[torch.Tensor] = None, future_features: 
Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - encoder_lengths: Optional[torch.Tensor] = None, + past_observed_values: Optional[torch.Tensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): x_past, x_future, x_static, loc, scale, static_context_initial_hidden = self.pre_processing( past_targets=past_targets, + past_observed_values=past_observed_values, past_features=past_features, future_features=future_features, static_features=static_features, @@ -598,7 +608,7 @@ def forward(self, if self.has_temporal_fusion: decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output, - encoder_lengths=encoder_lengths, + past_observed_values=past_observed_values, decoder_length=self.n_prediction_steps, static_embedding=x_static ) @@ -649,7 +659,7 @@ def forward(self, decoder_output_all = decoder_output decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output_all, - encoder_lengths=encoder_lengths, + past_observed_values=past_observed_values, decoder_length=idx_pred + 1, static_embedding=x_static )[:, -1:] @@ -679,12 +689,12 @@ def forward(self, is_hidden_states=self.encoder.encoder_has_hidden_states, repeats=self.num_samples) - intermediate_values = self.repeat_intermediate_values([encoder_output, encoder_lengths], + intermediate_values = self.repeat_intermediate_values([encoder_output, past_observed_values], is_hidden_states=[False, False], repeats=self.num_samples) encoder_output = intermediate_values[0] - encoder_lengths = intermediate_values[1] + past_observed_values = intermediate_values[1] if self.decoder_lagged_input: max_lag_seq_length = max(self.decoder_lagged_value) + 1 @@ -737,7 +747,7 @@ def forward(self, decoder_output_all = decoder_output decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output_all, - encoder_lengths=encoder_lengths, + past_observed_values=past_observed_values, decoder_length=idx_pred + 1, static_embedding=x_static, )[:, -1:] @@ -764,13 +774,13 @@ def predict(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - encoder_lengths: Optional[torch.LongTensor] = None, + past_observed_values: Optional[torch.BoolTensor] = None, ): net_output = self(past_targets=past_targets, past_features=past_features, future_features=future_features, static_features=static_features, - encoder_lengths=encoder_lengths) + past_observed_values=past_observed_values) if self.output_type == 'regression': return self.pred_from_net_output(net_output) else: @@ -832,13 +842,17 @@ def forward(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - encoder_lengths: Optional[torch.Tensor] = None, + past_observed_values: Optional[torch.Tensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): if self.training: if self.encoder_lagged_input: past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( - past_targets[:, -self.window_size:]) + past_targets[:, -self.window_size:], + past_observed_values[:, -self.window_size:] + ) + past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) + future_targets = self.scale_value(future_targets, loc, scale) targets_all = torch.cat([past_targets, future_targets[:, :-1]], dim=1) @@ -850,7 +864,8 @@ def forward(self, else: if 
self.window_size < past_targets.shape[1]: past_targets = past_targets[:, -self.window_size:] - past_targets, _, loc, scale = self.target_scaler(past_targets) + past_observed_values = past_observed_values[:, -self.window_size:] + past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_values) future_targets = self.scale_value(future_targets, loc, scale) targets_all = torch.cat([past_targets, future_targets[:, :-1]], dim=1) @@ -890,8 +905,12 @@ def forward(self, else: if self.encoder_lagged_input: past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( - past_targets[:, -self.window_size:]) + past_targets[:, -self.window_size:], + past_observed_values[:, -self.window_size:], + ) + past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) + x_past, self.cached_lag_mask_encoder_test = get_lagged_subsequences(past_targets, self.window_size, self.encoder_lagged_value, @@ -899,8 +918,8 @@ def forward(self, else: if self.window_size < past_targets.shape[1]: past_targets = past_targets[:, -self.window_size:] - - past_targets, _, loc, scale = self.target_scaler(past_targets) + past_observed_values = past_observed_values[:, -self.window_size] + past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_values) x_past = past_targets if self.network_structure.variable_selection: @@ -1040,13 +1059,13 @@ def predict(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - encoder_lengths: Optional[torch.LongTensor] = None, + past_observed_values: Optional[torch.BoolTensor] = None, ): net_output = self(past_targets=past_targets, past_features=past_features, future_features=future_features, static_features=static_features, - encoder_lengths=encoder_lengths) + past_observed_values=past_observed_values) return net_output @@ -1059,11 +1078,12 @@ def forward(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - encoder_lengths: Optional[torch.Tensor] = None, + past_observed_values: Optional[torch.Tensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): if self.window_size < past_targets.shape[1]: past_targets = past_targets[:, -self.window_size:] - past_targets, _, loc, scale = self.target_scaler(past_targets) + past_observed_values = past_observed_values[:, -self.window_size] + past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_values) past_targets = past_targets.to(self.device) batch_size = past_targets.shape[0] diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 81c928899..9fe307221 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -148,7 +148,7 @@ def predict(self, loader: torch.utils.data.DataLoader, past_features = X_batch['past_features'] future_features = X_batch["future_features"] static_features = X_batch["static_features"] - encoder_lengths = X_batch['encoder_lengths'] + past_observed_values = X_batch['past_observed_values'] if past_targets.ndim == 2: past_targets = past_targets.unsqueeze(-1) @@ -162,7 +162,7 @@ def predict(self, loader: torch.utils.data.DataLoader, if pred_kwargs[key] is not None: pred_kwargs[key] = pred_kwargs[key].float() - 
pred_kwargs.update({'encoder_lengths': encoder_lengths}) + pred_kwargs.update({'past_observed_values': past_observed_values}) with torch.no_grad(): Y_batch_pred = self.network.predict(**pred_kwargs) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 76713e5fa..c28f6cb5e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -94,14 +94,14 @@ def __init__(self, def forward(self, encoder_output: torch.Tensor, decoder_output: torch.Tensor, - encoder_lengths: torch.LongTensor, + past_observed_values: torch.BoolTensor, decoder_length: int, static_embedding: Optional[torch.Tensor] = None): """ Args: encoder_output: the output of the last layer of encoder network decoder_output: the output of the last layer of decoder network - encoder_lengths: length of encoder network + past_observed_values: observed values in the past decoder_length: length of decoder network static_embedding: output of static variable selection network (if applible) """ @@ -120,11 +120,10 @@ def forward(self, # Attention encoder_out_length = encoder_output.shape[1] - encoder_lengths = torch.where(encoder_lengths < self.window_size, encoder_lengths, self.window_size) - encoder_lengths = torch.where(encoder_lengths > encoder_out_length, encoder_out_length, encoder_lengths) - encoder_lengths = encoder_lengths.to(self.device) + past_observed_values = past_observed_values[:, -encoder_out_length:] + past_observed_values = past_observed_values.to(self.device) - mask = self.get_attention_mask(encoder_lengths=encoder_lengths, decoder_length=decoder_length) + mask = self.get_attention_mask(past_observed_values=past_observed_values, decoder_length=decoder_length) if mask.shape[-1] < attn_input.shape[1]: # in case that none of the samples has length greater than window_size mask = torch.cat([ @@ -156,7 +155,7 @@ def device(self, device: torch.device): self.to(device) self._device = device - def get_attention_mask(self, encoder_lengths: torch.LongTensor, decoder_length: int): + def get_attention_mask(self, past_observed_values: torch.BoolTensor, decoder_length: int): """ https://github.com/jdb78/pytorch-forecasting/blob/master/pytorch_forecasting/models/ temporal_fusion_transformer/__init__.py @@ -174,16 +173,14 @@ def get_attention_mask(self, encoder_lengths: torch.LongTensor, decoder_length: # data and self decoder_mask = attend_step >= predict_step # do not attend to steps where data is padded - encoder_mask = create_mask(encoder_lengths.max(), encoder_lengths) - # this is the result of our padding strategy: we pad values at the start of the tensors - encoder_mask = torch.flip(encoder_mask, dims=[1]) + encoder_mask = ~past_observed_values.squeeze(-1) # combine masks along attended time - first encoder and then decoder mask = torch.cat( ( encoder_mask.unsqueeze(1).expand(-1, decoder_length, -1), - decoder_mask.unsqueeze(0).expand(encoder_lengths.size(0), -1, -1), + decoder_mask.unsqueeze(0).expand(encoder_mask.size(0), -1, -1), ), dim=2, ) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py index 2c4d781b2..ce49c9fa2 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py +++ 
b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py @@ -46,8 +46,10 @@ def pad_sequence_with_minimal_length(sequences: List[torch.Tensor], out_dims = (len(sequences), max_len) + trailing_dims else: out_dims = (max_len, len(sequences)) + trailing_dims - - out_tensor = sequences[0].new_full(out_dims, padding_value) + if sequences[0].dtype == torch.bool: + out_tensor = sequences[0].new_full(out_dims, False) + else: + out_tensor = sequences[0].new_full(out_dims, padding_value) for i, tensor in enumerate(sequences): length = min(tensor.size(0), seq_max_length) # use index notation to prevent duplicate references to the tensor diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index 4ebafa36e..2730e9227 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -48,9 +48,14 @@ def __init__(self, reduction: str = 'mean') -> None: super(MAPELoss, self).__init__(reduction) def forward(self, input: torch.distributions.Distribution, target_tensor: torch.Tensor) -> torch.Tensor: - target = torch.abs(target_tensor) - target[target == 0] = 1 - loss = torch.abs(input - target_tensor) / target + # https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/model/n_beats/_network.py + denominator = torch.abs(target_tensor) + diff = torch.abs(input - target_tensor) + + flag = (denominator == 0).float() + + loss = (diff * (1 - flag)) / (denominator + flag) + if self.reduction == 'mean': return loss.mean() elif self.reduction == 'sum': @@ -104,6 +109,7 @@ def forward(self, loss_q = torch.max(q * diff, (q - 1) * diff) losses_all.append(loss_q.unsqueeze(-1)) + losses_all = torch.mean(torch.concat(losses_all, dim=-1), dim=-1) if self.reduction == 'mean': diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index fcc990103..f97075d8f 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -118,7 +118,6 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, time_end = time.time() print(f'time used epoch {epoch}: {time_end - time_start}') - print(f'loss: {loss_sum / N}') if self.metrics_during_training: return loss_sum / N, self.compute_metrics(outputs_data, targets_data) @@ -149,7 +148,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor float: the loss incurred in the prediction """ past_target = data['past_targets'].float() - encoder_lengths = data['encoder_lengths'] + past_observed_values = data['past_observed_values'] past_features = data["past_features"] if past_features is not None: @@ -163,7 +162,6 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor future_targets = self.cast_targets(future_targets) - if isinstance(self.criterion, MASELoss): self.criterion.set_mase_coefficient(data['mase_coefficient'].float().to(self.device)) @@ -172,10 +170,11 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor if isinstance(self.model, NBEATSNet): past_target = past_target[:, -self.window_size:] + past_observed_values = past_observed_values[:, -self.window_size:] past_target, criterion_kwargs_past = self.data_preparation(past_target, 
past_target.to(self.device)) past_target, criterion_kwargs_future = self.data_preparation(past_target, future_targets.to(self.device)) - backcast, forecast = self.model(past_targets=past_target, encoder_lengths=encoder_lengths) + backcast, forecast = self.model(past_targets=past_target, past_observed_values=past_observed_values) loss_func_backcast = self.criterion_preparation(**criterion_kwargs_past) loss_func_forecast = self.criterion_preparation(**criterion_kwargs_future) @@ -204,7 +203,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor future_features=future_features, static_features=static_features, future_targets=future_targets, - encoder_lengths=encoder_lengths) + past_observed_values=past_observed_values) loss_func = self.criterion_preparation(**criterion_kwargs) @@ -248,7 +247,7 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, with torch.no_grad(): for step, (data, future_targets) in enumerate(test_loader): past_target = data['past_targets'].float() - encoder_lengths = data['encoder_lengths'] + past_observed_values = data['past_observed_values'] past_features = data["past_features"] if past_features is not None: @@ -276,13 +275,13 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, future_targets=future_targets, future_features=future_features, static_features=static_features, - encoder_lengths=encoder_lengths) + past_observed_values=past_observed_values) else: outputs = self.model(past_targets=past_target, past_features=past_features, future_features=future_features, static_features=static_features, - encoder_lengths=encoder_lengths) + past_observed_values=past_observed_values) # prepare future_targets = future_targets.to(self.device) From 683ccf564d6d10b090dccb258612a8002a15ccc6 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sat, 19 Mar 2022 17:49:13 +0100 Subject: [PATCH 196/347] maint --- autoPyTorch/configs/forecasting_init_cfgs.json | 6 +++--- .../forecasting_target_scaling/utils.py | 3 ++- .../forecasting_backbone/cells.py | 17 ++++++++--------- .../time_series_forecasting_data_loader.py | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index 0f8680d1d..07f1b39d7 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -169,7 +169,6 @@ "network_backbone:seq_encoder:block_1:TransformerDecoder:layer_norm_eps_output": 1e-05 }, "NBEATS-I": { - "target_scaler:__choice__": "TargetNoScaler", "data_loader:backcast": true, "data_loader:backcast_period": 2, "loss:__choice__": "RegressionLoss", @@ -198,6 +197,7 @@ "network_backbone:flat_encoder:NBEATSDecoder:dropout_i_2": 0.1 }, "NBEATS-G": { + "target_scaler:__choice__": "TargetNoScaler", "loss:__choice__": "RegressionLoss", "loss:RegressionLoss:loss_name": "mape", "network_backbone:__choice__": "flat_encoder", @@ -225,10 +225,10 @@ "network_backbone:seq_encoder:skip_connection": true, "network_backbone:seq_encoder:num_blocks": 1, "network_backbone:seq_encoder:variable_selection": true, - "network_backbone:seq_encoder:share_single_variable_networks": false, - "network_backbone:seq_encoder:skip_connection_type": "gate_add_norm", "network_backbone:seq_encoder:variable_selection_use_dropout": true, "network_backbone:seq_encoder:variable_selection_dropout_rate": 0.1, + "network_backbone:seq_encoder:share_single_variable_networks": false, + 
"network_backbone:seq_encoder:skip_connection_type": "gate_add_norm", "network_backbone:seq_encoder:grn_use_dropout": true, "network_backbone:seq_encoder:grn_dropout_rate": 0.1, "network_backbone:seq_encoder:block_1:__choice__": "RNNEncoder", diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py index fa25ae205..2fdb33e3d 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py @@ -72,7 +72,8 @@ def transform(self, if self.mode == "standard": dfredom = 1 loc = torch.sum(valid_past_targets, dim=1, keepdim=True) / valid_past_obs - scale = torch.sum(torch.square((valid_past_targets - loc) * past_observed_values), dim=1, keepdim=True) + scale = torch.sum(torch.square((valid_past_targets - loc * past_observed_values)), dim=1, keepdim=True) + scale /= valid_past_obs - dfredom scale = torch.sqrt(scale) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index c28f6cb5e..d44f04085 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -237,14 +237,12 @@ def __init__(self, future_feature_name2tensor_idx[future_name] = [idx_tracker_future, idx_tracker_future + feature_shape] idx_tracker_future += feature_shape - feature_names = time_feature_names + feature_names - known_future_features = time_feature_names + known_future_features - # if not feature_names or not(known_future_features or time_feature_names): - # Ensure that at least one feature is applied - placeholder_features = 'placeholder_features' - i = 0 - self.placeholder_features = [] - for j in range(1): + if not feature_names or not known_future_features: + # Ensure that at least one feature is applied + placeholder_features = 'placeholder_features' + i = 0 + + self.placeholder_features = [] while placeholder_features in feature_names or placeholder_features in self.placeholder_features: i += 1 placeholder_features = f'placeholder_features_{i}' @@ -258,7 +256,8 @@ def __init__(self, decoder_input_sizes[name] = self.hidden_size self.placeholder_features.append(placeholder_features) - # self.placeholder_features = [placeholder_features] + feature_names = time_feature_names + feature_names + known_future_features = time_feature_names + known_future_features self.feature_names = feature_names self.feature_names2tensor_idx = feature_names2tensor_idx diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index ca9983605..ee6bef576 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -423,7 +423,7 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, window_size_value_range = window_size.value_range window_size = HyperparameterSearchSpace(hyperparameter='window_size', value_range=(window_size_value_range[0], 
seq_length_max), - default_value=window_size_value_range[0]) + default_value=min(window_size.default_value, seq_length_max)) window_size = get_hyperparameter(window_size, UniformIntegerHyperparameter) else: window_size = get_hyperparameter(window_size, UniformIntegerHyperparameter) From 95d2ab508b86945fc1944f2e607f7467f4059725 Mon Sep 17 00:00:00 2001 From: Deng Difan Date: Tue, 22 Mar 2022 11:37:07 +0100 Subject: [PATCH 197/347] fix dataset --- autoPyTorch/datasets/time_series_dataset.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 8ed4fbb08..401f10045 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -146,7 +146,7 @@ def __getitem__(self, index: int, train: bool = True) \ if self.known_future_features: future_features = self.X.iloc[index + 1: index + self.n_prediction_steps + 1, - self.known_future_feature_index] + self.known_future_features] else: future_features = None else: @@ -163,7 +163,7 @@ def __getitem__(self, index: int, train: bool = True) \ past_features = self._cached_time_features[:index + 1] if future_features: future_features = np.hstack([self._cached_time_features[ - index + 1:index + self.n_prediction_steps +1], past_features + index + 1:index + self.n_prediction_steps + 1], past_features ]) else: future_features = self._cached_time_features[index + 1:index + self.n_prediction_steps + 1] @@ -210,7 +210,7 @@ def __getitem__(self, index: int, train: bool = True) \ def __len__(self) -> int: return self.Y.shape[0] if self.only_has_past_targets else self.Y.shape[0] - self.n_prediction_steps - def compute_time_features(self,): + def compute_time_features(self, ): if self._cached_time_features is None: periods = self.Y.shape[0] if self.is_test_set: @@ -230,7 +230,8 @@ def compute_time_features(self,): periods=self.n_prediction_steps + self.Y.shape[0], freq=self.freq) time_feature_future = np.vstack( - [transform(date_info[-self.n_prediction_steps:]).to_numpy(float) for transform in self.time_feature_transform] + [transform(date_info[-self.n_prediction_steps:]).to_numpy(float) for transform in + self.time_feature_transform] ).T self._cached_time_features = np.concatenate([self._cached_time_features, time_feature_future]) @@ -561,10 +562,10 @@ def __init__(self, self.seq_length_median = int(np.median(self.sequence_lengths_train)) self.seq_length_max = int(np.max(self.sequence_lengths_train)) - if max(n_prediction_steps, freq_value) > self.seq_length_median: - self.base_window_size = min(n_prediction_steps, freq_value, self.seq_length_median) + if freq_value > self.seq_length_median: + self.base_window_size = self.seq_length_median else: - self.base_window_size = max(n_prediction_steps, freq_value) + self.base_window_size = freq_value self.train_tensors = train_tensors @@ -624,11 +625,11 @@ def __init__(self, # TODO doing experiments to give the most proper way of defining these two values if lagged_value is None: - if self.freq in FREQUENCY_MAP: - freq = FREQUENCY_MAP[self.freq] + try: lagged_value = [0] + get_lags_for_frequency(freq) - else: + except Exception: lagged_value = list(range(8)) + self.lagged_value = lagged_value def _get_dataset_indices(self, idx: int, only_dataset_idx: bool = False) -> Union[int, Tuple[int, int]]: From baaf34f838ee7090dda667139543aa89486807af Mon Sep 17 00:00:00 2001 From: Deng Difan Date: Tue, 22 Mar 2022 11:58:10 +0100 Subject: [PATCH 198/347] 
adjust window_size to seasonality --- autoPyTorch/datasets/time_series_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 401f10045..4e23f88f3 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -368,7 +368,7 @@ def __init__(self, if isinstance(freq_value, list): if np.max(freq_value) < n_prediction_steps: - tmp_freq = n_prediction_steps + tmp_freq = max(freq_value) else: tmp_freq = min([freq_value_item for freq_value_item in freq_value if freq_value_item >= n_prediction_steps]) From 897cd749986e6b3e42c1bae490029cee31836d78 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 23 Mar 2022 16:07:37 +0100 Subject: [PATCH 199/347] maint scaling --- .../setup/network/forecasting_architecture.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 6d484b68c..ae9f967a7 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -365,7 +365,10 @@ def pre_processing(self, past_targets[:, -self.window_size:], past_observed_values[:, -self.window_size:] ) - past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) + past_targets[:, :-self.window_size] = torch.where( + past_observed_values[:, :-self.window_size], + self.scale_value(past_targets[:, :-self.window_size], loc, scale), + past_targets[:, :-self.window_size]) x_past, self.cached_lag_mask_encoder = get_lagged_subsequences(past_targets, self.window_size, self.encoder_lagged_value, @@ -567,7 +570,7 @@ def forward(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - past_observed_values: Optional[torch.Tensor] = None, + past_observed_values: Optional[torch.BoolTensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): x_past, x_future, x_static, loc, scale, static_context_initial_hidden = self.pre_processing( past_targets=past_targets, @@ -851,7 +854,11 @@ def forward(self, past_observed_values[:, -self.window_size:] ) - past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) + past_targets[:, :-self.window_size] = torch.where( + past_observed_values[:, :-self.window_size], + self.scale_value(past_targets[:, :-self.window_size], loc, scale), + past_targets[:, :-self.window_size]) + future_targets = self.scale_value(future_targets, loc, scale) @@ -909,7 +916,10 @@ def forward(self, past_observed_values[:, -self.window_size:], ) - past_targets[:, :-self.window_size] = self.scale_value(past_targets[:, :-self.window_size], loc, scale) + past_targets[:, :-self.window_size] = torch.where( + past_observed_values[:, :-self.window_size], + self.scale_value(past_targets[:, :-self.window_size], loc, scale), + past_targets[:, :-self.window_size]) x_past, self.cached_lag_mask_encoder_test = get_lagged_subsequences(past_targets, self.window_size, From a09ddbb30f61aaaa444485bdfd6cc0a0cd67f685 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 23 Mar 2022 21:09:24 +0100 Subject: [PATCH 200/347] fix uncorrect Seq2Seq scaling --- .../configs/forecasting_init_cfgs.json | 2 ++ 
.../setup/network/forecasting_architecture.py | 21 ++++++++++------ .../forecasting_backbone/cells.py | 24 ++++++++++-------- .../forecasting_backbone/components_util.py | 15 +++++++---- .../forecasting_decoder/MLPDecoder.py | 2 +- .../forecasting_decoder/NBEATSDecoder.py | 2 +- .../forecasting_decoder/RNNDecoder.py | 6 +++-- .../forecasting_decoder/TransformerDecoder.py | 25 +++++++++++++------ .../forecasting_decoder/components.py | 2 +- .../seq_encoder/TransformerEncoder.py | 5 ++++ 10 files changed, 69 insertions(+), 35 deletions(-) diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index 07f1b39d7..adb7c66ef 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -145,6 +145,7 @@ "network_backbone:seq_encoder:block_1:__choice__": "TransformerEncoder", "network_backbone:seq_encoder:block_1:TransformerEncoder:d_model_log": 5, "network_backbone:seq_encoder:block_1:TransformerEncoder:activation": "gelu", + "network_backbone:seq_encoder:block_1:TransformerEncoder:norm_first": true, "network_backbone:seq_encoder:block_1:TransformerEncoder:num_layers": 1, "network_backbone:seq_encoder:block_1:TransformerEncoder:decoder_type": "TransformerDecoder", "network_backbone:seq_encoder:block_1:TransformerEncoder:use_dropout": true, @@ -158,6 +159,7 @@ "network_backbone:seq_encoder:block_1:TransformerEncoder:layer_norm_eps_output": 1e-05, "network_backbone:seq_encoder:block_1:TransformerDecoder:activation": "gelu", "network_backbone:seq_encoder:block_1:TransformerDecoder:num_layers": 1, + "network_backbone:seq_encoder:block_1:TransformerDecoder:norm_first": true, "network_backbone:seq_encoder:block_1:TransformerDecoder:use_dropout": true, "network_backbone:seq_encoder:block_1:TransformerDecoder:use_positional_decoder": true, "network_backbone:seq_encoder:block_1:TransformerDecoder:dropout_positional_decoder": 0.1, diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index ae9f967a7..21bf29412 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -428,7 +428,7 @@ def pre_processing(self, **variable_selector_kwargs ) - return x_past, x_future, x_static, loc, scale, static_context_initial_hidden + return x_past, x_future, x_static, loc, scale, static_context_initial_hidden, past_targets else: if past_features is not None: past_features = past_features[:, -self.window_size:] @@ -440,7 +440,7 @@ def pre_processing(self, if static_features is not None: static_features = static_features.to(self.device) x_past = self.embedding(x_past) # TODO embedding for future features! 
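
[Editorial note, not part of the patch] The hunks in this patch change pre_processing to additionally return the already-scaled past targets, which the Seq2Seq forward pass further down uses to scale future_targets with the same loc/scale and to build lagged decoder inputs from consistently scaled values. The observed-value masking applied during scaling was introduced in the previous patch ("maint scaling"); a minimal self-contained sketch of that idea follows. The tensor shapes, the mask and the loc/scale computation here are illustrative assumptions, not the repository's TargetScaler.

    import torch

    # Illustrative shapes: [batch, time, 1] targets plus a boolean mask of observed steps.
    past_targets = torch.randn(4, 10, 1)
    observed = torch.rand(4, 10, 1) > 0.2            # True where a value was actually observed
    loc = past_targets.mean(dim=1, keepdim=True)     # stand-in for the fitted location
    scale = past_targets.std(dim=1, keepdim=True)    # stand-in for the fitted scale

    # Only observed positions are rescaled; padded or missing steps keep their raw values.
    scaled = torch.where(observed, (past_targets - loc) / scale, past_targets)
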
- return x_past, future_features, static_features, loc, scale, None + return x_past, future_features, static_features, loc, scale, None, past_targets def forward(self, past_targets: torch.Tensor, @@ -451,7 +451,7 @@ def forward(self, past_observed_values: Optional[torch.BoolTensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): - x_past, x_future, x_static, loc, scale, static_context_initial_hidden = self.pre_processing( + x_past, x_future, x_static, loc, scale, static_context_initial_hidden, _ = self.pre_processing( past_targets=past_targets, past_observed_values=past_observed_values, past_features=past_features, @@ -464,7 +464,9 @@ def forward(self, encoder_additional = [static_context_initial_hidden] encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) - decoder_output = self.decoder(x_future=x_future, encoder_output=encoder2decoder) + + decoder_output = self.decoder(x_future=x_future, encoder_output=encoder2decoder, + pos_idx=(x_past.shape[1], x_past.shape[1] + self.n_prediction_steps)) if self.has_temporal_fusion: decoder_output = self.temporal_fusion(encoder_output=encoder_output, @@ -476,7 +478,6 @@ def forward(self, output = self.head(decoder_output) - return self.rescale_output(output, loc, scale, self.device) def pred_from_net_output(self, net_output): @@ -572,7 +573,7 @@ def forward(self, static_features: Optional[torch.Tensor] = None, past_observed_values: Optional[torch.BoolTensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): - x_past, x_future, x_static, loc, scale, static_context_initial_hidden = self.pre_processing( + x_past, x_future, x_static, loc, scale, static_context_initial_hidden, past_targets = self.pre_processing( past_targets=past_targets, past_observed_values=past_observed_values, past_features=past_features, @@ -586,6 +587,7 @@ def forward(self, encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) if self.training: + future_targets = self.scale_value(future_targets, loc, scale) # we do one step ahead forecasting if self.decoder_lagged_input: future_targets = torch.cat([past_targets, future_targets[:, :-1, :]], dim=1) @@ -606,7 +608,8 @@ def forward(self, encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) - decoder_output = self.decoder(x_future=x_future, encoder_output=encoder2decoder) + decoder_output = self.decoder(x_future=x_future, encoder_output=encoder2decoder, + pos_idx=(x_past.shape[1], x_past.shape[1] + self.n_prediction_steps)) if self.has_temporal_fusion: decoder_output = self.temporal_fusion(encoder_output=encoder_output, @@ -652,6 +655,7 @@ def forward(self, decoder_output = self.decoder(x_future, encoder_output=encoder2decoder, + pos_idx=(x_past.shape[1] + idx_pred, x_past.shape[1] + idx_pred + 1), cache_intermediate_state=True, incremental_update=idx_pred > 0) @@ -741,6 +745,7 @@ def forward(self, decoder_output = self.decoder(x_future, encoder_output=encoder2decoder, + pos_idx=(x_past.shape[1]+idx_pred, x_past.shape[1] + idx_pred+1), cache_intermediate_state=True, incremental_update=idx_pred > 0) if self.has_temporal_fusion: @@ -753,7 +758,7 @@ def forward(self, past_observed_values=past_observed_values, decoder_length=idx_pred + 1, static_embedding=x_static, - )[:, -1:] + ) net_output = self.head(decoder_output) samples = self.pred_from_net_output(net_output).cpu() diff --git 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index d44f04085..2355abba4 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -590,6 +590,7 @@ def __init__(self, def forward(self, x_future: Optional[torch.Tensor], encoder_output: List[torch.Tensor], + pos_idx: Optional[Tuple[int]] = None, cache_intermediate_state: bool = False, incremental_update: bool = False ) -> torch.Tensor: @@ -599,25 +600,28 @@ def forward(self, if self.decoder_has_hidden_states[i]: if incremental_update: hx = self.cached_intermediate_state[i] - fx, hx = decoder_i(x_future=x, encoder_output=hx) + fx, hx = decoder_i(x_future=x, encoder_output=hx, pos_idx=pos_idx) else: - fx, hx = decoder_i(x_future=x, encoder_output=encoder_output[i]) + fx, hx = decoder_i(x_future=x, encoder_output=encoder_output[i], pos_idx=pos_idx) else: if incremental_update: - x_all = torch.cat([self.cached_intermediate_state[i], x], dim=1) - fx = decoder_i(x_all, encoder_output=encoder_output[i])[:, -1:] + # in this case, we only have Transformer, thus x_all needs to be None value! + # TODO make this argument clearer! + #x_all = torch.cat([self.cached_intermediate_state[i], x], dim=1) + fx = decoder_i(x, encoder_output=encoder_output[i], pos_idx=pos_idx) else: - fx = decoder_i(x, encoder_output=encoder_output[i]) + fx = decoder_i(x, encoder_output=encoder_output[i], pos_idx=pos_idx) skip_id = f'skip_connection_{block_id}' if self.skip_connection and skip_id in self.decoder and x is not None: fx = self.decoder[skip_id](fx, x) if cache_intermediate_state: if self.decoder_has_hidden_states[i]: self.cached_intermediate_state[i] = hx - else: - if incremental_update: - self.cached_intermediate_state[i] = x_all - else: - self.cached_intermediate_state[i] = x + #TODO consider if there are other case that could make use of cached intermediate states + # else: + # if incremental_update: + # self.cached_intermediate_state[i] = x_all + # else: + # self.cached_intermediate_state[i] = x x = fx return x diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py index 7ee9da187..b6597180e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py @@ -1,7 +1,7 @@ import math from sklearn.base import BaseEstimator -from typing import Any, Dict, NamedTuple +from typing import Any, Dict, NamedTuple, Optional, Tuple import torch from torch import nn @@ -71,13 +71,14 @@ def build_transformer_layers(d_model: int, config: Dict[str, Any], layer_type='e dropout = config.get('dropout', 0.0) activation = config['activation'] layer_norm_eps = config['layer_norm_eps'] + norm_first = config['norm_first'] if layer_type == 'encoder': return nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, - dropout=dropout, activation=activation, + dropout=dropout, activation=activation, norm_first=norm_first, layer_norm_eps=layer_norm_eps, batch_first=True) elif layer_type == 'decoder': return nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, - dropout=dropout, 
activation=activation, + dropout=dropout, activation=activation, norm_first=norm_first, layer_norm_eps=layer_norm_eps, batch_first=True) else: raise ValueError('layer_type must be encoder or decoder!') @@ -115,15 +116,19 @@ def __init__(self, d_model, dropout=0.1, max_len=5000): pe = pe.unsqueeze(0) self.register_buffer('pe', pe) - def forward(self, x): + def forward(self, x, pos_idx:Optional[Tuple[int]] = None): r"""Inputs of forward function Args: x: the sequence fed to the positional encoder model (required). Shape: x: [batch size, sequence length embed dim] + pos_idx: positional index, indicating the index of the current output: [batch size, sequence length, embed dim] Examples: >>> output = pos_encoder(x) """ - x = x + self.pe[:, :x.size(1), :] + if pos_idx is None: + x = x + self.pe[:, :x.size(1), :] + else: + x = x + self.pe[:, pos_idx[0]: pos_idx[1], :] return self.dropout(x) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index 3db9a8815..1f966f179 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -30,7 +30,7 @@ def __init__(self, self.local_layers = local_layers self.auto_regressive = auto_regressive - def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor): + def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor, pos_idx: Optional[Tuple[int]] = None): if x_future is None or self.auto_regressive: # for auto-regressive model, x_future is fed to the encoders x = self.global_layers(encoder_output) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 913b6a832..a783d4756 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -74,7 +74,7 @@ def _add_layer(self, layers: List[nn.Module], in_features: int) -> None: if self.use_dropout: layers.append(nn.Dropout(self.dropout_rate)) - def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor): + def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor, pos_idx: Optional[Tuple[int]] = None): if self.backcast_head is None and self.forecast_head is None: # used to compute head dimensions return self.backbone(encoder_output) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index 6d6fba994..4106a3c7a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -46,8 +46,10 @@ def __init__(self, bidirectional=False, batch_first=True) - def forward(self, x_future: torch.Tensor, - encoder_output: Optional[Tuple[torch.Tensor, torch.Tensor]] 
= None) -> Tuple[torch.Tensor, ...]: + def forward(self, + x_future: torch.Tensor, + encoder_output: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + pos_idx: Optional[Tuple[int]] = None) -> Tuple[torch.Tensor, ...]: if x_future.ndim == 2: x_future = x_future.unsqueeze(1) outputs, hidden_state, = self.lstm(x_future, encoder_output) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index 7e52a0f7d..5a3d633e0 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -16,7 +16,7 @@ from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.utils.common import add_hyperparameter -from autoPyTorch.pipeline.components.setup.network_backbone.\ +from autoPyTorch.pipeline.components.setup.network_backbone. \ forecasting_backbone.forecasting_decoder.base_forecasting_decoder import ( BaseForecastingDecoder, DecoderProperties @@ -40,16 +40,18 @@ def __init__(self, use_layer_norm_output: bool, dropout_pd: float = 0.0, layer_norm_eps_output: Optional[float] = None, - n_prediction_steps:int = 1, + n_prediction_steps: int = 1, lagged_value: Optional[Union[List, np.ndarray]] = None): super().__init__() self.lagged_value = lagged_value in_features = in_features - self.input_layer = [nn.Linear(in_features, d_model, bias=False)] + # self.input_layer = [nn.Linear(in_features, d_model, bias=False)] + self.input_layer = nn.Linear(in_features, d_model, bias=False) + + self.use_positional_decoder = use_positional_decoder if use_positional_decoder: - self.input_layer.append(PositionalEncoding(d_model, dropout_pd)) - self.input_layer = nn.Sequential(*self.input_layer) + self.pos_encoding = PositionalEncoding(d_model, dropout_pd) self.use_layer_norm_output = use_layer_norm_output @@ -62,8 +64,10 @@ def __init__(self, norm=norm) self.tgt_mask = nn.Transformer.generate_square_subsequent_mask(n_prediction_steps) - def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor): + def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor, pos_idx: Optional[Tuple[int]] = None): output = self.input_layer(x_future) + if self.use_positional_decoder: + output = self.pos_encoding(output, pos_idx) if self.training: output = self.transformer_decoder_layers(output, encoder_output, tgt_mask=self.tgt_mask.to(encoder_output.device)) @@ -110,7 +114,9 @@ def _required_fit_requirements(self) -> List[FitRequirement]: @staticmethod def decoder_properties(): - return DecoderProperties(recurrent=True, lagged_input=True, mask_on_future_target=True) + return DecoderProperties(recurrent=True, + lagged_input=True, + mask_on_future_target=True) def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.transformer_encoder_kwargs = X['transformer_encoder_kwargs'] @@ -148,6 +154,10 @@ def get_hyperparameter_search_space( HyperparameterSearchSpace(hyperparameter='d_feed_forward_log', value_range=(6, 12), default_value=7), + norm_first: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="norm_first", + value_range=(True, False), + default_value=True), layer_norm_eps: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='layer_norm_eps', 
value_range=(1e-7, 1e-3), @@ -185,6 +195,7 @@ def get_hyperparameter_search_space( cs = CS.ConfigurationSpace() add_hyperparameter(cs, activation, CategoricalHyperparameter) + add_hyperparameter(cs, norm_first, CategoricalHyperparameter) min_transformer_layers, max_transformer_layers = num_layers.value_range diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py index 08f5a4505..fad87e403 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py @@ -21,7 +21,7 @@ class DecoderBlockInfo(NamedTuple): class DecoderNetwork(nn.Module): - def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor): + def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor, pos_idx: Optional[Tuple[int]] = None): """ Base forecasting Decoder Network, its output needs to be a 3-d Tensor: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index 644fbd21c..1a7ecd296 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -151,6 +151,10 @@ def get_hyperparameter_search_space( HyperparameterSearchSpace(hyperparameter='d_feed_forward_log', value_range=(6, 12), default_value=7), + norm_first: HyperparameterSearchSpace = + HyperparameterSearchSpace(hyperparameter="norm_first", + value_range=(True, False), + default_value=True), layer_norm_eps: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='layer_norm_eps', value_range=(1e-7, 1e-3), @@ -193,6 +197,7 @@ def get_hyperparameter_search_space( add_hyperparameter(cs, activation, CategoricalHyperparameter) add_hyperparameter(cs, d_model_log, UniformIntegerHyperparameter) + add_hyperparameter(cs, norm_first, CategoricalHyperparameter) min_transformer_layers, max_transformer_layers = num_layers.value_range From c1dda0afb3501d1d042983c2490c7ec07f2c3759 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 25 Mar 2022 11:53:48 +0100 Subject: [PATCH 201/347] fix sampling for seq2seq --- .../setup/network/forecasting_architecture.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 21bf29412..804aab32a 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -688,7 +688,6 @@ def forward(self, else: # we follow the DeepAR implementation: - all_samples = [] batch_size = past_targets.shape[0] encoder2decoder = self.repeat_intermediate_values( @@ -696,12 +695,13 @@ def forward(self, is_hidden_states=self.encoder.encoder_has_hidden_states, repeats=self.num_samples) - intermediate_values = self.repeat_intermediate_values([encoder_output, 
past_observed_values], - is_hidden_states=[False, False], - repeats=self.num_samples) + if self.has_temporal_fusion: + intermediate_values = self.repeat_intermediate_values([encoder_output, past_observed_values], + is_hidden_states=[False, False], + repeats=self.num_samples) - encoder_output = intermediate_values[0] - past_observed_values = intermediate_values[1] + encoder_output = intermediate_values[0] + past_observed_values = intermediate_values[1] if self.decoder_lagged_input: max_lag_seq_length = max(self.decoder_lagged_value) + 1 @@ -761,14 +761,13 @@ def forward(self, ) net_output = self.head(decoder_output) - samples = self.pred_from_net_output(net_output).cpu() + samples = net_output.sample().cpu() repeated_predicted_target = torch.cat([repeated_predicted_target, samples], dim=1) - all_samples.append(samples) - all_predictions = torch.cat(all_samples, dim=1).unflatten(0, (batch_size, self.num_samples)) + all_predictions = repeated_predicted_target[:, 1:].unflatten(0, (batch_size, self.num_samples)) if self.aggregation == 'mean': return self.rescale_output(torch.mean(all_predictions, dim=1), loc, scale) From 49ee49cb7e288fb840e1a7e6ba3c96156bfeb7eb Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 25 Mar 2022 17:00:14 +0100 Subject: [PATCH 202/347] maint --- autoPyTorch/datasets/time_series_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 4e23f88f3..429f84e26 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -271,7 +271,7 @@ def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": X = self.X[:index + 1 + self.n_prediction_steps] else: X = None - if self._cached_time_features: + if self._cached_time_features is None: cached_time_feautres = None else: cached_time_feautres = self._cached_time_features[:index + 1 + self.n_prediction_steps] From dc97df2a8e8a3d7e766af3a93dca245d9373702b Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 25 Mar 2022 19:21:47 +0100 Subject: [PATCH 203/347] fix scaling in NBEATS --- .../components/setup/network/forecasting_architecture.py | 4 +++- .../components/training/data_loader/time_series_util.py | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 804aab32a..27a883a79 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -1096,8 +1096,10 @@ def forward(self, decoder_observed_values: Optional[torch.Tensor] = None, ): if self.window_size < past_targets.shape[1]: past_targets = past_targets[:, -self.window_size:] - past_observed_values = past_observed_values[:, -self.window_size] + past_observed_values = past_observed_values[:, -self.window_size:] + past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_values) + past_targets = past_targets.to(self.device) batch_size = past_targets.shape[0] diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py index ce49c9fa2..6512de1e1 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py @@ -113,10 
+113,10 @@ def __call__(self, batch, sample_interval=1, seq_minimal_length=1, padding_value # only past targets and features needs to be transformed return {key: self([d[key] for d in batch]) if "past" not in key else self([d[key] for d in batch], - self.sample_interval, - self.window_size, - self.target_padding_value) for key - in elem} + self.sample_interval, + self.window_size, + self.target_padding_value) for key + in elem} elif elem is None: return None From 399572cc9bf36f8c7d3b41b0c63602ce3ae4a07f Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 28 Mar 2022 17:42:29 +0200 Subject: [PATCH 204/347] move time feature computation to dataset --- autoPyTorch/datasets/time_series_dataset.py | 62 +++++++++++++++++---- 1 file changed, 52 insertions(+), 10 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 429f84e26..fc31d5604 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -42,7 +42,7 @@ from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ TimeSeriesTransformer from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.constants_forecasting import SEASONALITY_MAP +from autoPyTorch.constants_forecasting import SEASONALITY_MAP, MAX_WINDOW_SIZE_BASE from autoPyTorch.pipeline.components.training.metrics.metrics import compute_mase_coefficient TIME_SERIES_FORECASTING_INPUT = Tuple[np.ndarray, np.ndarray] # currently only numpy arrays are supported @@ -272,9 +272,9 @@ def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": else: X = None if self._cached_time_features is None: - cached_time_feautres = None + cached_time_features = None else: - cached_time_feautres = self._cached_time_features[:index + 1 + self.n_prediction_steps] + cached_time_features = self._cached_time_features[:index + 1 + self.n_prediction_steps] return TimeSeriesSequence(X=X, Y=self.Y[:index + 1], @@ -289,7 +289,7 @@ def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": sp=self.sp, only_has_past_targets=True, compute_mase_coefficient_value=False, - time_features=cached_time_feautres) + time_features=cached_time_features) def get_test_target(self, test_idx: int): if self.only_has_past_targets: @@ -367,11 +367,12 @@ def __init__(self, freq_value = freq if isinstance(freq_value, list): - if np.max(freq_value) < n_prediction_steps: + min_base_size = min(n_prediction_steps, MAX_WINDOW_SIZE_BASE) + if np.max(freq_value) < min_base_size: tmp_freq = max(freq_value) else: tmp_freq = min([freq_value_item for - freq_value_item in freq_value if freq_value_item >= n_prediction_steps]) + freq_value_item in freq_value if freq_value_item > min_base_size]) freq_value = tmp_freq seasonality = SEASONALITY_MAP.get(freq, 1) @@ -417,6 +418,7 @@ def __init__(self, self.start_times_train = self.validator.start_times_train self.start_times_test = self.validator.start_times_test + self._transform_time_feature = False if not time_feature_transform: time_feature_transform = time_features_from_frequency_str(self.freq) @@ -431,10 +433,14 @@ def __init__(self, # Time features are lazily generated, we do not count them as either numerical_columns or categorical columns X, Y, sequence_lengths = self.validator.transform(X, Y) + time_features_train = self.compute_time_features(self.start_times_train, sequence_lengths) + if X_test is not None: X_test, Y_test, self.sequence_lengths_tests = self.validator.transform(X_test, Y_test) + 
time_features_test = self.compute_time_features(self.start_times_test, self.sequence_lengths_tests) else: self.sequence_lengths_tests = None + time_features_test = None self.shuffle = shuffle self.random_state = np.random.RandomState(seed=seed) @@ -549,6 +555,8 @@ def __init__(self, X_test=X_test, Y_test=Y_test, start_times_train=self.start_times_train, start_times_test=self.start_times_test, + time_features_train=time_features_train, + time_features_test=time_features_test, normalize_y=normalize_y, **sequences_kwargs) @@ -632,6 +640,28 @@ def __init__(self, self.lagged_value = lagged_value + def compute_time_features(self, + start_times: List[pd.DatetimeIndex], + seq_lengths: List[int]) -> Dict[pd.DatetimeIndex, np.ndarray]: + """ + compute the max series length for each start_time and compute their corresponding time_features. As lots of + series in a dataset share the same start time, we could only compute the features for longest possible series + and reuse them + """ + series_lengths_max = {} + for start_t, seq_l in zip(start_times, seq_lengths): + if start_t not in series_lengths_max or seq_l > series_lengths_max[start_t]: + series_lengths_max[start_t] = seq_l + series_time_features = {} + for start_t, max_l in series_lengths_max.items(): + date_info = pd.date_range(start=start_t, + periods=max_l, + freq=self.freq) + series_time_features[start_t] = np.vstack( + [transform(date_info).to_numpy(float) for transform in self.time_feature_transform] + ).T + return series_time_features + def _get_dataset_indices(self, idx: int, only_dataset_idx: bool = False) -> Union[int, Tuple[int, int]]: if idx < 0: if -idx > len(self): @@ -677,9 +707,11 @@ def make_sequences_datasets(self, X: np.ndarray, Y: np.ndarray, start_times_train: List[pd.DatetimeIndex], + time_features_train:Optional[Dict[pd.Timestamp, np.ndarray]]=None, X_test: Optional[np.ndarray] = None, Y_test: Optional[np.ndarray] = None, start_times_test: Optional[List[pd.DatetimeIndex]] = None, + time_features_test:Optional[Dict[pd.Timestamp, np.ndarray]]=None, normalize_y: bool = True, **sequences_kwargs: Optional[Dict]) -> \ Tuple[List[TimeSeriesSequence], Tuple[List, List], Tuple[List, List]]: @@ -691,6 +723,10 @@ def make_sequences_datasets(self, number of features Y: np.ndarray (N_all, N_target) flattened train target array with size N_all (the sum of all the series sequences) and number of targets + start_times_train: List[pd.DatetimeIndex] + start time of each training series + time_features_train: Dict[pd.Timestamp, np.ndarray]: + time features for each possible start training times sequence_lengths_train: List[int] a list containing all the sequences length in the training set X_test: Optional[np.ndarray (N_all_test, N_feature)] @@ -698,6 +734,10 @@ def make_sequences_datasets(self, number of features Y_test: np.ndarray (N_all_test, N_target) flattened test target array with size N_all (the sum of all the series sequences) and number of targets + start_times_test: Optional[List[pd.DatetimeIndex]] + start time for each test series + time_features_test:Optional[Dict[pd.Timestamp, np.ndarray]] + time features for each possible start test times. 
sequence_lengths_test: Optional[List[int]] a list containing all the sequences length in the test set normalize_y: bool @@ -753,14 +793,16 @@ def make_sequences_datasets(self, if X_seq.size == 0: X_seq = None X_test_seq = None + start_time_train = start_times_train[seq_idx] sequence = TimeSeriesSequence( X=X_seq, Y=Y_seq, - start_time_train=start_times_train[seq_idx], + start_time_train=start_time_train, X_test=X_test_seq, Y_test=Y_test_seq, start_time_test=None if start_times_test is None else start_times_test[seq_idx], + time_features=time_features_train[start_time_train][:len(Y_seq)], **sequences_kwargs) sequence_datasets.append(sequence) idx_start_train = idx_end_train @@ -784,12 +826,12 @@ def make_sequences_datasets(self, def replace_data(self, X_train: BaseDatasetInputType, X_test: Optional[BaseDatasetInputType]) -> 'BaseDataset': super(TimeSeriesForecastingDataset, self).replace_data(X_train=X_train, X_test=X_test) - self.update_tensros_seqs(X_train, self.sequence_lengths_train + self.n_prediction_steps, is_train=True) + self.update_tensors_seqs(X_train, self.sequence_lengths_train + self.n_prediction_steps, is_train=True) if X_test is not None: - self.update_tensros_seqs(X_test, self.sequence_lengths_tests, is_train=False) + self.update_tensors_seqs(X_test, self.sequence_lengths_tests, is_train=False) return self - def update_tensros_seqs(self, X: np.ndarray, sequence_lengths, is_train=True): + def update_tensors_seqs(self, X: np.ndarray, sequence_lengths, is_train=True): if X.size == 0: return idx_start = 0 From 7154308f6b6649f15f430537b515f993de856178 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 30 Mar 2022 20:13:20 +0200 Subject: [PATCH 205/347] maint --- autoPyTorch/datasets/time_series_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index fc31d5604..c486a96e6 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -372,7 +372,7 @@ def __init__(self, tmp_freq = max(freq_value) else: tmp_freq = min([freq_value_item for - freq_value_item in freq_value if freq_value_item > min_base_size]) + freq_value_item in freq_value if freq_value_item >= min_base_size]) freq_value = tmp_freq seasonality = SEASONALITY_MAP.get(freq, 1) From 1ba08fe1274afa438f94360343097132990ce9d5 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 31 Mar 2022 15:50:18 +0200 Subject: [PATCH 206/347] fix feature computation --- autoPyTorch/datasets/time_series_dataset.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index c486a96e6..51e3d7b32 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd +from pandas._libs.tslibs import to_offset from scipy.sparse import issparse import torch @@ -32,7 +33,7 @@ from gluonts.time_feature.lag import get_lags_for_frequency from gluonts.time_feature import ( - Constant, + Constant as ConstantTransform, TimeFeature, time_features_from_frequency_str, ) @@ -425,7 +426,7 @@ def __init__(self, if not time_feature_transform: # If time features are empty (as for yearly data), we add a # constant feature of 0 - time_feature_transform = [Constant()] + time_feature_transform = [ConstantTransform()] self.time_feature_transform = time_feature_transform self.time_feature_names = 
tuple([f'time_feature_{t.__class__.__name__}' for t in self.time_feature_transform]) @@ -657,8 +658,12 @@ def compute_time_features(self, date_info = pd.date_range(start=start_t, periods=max_l, freq=self.freq) + + series_time_features[start_t] = np.vstack( - [transform(date_info).to_numpy(float) for transform in self.time_feature_transform] + [transform(date_info).to_numpy(float) + if not isinstance(transform, ConstantTransform) else transform(date_info) + for transform in self.time_feature_transform] ).T return series_time_features From 04a69d8f660138a876c60f65cbd5e5daa77f1459 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 31 Mar 2022 18:18:37 +0200 Subject: [PATCH 207/347] maint --- autoPyTorch/datasets/time_series_dataset.py | 43 ++++++++++++--------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 51e3d7b32..ab9c38254 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -9,6 +9,7 @@ import pandas as pd from pandas._libs.tslibs import to_offset +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from scipy.sparse import issparse import torch @@ -227,13 +228,19 @@ def compute_time_features(self, ): else: if self.is_test_set: if self._cached_time_features.shape[0] == self.Y.shape[0]: - date_info = pd.date_range(start=self.start_time_train, - periods=self.n_prediction_steps + self.Y.shape[0], - freq=self.freq) - time_feature_future = np.vstack( - [transform(date_info[-self.n_prediction_steps:]).to_numpy(float) for transform in - self.time_feature_transform] - ).T + try: + date_info = pd.date_range(start=self.start_time_train, + periods=self.n_prediction_steps + self.Y.shape[0], + freq=self.freq) + time_feature_future = np.vstack( + [transform(date_info).to_numpy(float) + if not isinstance(transform, ConstantTransform) else transform(date_info) + for transform in self.time_feature_transform] + ).T + except OutOfBoundsDatetime: + # This is only a temporal solution TODO consider how to solve this! 
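
[Editorial note, not part of the patch] Both compute_time_features variants added here follow the same recipe: build a calendar index with pd.date_range starting at the series' start time and stack the gluonts time-feature transforms column-wise, falling back to a zero matrix when pandas raises OutOfBoundsDatetime. A small sketch of the happy path; the start date, frequency and length are made up, on some gluonts versions the transforms expect a pd.PeriodIndex rather than a DatetimeIndex, and whether a transform returns a pandas object (hence the .to_numpy(float) calls in the diff) or a plain ndarray depends on the pinned gluonts version, so np.asarray is used here instead.

    import numpy as np
    import pandas as pd
    from gluonts.time_feature import time_features_from_frequency_str

    transforms = time_features_from_frequency_str("1D")   # e.g. day-of-week, day-of-month, ...
    date_info = pd.date_range(start="2020-01-01", periods=30, freq="1D")
    features = np.vstack([np.asarray(t(date_info), dtype=float) for t in transforms]).T
    # features has shape (30, len(transforms)) and can be cached per start time.
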
+ time_feature_future = np.zeros([self.n_prediction_steps, len(self.time_feature_transform)]) + self._cached_time_features = np.concatenate([self._cached_time_features, time_feature_future]) def update_transform(self, transform: Optional[torchvision.transforms.Compose], @@ -419,7 +426,6 @@ def __init__(self, self.start_times_train = self.validator.start_times_train self.start_times_test = self.validator.start_times_test - self._transform_time_feature = False if not time_feature_transform: time_feature_transform = time_features_from_frequency_str(self.freq) @@ -655,16 +661,17 @@ def compute_time_features(self, series_lengths_max[start_t] = seq_l series_time_features = {} for start_t, max_l in series_lengths_max.items(): - date_info = pd.date_range(start=start_t, - periods=max_l, - freq=self.freq) - - - series_time_features[start_t] = np.vstack( - [transform(date_info).to_numpy(float) - if not isinstance(transform, ConstantTransform) else transform(date_info) - for transform in self.time_feature_transform] - ).T + try: + date_info = pd.date_range(start=start_t, + periods=max_l, + freq=self.freq) + series_time_features[start_t] = np.vstack( + [transform(date_info).to_numpy(float) + if not isinstance(transform, ConstantTransform) else transform(date_info) + for transform in self.time_feature_transform] + ).T + except OutOfBoundsDatetime as e: + series_time_features[start_t] = np.zeros([max_l, len(self.time_feature_transform)]) return series_time_features def _get_dataset_indices(self, idx: int, only_dataset_idx: bool = False) -> Union[int, Tuple[int, int]]: From 471db34a2492a4edc7573a48cd35a1a3cd294357 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 13 Apr 2022 16:10:08 +0200 Subject: [PATCH 208/347] multi-variant feature validator --- .../data/time_series_feature_validator.py | 118 ++++------ .../data/time_series_forecasting_validator.py | 208 ++++++++++-------- 2 files changed, 161 insertions(+), 165 deletions(-) diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index 7a7422ef6..d2990eb93 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -1,104 +1,66 @@ import logging -from typing import Optional, Union - +from typing import Optional, Union, Tuple, Sequence +import pandas as pd import numpy as np import sklearn.utils from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError - +from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator from autoPyTorch.utils.logging_ import PicklableClientLogger -class TimeSeriesFeatureValidator(BaseEstimator): - def __init__(self, - logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None) -> None: - self.logger = logger - self._is_fitted = False +class TimeSeriesFeatureValidator(TabularFeatureValidator): + def __init__( + self, + logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None, + ): + super().__init__(logger) + self.data_contain_ser_idx = False def fit(self, - X_train: np.ndarray, - X_test: Optional[np.ndarray] = None) -> BaseEstimator: + X_train: Union[pd.DataFrame, np.ndarray], + X_test: Union[pd.DataFrame, np.ndarray] = None, + series_idx: Optional[Union[Tuple[Union[str, int]]]] = None) -> BaseEstimator: """ Arguments: - X_train (np.ndarray): + X_train (Union[pd.DataFrame, np.ndarray]): A set of data that are going to be validated (type and dimensionality checks) and used for fitting - X_test (Optional[np.ndarray]): + X_test 
(Union[pd.DataFrame, np.ndarray]): An optional set of data that is going to be validated + series_idx (Optional[Union[str, int]]): + Series Index, to identify each individual series + Returns: self: The fitted base estimator """ - - if not isinstance(X_train, np.ndarray): - raise ValueError(f"Time series train data must be given as a numpy array, but got {type(X_train)}") - - if X_train.ndim != 3: - raise ValueError(f"Invalid number of dimensions for time series train data, " - f"expected 3 but got {X_train.ndim}. " - f"Time series data has to be of shape [B, T, F] where B is the " - f"batch dimension, T is the time dimension and F are the number of features.") - - _ = sklearn.utils.check_array( - X_train, - force_all_finite=True, - ensure_2d=False, - allow_nd=True, - accept_sparse=False, - accept_large_sparse=False - ) - - if X_test is not None: - if not isinstance(X_test, np.ndarray): - raise ValueError(f"Time series test data must be given as a numpy array, but got {type(X_test)}") - - if not X_test.ndim == 3: - raise ValueError(f"Invalid number of dimensions for time series test data, " - f"expected 3 but got {X_train.ndim}. " - f"Time series data has to be of shape [B, T, F] where B is the " - f"batch dimension, T is the time dimension and F are the number of features") - - if X_train.shape[1:] != X_test.shape[1:]: - raise ValueError(f"Time series train and test data are expected to have the same shape except for " - f"the batch dimension, but got {X_train.shape} for train data and " - f"{X_test.shape} for test data") - - _ = sklearn.utils.check_array( - X_test, - force_all_finite=True, - ensure_2d=False, - allow_nd=True, - accept_sparse=False, - accept_large_sparse=False - ) - - self._is_fitted = True + if series_idx is not None: + # remove series idx as they are not part of features + if isinstance(X_train, pd.DataFrame): + for series_id in series_idx: + if series_id not in X_train.columns: + raise ValueError(f"All Series ID must be contained in the training column, however, {series_id}" + f"is not part of {X_train.columns.tolist()}") + self.data_contain_ser_idx = True + + X_train_ = X_train.drop(series_idx, axis=1) + X_test_ = X_test.drop(series_idx, axis=1) if X_test is not None else None + + super().fit(X_train_, X_test_) + else: + raise NotImplementedError(f"series idx only works with pandas.DataFrame but the type of " + f"X_train is {type(X_train)} ") + else: + super().fit(X_train, X_test) + if isinstance(X_train, pd.DataFrame): + if series_idx is None: + series_idx = ['Series Idx'] + self.column_order = series_idx + self.column_order return self - def transform(self, X: np.ndarray) -> np.ndarray: - """ - - Arguments: - X (np.ndarray): - A set of data, that is going to be transformed - - Return: - np.ndarray: - The transformed array - """ - if not self._is_fitted: - raise NotFittedError("Cannot call transform on a validator that is not fitted") - - return sklearn.utils.check_array( - X, - force_all_finite=True, - ensure_2d=False, - allow_nd=True, - accept_sparse=False, - accept_large_sparse=False - ) diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 19c1ead0f..62ee57c8e 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -6,21 +6,27 @@ from typing import Optional, Tuple, List, Union, Dict import numpy as np import pandas as pd +from pandas.core.groupby.generic import DataFrameGroupBy from sklearn.base import BaseEstimator 
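
[Editorial note, not part of the patch] In the fit() method above, series_idx columns are treated purely as sequence identifiers: each one must be present in the training DataFrame, they are dropped before the underlying TabularFeatureValidator is fitted, and they are prepended to column_order. A toy illustration with made-up column names:

    import pandas as pd

    X_train = pd.DataFrame({
        "series_id": [0, 0, 1, 1],      # identifies the sequence, not a model feature
        "value": [1.0, 2.0, 3.0, 4.0],
    })
    series_idx = ["series_id"]

    features_only = X_train.drop(series_idx, axis=1)  # what the tabular validator actually sees
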
from sklearn.exceptions import NotFittedError -from autoPyTorch.data.base_feature_validator import SUPPORTED_FEAT_TYPES -from autoPyTorch.data.base_target_validator import SUPPORTED_TARGET_TYPES from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator +from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator class TimeSeriesForecastingInputValidator(TabularInputValidator): + """ + A validator designed for a time series forecasting dataset. + As a time series forecasting dataset might contain several time sequence with different length, we will transform + all the data to DataFrameGroupBy whereas each group represents a series + """ def __init__(self, is_classification: bool = False, logger_port: Optional[int] = None, ) -> None: super(TimeSeriesForecastingInputValidator, self).__init__(is_classification, logger_port) + self.feature_validator = TimeSeriesFeatureValidator(logger=self.logger) self._is_uni_variant = False self.known_future_features = None self.n_prediction_steps = 1 @@ -28,18 +34,15 @@ def __init__(self, self.start_times_test = None self.feature_shapes = {} self.feature_names = [] - - """ - A validator designed for a time series forecasting dataset. - As a time series forecasting dataset might contain several time sequnces with - """ + self.series_idx = None def fit( self, - X_train: Optional[SUPPORTED_FEAT_TYPES], - y_train: SUPPORTED_TARGET_TYPES, - X_test: Optional[SUPPORTED_FEAT_TYPES] = None, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + X_train: Optional[Union[List, pd.DataFrame]], + y_train: Union[List, pd.DataFrame], + series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, + X_test: Optional[Union[List, pd.DataFrame]] = None, + y_test: Optional[Union[List, pd.DataFrame]] = None, start_times_train: Optional[List[pd.DatetimeIndex]] = None, start_times_test: Optional[List[pd.DatetimeIndex]] = None, freq: str = '1Y', @@ -47,6 +50,17 @@ def fit( known_future_features: Optional[List[Union[int, str]]] = None, use_time_features: bool = False ) -> BaseEstimator: + """ + fit the validator with the training data, (optionally) start times and other information + Args: + X_train (Optional[Union[List, pd.DataFrame]]): training features, could be None for "pure" forecasting tasks + y_train (Union[List, pd.DataFrame]), training targets + series_idx (Optional[Union[List[Union[str, int]], str, int]]): which columns of the data are considered as + + """ + if isinstance(series_idx, (str, int)): + series_idx = [series_idx] + self.series_idx = series_idx self.n_prediction_steps = n_prediction_steps if y_test is not None and bool(start_times_test) != bool(start_times_train): @@ -69,46 +83,57 @@ def fit( if X_train is None: self._is_uni_variant = True - if self._is_uni_variant: - self.feature_validator.num_features = 0 - self.feature_validator.numerical_columns = [] - self.feature_validator.categorical_columns = [] - - if y_test is not None: - self.target_validator.fit(y_train[0], y_test[0]) + if isinstance(y_train, List): + if self.series_idx is not None: + # TODO: add support for this + raise NotImplementedError("When training data is given in the form of list, providing series idx info" + "is not supported") + # X_train and y_train are stored as lists + if self._is_uni_variant: + self.feature_validator.num_features = 0 + self.feature_validator.numerical_columns = [] + self.feature_validator.categorical_columns = [] + + if y_test is not None: + 
self.target_validator.fit(y_train[0], y_test[0]) + else: + self.target_validator.fit(y_train[0]) + self._is_fitted = True else: - self.target_validator.fit(y_train[0]) - self._is_fitted = True + self.known_future_features = known_future_features + # Check that the data is valid + if len(X_train) != len(y_train): + raise ValueError("Inconsistent number of sequences for features and targets," + " {} for features and {} for targets".format(len(X_train), len(y_train), )) + + if X_test is not None: + if len(X_test) != len(y_test): + raise ValueError("Inconsistent number of test datapoints for features and targets," + " {} for features and {} for targets".format(len(X_test), len(y_test), )) + # TODO write a feature input validator to check X_test for known_future_features + super().fit(X_train[0], y_train[0], X_test[0], y_test[0]) + self.feature_validator.fit(X_train[0], None if X_test is None else X_test[0], series_idx=series_idx) + self.target_validator.fit(y_train[0], None if y_test is None else y_test[0]) + self._is_fitted = True + + # In this case we don't assign series index to the data, we manually assigne + + self.check_input_shapes(X_train, y_train, is_training=True) + + if X_test is not None: + self.check_input_shapes(X_test, y_test, is_training=False) + if hasattr(X_train[0], 'columns'): + features = X_train[0].columns.values.tolist() + else: + features = list(map(str, range(len(X_train[0])))) + for feature in features: + self.feature_names.append(feature) + self.feature_shapes[feature] = 1 else: - self.known_future_features = known_future_features - # Check that the data is valid - if len(X_train) != len(y_train): - raise ValueError("Inconsistent number of sequences for features and targets," - " {} for features and {} for targets".format(len(X_train), len(y_train), )) - - if X_test is not None: - if len(X_test) != len(y_test): - raise ValueError("Inconsistent number of test datapoints for features and targets," - " {} for features and {} for targets".format(len(X_test), len(y_test), )) - # TODO write a feature input validator to check X_test for known_future_features - super().fit(X_train[0], y_train[0], X_test[0], y_test[0]) - self.feature_validator.fit(X_train[0], None if X_test is None else X_test[0]) - self.target_validator.fit(y_train[0], None if y_test is None else y_test[0]) - self._is_fitted = True - - self.check_input_shapes(X_train, y_train, is_training=True) - - if X_test is not None: - self.check_input_shapes(X_test, y_test, is_training=False) - if hasattr(X_train[0], 'columns'): - features = X_train[0].columns.values.tolist() - else: - features = list(map(str, range(len(X_train[0])))) - for feature in features: - self.feature_names.append(feature) - self.feature_shapes[feature] = 1 + # TODO X_train and y_train are pd.DataFrame + raise NotImplementedError - return self + return self @staticmethod def get_num_features(X): @@ -134,61 +159,70 @@ def check_input_shapes(X, y, is_training: bool = True): def transform( self, - X: Optional[SUPPORTED_FEAT_TYPES], - y: Optional[SUPPORTED_TARGET_TYPES] = None, - ) -> Tuple[Union[np.ndarray], List[int], Optional[np.ndarray]]: + X: Optional[Union[List, pd.DataFrame]], + y: Optional[Union[List, pd.DataFrame]] = None, + ) -> Tuple[DataFrameGroupBy, Optional[DataFrameGroupBy], List[int]]: if not self._is_fitted: raise NotFittedError("Cannot call transform on a validator that is not fitted") if y is None: raise ValueError('Targets must be given!') - num_sequences = len(y) - sequence_lengths = [0] * num_sequences - if 
self._is_uni_variant: - num_features = 0 - else: - if X is None: - raise ValueError('Multi Variant dataset requires X as input!') - num_features = self.feature_validator.num_features + series_idx = ['Series Index'] or self.series_idx + + if isinstance(y, List): + num_sequences = len(y) + sequence_lengths = [0] * num_sequences + if self._is_uni_variant: + num_features = 0 + else: + if X is None: + raise ValueError('Multi Variant dataset requires X as input!') + num_features = self.feature_validator.num_features - for seq_idx in range(num_sequences): - sequence_lengths[seq_idx] = len(y[seq_idx]) - sequence_lengths = np.asarray(sequence_lengths) + for seq_idx in range(num_sequences): + sequence_lengths[seq_idx] = len(y[seq_idx]) + sequence_lengths = np.asarray(sequence_lengths) - num_targets = self.target_validator.out_dimensionality + num_targets = self.target_validator.out_dimensionality - num_data = np.sum(sequence_lengths) + num_data = np.sum(sequence_lengths) - start_idx = 0 + start_idx = 0 - group_ids = np.arange(len(sequence_lengths)).repeat(sequence_lengths) - if self._is_uni_variant: + group_ids = np.arange(len(sequence_lengths)).repeat(sequence_lengths) + + if self._is_uni_variant: + y_flat = pd.DataFrame(np.empty([num_data, num_targets]), index=group_ids) + for seq_idx, seq_length in enumerate(sequence_lengths): + end_idx = start_idx + seq_length + y_flat[start_idx: end_idx] = np.array(y[seq_idx]).reshape([-1, num_targets]) + start_idx = end_idx + + y_transformed = self.target_validator.transform(y_flat) + import pdb + pdb.set_trace() + if y_transformed.ndim == 1: + y_transformed = np.expand_dims(y_transformed, -1) + return np.asarray([]), y_transformed, sequence_lengths + + # a matrix that is concatenated by all the time series sequences + + X_flat = pd.DataFrame(np.empty([num_data, num_features]), index=group_ids) y_flat = pd.DataFrame(np.empty([num_data, num_targets]), index=group_ids) + + start_idx = 0 for seq_idx, seq_length in enumerate(sequence_lengths): end_idx = start_idx + seq_length + X_flat[start_idx: end_idx] = np.array(X[seq_idx]).reshape([-1, num_features]) y_flat[start_idx: end_idx] = np.array(y[seq_idx]).reshape([-1, num_targets]) start_idx = end_idx - y_transformed = self.target_validator.transform(y_flat) + + X_transformed = self.feature_validator.transform(X_flat) # type:np.ndarray + y_transformed = self.target_validator.transform(y_flat) # type:np.ndarray if y_transformed.ndim == 1: y_transformed = np.expand_dims(y_transformed, -1) - return np.asarray([]), y_transformed, sequence_lengths - - # a matrix that is concatenated by all the time series sequences - - X_flat = pd.DataFrame(np.empty([num_data, num_features]), index=group_ids) - y_flat = pd.DataFrame(np.empty([num_data, num_targets]), index=group_ids) - - start_idx = 0 - for seq_idx, seq_length in enumerate(sequence_lengths): - end_idx = start_idx + seq_length - X_flat[start_idx: end_idx] = np.array(X[seq_idx]).reshape([-1, num_features]) - y_flat[start_idx: end_idx] = np.array(y[seq_idx]).reshape([-1, num_targets]) - start_idx = end_idx - - X_transformed = self.feature_validator.transform(X_flat) # type:np.ndarray - y_transformed = self.target_validator.transform(y_flat) # type:np.ndarray - if y_transformed.ndim == 1: - y_transformed = np.expand_dims(y_transformed, -1) - return X_transformed, y_transformed, sequence_lengths + return X_transformed, y_transformed, sequence_lengths + else: + raise NotImplementedError \ No newline at end of file From cc77b518c0a70988351664ede5e9b4d23aa1c8b9 Mon Sep 17 
00:00:00 2001 From: dengdifan Date: Wed, 13 Apr 2022 18:41:01 +0200 Subject: [PATCH 209/347] maint --- autoPyTorch/api/base_task.py | 20 +- autoPyTorch/api/time_series_forecasting.py | 205 ++++++++++++++---- .../data/time_series_forecasting_validator.py | 9 +- autoPyTorch/data/time_series_validator.py | 5 - autoPyTorch/datasets/time_series_dataset.py | 1 + autoPyTorch/evaluation/abstract_evaluator.py | 3 - autoPyTorch/evaluation/tae.py | 73 +++---- ...time_series_forecasting_train_evaluator.py | 2 +- autoPyTorch/optimizer/smbo.py | 1 + .../time_series_forecasting_data_loader.py | 3 +- .../forecasting_base_trainer.py | 8 - requirements.txt | 6 +- 12 files changed, 215 insertions(+), 121 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index c2e08a7f1..9f108cf71 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -745,14 +745,9 @@ def _do_dummy_prediction(self) -> None: initial_num_run=num_run, stats=stats, memory_limit=memory_limit, -<<<<<<< HEAD - disable_file_output=True if len(self._disable_file_output) > 0 else False, + disable_file_output=self._disable_file_output, all_supported_metrics=self._all_supported_metrics, evaluator_class=TimeSeriesForecastingTrainEvaluator if self.time_series_forecasting else None, -======= - disable_file_output=self._disable_file_output, - all_supported_metrics=self._all_supported_metrics ->>>>>>> upstream/development ) status, _, _, additional_info = ta.run(num_run, cutoff=self._time_for_task) @@ -838,7 +833,8 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: stats=stats, memory_limit=memory_limit, disable_file_output=self._disable_file_output, - all_supported_metrics=self._all_supported_metrics + all_supported_metrics=self._all_supported_metrics, + evaluator_class=TimeSeriesForecastingTrainEvaluator if self.time_series_forecasting else None, ) dask_futures.append([ classifier, @@ -937,7 +933,6 @@ def _search( load_models: bool = True, portfolio_selection: Optional[str] = None, dask_client: Optional[dask.distributed.Client] = None, - time_series_forecasting: bool = False, **kwargs: Dict[str, Any] ) -> 'BaseTask': """ @@ -1134,7 +1129,7 @@ def _search( # Incorporate budget to pipeline config if budget_type not in ('epochs', 'runtime') and (budget_type in FORECASTING_BUDGET_TYPE - and not time_series_forecasting): + and not self.time_series_forecasting): raise ValueError("Budget type must be one ('epochs', 'runtime')" f" yet {budget_type} was provided") self.pipeline_options['budget_type'] = budget_type @@ -1254,7 +1249,6 @@ def _search( if time_left_for_smac <= 0: self._logger.warning(" Not starting SMAC because there is no time left") else: - _proc_smac = AutoMLSMBO( config_space=self.search_space, dataset_name=str(dataset.dataset_name), @@ -1285,7 +1279,7 @@ def _search( search_space_updates=self.search_space_updates, portfolio_selection=portfolio_selection, pynisher_context=self._multiprocessing_context, - time_series_forecasting=time_series_forecasting, + time_series_forecasting=self.time_series_forecasting, **kwargs, ) try: @@ -1662,7 +1656,9 @@ def fit_pipeline( exclude=exclude_components, search_space_updates=search_space_updates, pipeline_config=pipeline_options, - pynisher_context=self._multiprocessing_context + pynisher_context=self._multiprocessing_context, + evaluator_class=TimeSeriesForecastingTrainEvaluator if self.time_series_forecasting else None, + ) run_info, run_value = tae.run_wrapper( diff --git a/autoPyTorch/api/time_series_forecasting.py 
b/autoPyTorch/api/time_series_forecasting.py index 236b873a2..99a886beb 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -1,6 +1,6 @@ import os import uuid -from typing import Any, Callable, Dict, List, Optional, Union, Tuple +from typing import Any, Callable, Dict, List, Optional, Union, Tuple, Mapping import numpy as np @@ -13,7 +13,13 @@ from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, + ResamplingStrategies, ) +from autoPyTorch.data.utils import ( + DatasetCompressionSpec, + get_dataset_compression_mapping, +) +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline from autoPyTorch.automl_common.common.utils.backend import Backend @@ -107,17 +113,136 @@ def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, An )) return dataset.get_required_dataset_info() - def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TimeSeriesForecastingPipeline: - return TimeSeriesForecastingPipeline(dataset_properties=dataset_properties) + def build_pipeline( + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> TimeSeriesForecastingPipeline: + """ + Build pipeline according to current task + and for the passed dataset properties - def search( + Args: + dataset_properties (Dict[str, Any]): + Characteristics of the dataset to guide the pipeline + choices of components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. 
+ search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + Search space updates that can be used to modify the search + space of particular components or choice modules of the pipeline + + Returns: + TimeSeriesForecastingPipeline: + + """ + return TimeSeriesForecastingPipeline(dataset_properties=dataset_properties, + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates) + + def _get_dataset_input_validator( self, - optimize_metric: str, - X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + dataset_compression: Optional[DatasetCompressionSpec] = None, + freq: Optional[Union[str, int, List[int]]] = None, + start_times_train: List[pd.DatetimeIndex] = [], + start_times_test: Optional[List[pd.DatetimeIndex]] = None, + n_prediction_steps: int = 1, + ) -> Tuple[TimeSeriesForecastingDataset, TimeSeriesForecastingInputValidator]: + """ + Returns an object of `TimeSeriesForecastingDataset` and an object of + `TimeSeriesForecastingInputValidator` according to the current task. + + Args: + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. + X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[RESAMPLING_STRATEGIES]): + Strategy to split the training data. If None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str]): + name of the dataset, used as experiment name. + dataset_compression (Optional[DatasetCompressionSpec]): + specifications for dataset compression. For more info check + documentation for `BaseTask.get_dataset`. + freq: Optional[Union[str, int, List[int]]] + frequency information, it determines the configuration space of the window size, if it is not given, + we will use the default configuration + start_times_train, start_times_test, n_prediction_steps: + further forecasting dataset arguments, for more details, please check + ```datasets/time_series_dataset.py``` + Returns: + TimeSeriesForecastingDataset: + the dataset object. + TimeSeriesForecastingInputValidator: + the input validator fitted on the data.
+ """ + + resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy + resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \ + self.resampling_strategy_args + + # Create a validator object to make sure that the data provided by + # the user matches the autopytorch requirements + input_validator = TimeSeriesForecastingInputValidator( + is_classification=False, + logger_port=self._logger_port, + dataset_compression=dataset_compression + ) + + # Fit a input validator to check the provided data + # Also, an encoder is fit to both train and test data, + # to prevent unseen categories during inference + input_validator.fit(X_train=X_train, y_train=y_train, start_times_train=start_times_train, + X_test=X_test, y_test=y_test, start_times_test=start_times_test) + + dataset = TimeSeriesForecastingDataset( + X=X_train, Y=y_train, + X_test=X_test, Y_test=y_test, + freq=freq, + start_times_train=start_times_train, + start_times_test=start_times_test, + validator=input_validator, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + n_prediction_steps=n_prediction_steps, + ) + + return dataset, input_validator + + def search( + self, + optimize_metric: str, + X_train: Optional[Union[List, pd.DataFrame]] = None, + y_train: Optional[Union[List, pd.DataFrame]] = None, + X_test: Optional[Union[List, pd.DataFrame]] = None, + y_test: Optional[Union[List, pd.DataFrame]] = None, n_prediction_steps: int = 1, freq: Optional[Union[str, int, List[int]]] = None, start_times_train: List[pd.DatetimeIndex] = [], @@ -137,10 +262,10 @@ def search( disable_file_output: List = [], load_models: bool = True, portfolio_selection: Optional[str] = None, - normalize_y: bool = True, suggested_init_models: Optional[List[str]] = None, custom_init_setting_path: Optional[str] = None, min_num_test_instances: Optional[int] = None, + dataset_compression: Union[Mapping[str, Any], bool] = False, ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -149,21 +274,29 @@ def search( To disable ensembling, set ensemble_size==0. using the optimizer. Args: - X_train, y_train, X_test, y_test: Union[np.ndarray, List, pd.DataFrame] + optimize_metric (str): + name of the metric that is used to evaluate a pipeline. + X_train: Optional[Union[List, pd.DataFrame]] A pair of features (X_train) and targets (y_train) used to fit a pipeline. Additionally, a holdout of this pairs (X_test, y_test) can be provided to track the generalization performance of each stage. 
- target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, - (used for multi-variable prediction), indicates which value needs to be predicted + y_train: Union[List, pd.DataFrame] + training target, must be given + X_test: Optional[Union[List, pd.DataFrame]] + Test Features, Test series need to end at one step before forecasting + y_test: Optional[Union[List, pd.DataFrame]] + Test Targets n_prediction_steps: int How many steps in advance we need to predict freq: Optional[Union[str, int, List[int]]] frequency information, it determines the configuration space of the window size, if it is not given, we will use the default configuration + start_times_train: : List[pd.DatetimeIndex] + A list indicating the start time of each series in the training sets + start_times_test: Optional[List[pd.DatetimeIndex]] = None, + A list indicating the start time of each series in the test sets dataset_name: Optional[str], dataset name - optimize_metric (str): name of the metric that is used to - evaluate a pipeline. budget_type (str): Type of budget to be used when fitting the pipeline. It can be one of: @@ -237,8 +370,6 @@ def search( disable_file_output (Union[bool, List]): load_models (bool), (default=True): Whether to load the models after fitting AutoPyTorch. - normalize_y: bool - if the input y values need to be normalized suggested_init_models: Optional[List[str]] suggested initial models with their default configurations setting custom_init_setting_path: Optional[str] @@ -251,37 +382,22 @@ def search( self """ - if dataset_name is None: - dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) - # we have to create a logger for at this point for the validator - self._logger = self._get_logger(dataset_name) - # TODO we will only consider target variables as int here - self.target_variables = target_variables - # Create a validator object to make sure that the data provided by - # the user matches the autopytorch requirements - self.InputValidator = TimeSeriesForecastingInputValidator( - is_classification=False, - logger_port=self._logger_port, - ) + self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression) - # Fit a input validator to check the provided data - # Also, an encoder is fit to both train and test data, - # to prevent unseen categories during inference - self.InputValidator.fit(X_train=X_train, y_train=y_train, start_times_train=start_times_train, - X_test=X_test, y_test=y_test, start_times_test=start_times_test) - - self.dataset = TimeSeriesForecastingDataset( - X=X_train, Y=y_train, - X_test=X_test, Y_test=y_test, + self.dataset, self.input_validator = self._get_dataset_input_validator( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + dataset_name=dataset_name, + dataset_compression=self._dataset_compression, freq=freq, start_times_train=start_times_train, start_times_test=start_times_test, - validator=self.InputValidator, - resampling_strategy=self.resampling_strategy, - resampling_strategy_args=self.resampling_strategy_args, - n_prediction_steps=n_prediction_steps, - normalize_y=normalize_y, + n_prediction_steps=n_prediction_steps ) if self.dataset.base_window_size is not None or not self.customized_window_size: @@ -331,13 +447,12 @@ def search( disable_file_output=disable_file_output, load_models=load_models, portfolio_selection=portfolio_selection, - 
time_series_forecasting=self.time_series_forecasting, - **forecasting_kwargs, + **forecasting_kwargs ) def predict( self, - X_test: Optional[Union[Union[List[np.ndarray]], pd.DataFrame, Dict]]=None, + X_test: Optional[Union[Union[List[np.ndarray]], pd.DataFrame, Dict]] = None, batch_size: Optional[int] = None, n_jobs: int = 1, past_targets: Optional[List[np.ndarray]] = None, diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 62ee57c8e..2fbad0fd2 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -10,6 +10,7 @@ from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError +from autoPyTorch.data.utils import DatasetCompressionSpec from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator @@ -24,8 +25,9 @@ class TimeSeriesForecastingInputValidator(TabularInputValidator): def __init__(self, is_classification: bool = False, logger_port: Optional[int] = None, + dataset_compression: Optional[DatasetCompressionSpec] = None, ) -> None: - super(TimeSeriesForecastingInputValidator, self).__init__(is_classification, logger_port) + super(TimeSeriesForecastingInputValidator, self).__init__(is_classification, logger_port, dataset_compression) self.feature_validator = TimeSeriesFeatureValidator(logger=self.logger) self._is_uni_variant = False self.known_future_features = None @@ -68,7 +70,7 @@ def fit( 'risk of not proper evaluated ') if start_times_train is None: - start_times_train = [pd.DatetimeIndex(pd.to_datetime(['1900-01-01']), freq=freq)] * len(y_train) + start_times_train = [pd.DatetimeIndex(pd.to_datetime(['2000-01-01']), freq=freq)] * len(y_train) else: assert len(start_times_train) == len(y_train), 'start_times_train must have the same length as y_train!' 
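# The search() interface reworked in the patch above is easiest to see with a small,
# self-contained call. This is a hedged sketch only: the task class name
# `TimeSeriesForecastingTask` and the metric name 'mean_MASE_forecasting' are assumptions
# not shown in the diff, the data is synthetic, and only keyword arguments visible in the
# new signature are used.
import numpy as np
import pandas as pd

from autoPyTorch.api.time_series_forecasting import TimeSeriesForecastingTask

# three univariate series of different lengths, passed as a list of 1d arrays
y_train = [np.sin(np.arange(n) / 10.0) for n in (100, 120, 90)]
start_times = [pd.DatetimeIndex(pd.to_datetime(['2000-01-01']))] * len(y_train)

api = TimeSeriesForecastingTask()
api.search(
    optimize_metric='mean_MASE_forecasting',  # assumed metric name
    y_train=y_train,
    freq='1D',
    start_times_train=start_times,
    n_prediction_steps=5,
)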
@@ -201,8 +203,7 @@ def transform( start_idx = end_idx y_transformed = self.target_validator.transform(y_flat) - import pdb - pdb.set_trace() + if y_transformed.ndim == 1: y_transformed = np.expand_dims(y_transformed, -1) return np.asarray([]), y_transformed, sequence_lengths diff --git a/autoPyTorch/data/time_series_validator.py b/autoPyTorch/data/time_series_validator.py index 373a6a740..11073363e 100644 --- a/autoPyTorch/data/time_series_validator.py +++ b/autoPyTorch/data/time_series_validator.py @@ -2,11 +2,6 @@ import logging import typing -from sklearn.base import BaseEstimator - -from autoPyTorch.data.base_feature_validator import SUPPORTED_FEAT_TYPES -from autoPyTorch.data.base_target_validator import SUPPORTED_TARGET_TYPES - from autoPyTorch.data.base_validator import BaseInputValidator from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator from autoPyTorch.data.time_series_target_validator import TimeSeriesTargetValidator diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index ab9c38254..ee11be04e 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -713,6 +713,7 @@ def get_test_target(self, test_indices: np.ndarray) -> np.ndarray: if dataset_idx != 0: test_idx = test_idx - self.cumulative_sizes[dataset_idx - 1] y_test[y_i] = self.datasets[dataset_idx].get_test_target(test_idx) + return y_test.reshape([-1, self.num_target]) def make_sequences_datasets(self, diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 5cf4c51df..667e2ec2c 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -546,9 +546,6 @@ def __init__(self, backend: Backend, raise ValueError('task {} not available'.format(self.task_type)) self.predict_function = self._predict_regression - - - self.additional_metrics: Optional[List[autoPyTorchMetric]] = None metrics_dict: Optional[Dict[str, List[str]]] = None if all_supported_metrics: diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 48ffe71b2..6568296d4 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -105,31 +105,31 @@ class ExecuteTaFuncWithQueue(AbstractTAFunc): """ def __init__( - self, - backend: Backend, - seed: int, - metric: autoPyTorchMetric, - cost_for_crash: float, - abort_on_first_run_crash: bool, - pynisher_context: str, - multi_objectives: List[str], - pipeline_config: Optional[Dict[str, Any]] = None, - initial_num_run: int = 1, - stats: Optional[Stats] = None, - run_obj: str = 'quality', - par_factor: int = 1, - output_y_hat_optimization: bool = True, - include: Optional[Dict[str, Any]] = None, - exclude: Optional[Dict[str, Any]] = None, - memory_limit: Optional[int] = None, - disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, - init_params: Dict[str, Any] = None, - budget_type: str = None, - ta: Optional[Callable] = None, - logger_port: int = None, - all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - ** eval_func_kwargs: Dict): + self, + backend: Backend, + seed: int, + metric: autoPyTorchMetric, + cost_for_crash: float, + abort_on_first_run_crash: bool, + pynisher_context: str, + multi_objectives: List[str], + pipeline_config: Optional[Dict[str, Any]] = None, + initial_num_run: int = 1, + stats: Optional[Stats] = None, + run_obj: str = 
'quality', + par_factor: int = 1, + output_y_hat_optimization: bool = True, + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + memory_limit: Optional[int] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + init_params: Dict[str, Any] = None, + budget_type: str = None, + ta: Optional[Callable] = None, + logger_port: int = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + **eval_func_kwargs: Dict): self.backend = backend @@ -147,7 +147,7 @@ def __init__( self.resampling_strategy_args = dm.resampling_strategy_args if isinstance(self.resampling_strategy, (HoldoutValTypes, CrossValTypes)): - eval_function = functools.partial(eval_train_function, **eval_func_kwargs) + eval_function = functools.partial(eval_train_function, **eval_func_kwargs) self.output_y_hat_optimization = output_y_hat_optimization elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): # TODO check eval_test for forecasting tasks @@ -226,8 +226,8 @@ def _check_and_get_default_budget(self) -> float: return budget_choices[budget_type] def run_wrapper( - self, - run_info: RunInfo, + self, + run_info: RunInfo, ) -> Tuple[RunInfo, RunValue]: """ wrapper function for ExecuteTARun.run_wrapper() to cap the target algorithm @@ -257,7 +257,6 @@ def run_wrapper( # The budget will be provided in train evaluator when budget_type is None run_info = run_info._replace(budget=default_budget) - remaining_time = self.stats.get_remaing_time_budget() if remaining_time - 5 < run_info.cutoff: @@ -287,13 +286,13 @@ def run_wrapper( return run_info, run_value def run( - self, - config: Configuration, - instance: Optional[str] = None, - cutoff: Optional[float] = None, - seed: int = 12345, - budget: float = 0.0, - instance_specific: Optional[str] = None, + self, + config: Configuration, + instance: Optional[str] = None, + cutoff: Optional[float] = None, + seed: int = 12345, + budget: float = 0.0, + instance_specific: Optional[str] = None, ) -> Tuple[StatusType, float, float, Dict[str, Any]]: context = multiprocessing.get_context(self.pynisher_context) @@ -319,7 +318,7 @@ def run( # Pynisher expects seconds as a time indicator wall_time_in_s=int(cutoff) if cutoff is not None else None, # TODO Figure out how pynisher influences GPU memory usage here - #mem_in_mb=self.memory_limit, + # mem_in_mb=self.memory_limit, capture_output=True, context=context, ) diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 654a4e118..444a7b3e8 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -76,7 +76,7 @@ def __init__(self, backend: Backend, queue: Queue, pipeline_config=pipeline_config, search_space_updates=search_space_updates ) - self.datamanager: TimeSeriesForecastingDataset + self.datamanager = backend.load_datamanager() self.n_prediction_steps = self.datamanager.n_prediction_steps self.num_sequences = self.datamanager.num_sequences self.num_targets = self.datamanager.num_target diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index f5126a169..666af4374 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -336,6 +336,7 @@ def run_smbo(self, func: Optional[Callable] = None pipeline_config=self.pipeline_config, 
search_space_updates=self.search_space_updates, pynisher_context=self.pynisher_context, + evaluator_class=TimeSeriesForecastingTrainEvaluator if self.time_series_forecasting else None, ) ta = ExecuteTaFuncWithQueue diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index ee6bef576..8ba9e0b28 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -154,7 +154,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.freq = X['dataset_properties']['freq'] self.time_feature_transform = X['dataset_properties']['time_feature_transform'] - train_dataset, val_dataset = datamanager.get_dataset_for_training(split_id=X['split_id']) + train_dataset = datamanager.get_dataset(split_id=X['split_id'], train=True) + val_dataset = datamanager.get_dataset(split_id=X['split_id'], train=False) train_split, test_split = datamanager.splits[X['split_id']] diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index f97075d8f..ff80e500c 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -89,8 +89,6 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, self.model.train() outputs_data = list() targets_data = list() - import time - time_start = time.time() for step, (data, targets) in enumerate(train_loader): if self.budget_tracker.is_max_time_reached(): @@ -116,9 +114,6 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, self._scheduler_step(step_interval=StepIntervalUnit.epoch, loss=loss_sum / N) - time_end = time.time() - print(f'time used epoch {epoch}: {time_end - time_start}') - if self.metrics_during_training: return loss_sum / N, self.compute_metrics(outputs_data, targets_data) else: @@ -229,8 +224,6 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, float: test loss Dict[str, float]: scores for each desired metric """ - import time - time_start = time.time() if not isinstance(self.model, (ForecastingDeepARNet, ForecastingSeq2SeqNet)): # To save time, we simply make one step prediction for DeepAR and Seq2Seq self.model.eval() @@ -315,7 +308,6 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, self._scheduler_step(step_interval=StepIntervalUnit.valid, loss=loss_sum / N) self.model.train() - print(f'time for evaluation: {time.time() - time_start}') return loss_sum / N, self.compute_metrics(outputs_data, targets_data) def compute_metrics(self, outputs_data: List[torch.Tensor], targets_data: List[torch.Tensor] diff --git a/requirements.txt b/requirements.txt index c94200f2d..1757e3727 100755 --- a/requirements.txt +++ b/requirements.txt @@ -10,12 +10,8 @@ imgaug>=0.4.0 ConfigSpace>=0.4.14,<0.5 pynisher>=0.6.3 pyrfr>=0.7,<0.9 -<<<<<<< HEAD -smac -gluonts -======= smac>=1.2 ->>>>>>> upstream/development +gluonts dask distributed>=2.2.0 catboost From fe6fb1f46e8824750b9f1156ab398d130ba2dd90 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 13 Apr 2022 20:34:02 +0200 Subject: 
[PATCH 210/347] validator for multi-variant series --- .../data/time_series_feature_validator.py | 15 ++-- .../data/time_series_forecasting_validator.py | 71 ++++++++++--------- 2 files changed, 47 insertions(+), 39 deletions(-) diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index d2990eb93..cae26d211 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -16,7 +16,7 @@ def __init__( logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None, ): super().__init__(logger) - self.data_contain_ser_idx = False + self.only_contain_series_idx = True def fit(self, X_train: Union[pd.DataFrame, np.ndarray], @@ -46,9 +46,16 @@ def fit(self, if series_id not in X_train.columns: raise ValueError(f"All Series ID must be contained in the training column, however, {series_id}" f"is not part of {X_train.columns.tolist()}") - self.data_contain_ser_idx = True + self.only_contain_series_idx = len(X_train.columns) == series_idx + if self.only_contain_series_idx: + self._is_fitted = True + + self.num_features = 0 + self.numerical_columns = [] + self.categorical_columns = [] X_train_ = X_train.drop(series_idx, axis=1) + X_test_ = X_test.drop(series_idx, axis=1) if X_test is not None else None super().fit(X_train_, X_test_) @@ -57,10 +64,6 @@ def fit(self, f"X_train is {type(X_train)} ") else: super().fit(X_train, X_test) - if isinstance(X_train, pd.DataFrame): - if series_idx is None: - series_idx = ['Series Idx'] - self.column_order = series_idx + self.column_order return self diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 2fbad0fd2..ba017836d 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -86,10 +86,6 @@ def fit( if X_train is None: self._is_uni_variant = True if isinstance(y_train, List): - if self.series_idx is not None: - # TODO: add support for this - raise NotImplementedError("When training data is given in the form of list, providing series idx info" - "is not supported") # X_train and y_train are stored as lists if self._is_uni_variant: self.feature_validator.num_features = 0 @@ -116,6 +112,10 @@ def fit( super().fit(X_train[0], y_train[0], X_test[0], y_test[0]) self.feature_validator.fit(X_train[0], None if X_test is None else X_test[0], series_idx=series_idx) self.target_validator.fit(y_train[0], None if y_test is None else y_test[0]) + + if self.feature_validator.only_contain_series_idx: + self._is_uni_variant = True + self._is_fitted = True # In this case we don't assign series index to the data, we manually assigne @@ -163,15 +163,13 @@ def transform( self, X: Optional[Union[List, pd.DataFrame]], y: Optional[Union[List, pd.DataFrame]] = None, - ) -> Tuple[DataFrameGroupBy, Optional[DataFrameGroupBy], List[int]]: + ) -> Tuple[Optional[DataFrameGroupBy], DataFrameGroupBy, np.ndarray]: if not self._is_fitted: raise NotFittedError("Cannot call transform on a validator that is not fitted") if y is None: raise ValueError('Targets must be given!') - series_idx = ['Series Index'] or self.series_idx - if isinstance(y, List): num_sequences = len(y) sequence_lengths = [0] * num_sequences @@ -181,6 +179,7 @@ def transform( if X is None: raise ValueError('Multi Variant dataset requires X as input!') num_features = self.feature_validator.num_features + assert len(X) == len(y), "Length of features must equal 
to length of targets!" for seq_idx in range(num_sequences): sequence_lengths[seq_idx] = len(y[seq_idx]) @@ -192,38 +191,44 @@ def transform( start_idx = 0 + y_flat = np.empty([num_data, num_targets]) - group_ids = np.arange(len(sequence_lengths)).repeat(sequence_lengths) - - if self._is_uni_variant: - y_flat = pd.DataFrame(np.empty([num_data, num_targets]), index=group_ids) - for seq_idx, seq_length in enumerate(sequence_lengths): - end_idx = start_idx + seq_length - y_flat[start_idx: end_idx] = np.array(y[seq_idx]).reshape([-1, num_targets]) - start_idx = end_idx - - y_transformed = self.target_validator.transform(y_flat) - - if y_transformed.ndim == 1: - y_transformed = np.expand_dims(y_transformed, -1) - return np.asarray([]), y_transformed, sequence_lengths - - # a matrix that is concatenated by all the time series sequences - - X_flat = pd.DataFrame(np.empty([num_data, num_features]), index=group_ids) - y_flat = pd.DataFrame(np.empty([num_data, num_targets]), index=group_ids) - - start_idx = 0 for seq_idx, seq_length in enumerate(sequence_lengths): end_idx = start_idx + seq_length - X_flat[start_idx: end_idx] = np.array(X[seq_idx]).reshape([-1, num_features]) y_flat[start_idx: end_idx] = np.array(y[seq_idx]).reshape([-1, num_targets]) start_idx = end_idx - X_transformed = self.feature_validator.transform(X_flat) # type:np.ndarray - y_transformed = self.target_validator.transform(y_flat) # type:np.ndarray + y_transformed = self.target_validator.transform(y_flat) if y_transformed.ndim == 1: y_transformed = np.expand_dims(y_transformed, -1) - return X_transformed, y_transformed, sequence_lengths + + if self.series_idx is None: + series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) + if not self._is_uni_variant: + if isinstance(X[0], np.ndarray): + x_flat: pd.DataFrame = pd.DataFrame(np.vstack(X)) + elif isinstance(X[0], pd.DataFrame): + x_flat: pd.DataFrame = pd.concat(X) + else: + raise NotImplementedError(f'Cannot transform a List of {type(X[0])}') + x_transformed = self.feature_validator.transform(x_flat) + + else: + # In this case X can only contain pd.DataFrame, see ```time_series_feature_validator.py``` + x_flat = pd.concat(X) + x_columns = x_flat.columns + for ser_id in self.series_idx: + if ser_id not in x_columns: + raise ValueError(f'{ser_id} does not exist in input feature X') + + series_number = pd.MultiIndex.from_frame(x_flat[self.series_idx]) + if not self._is_uni_variant: + x_transformed = self.feature_validator.transform(x_flat.drop[self.series_idx]) + y_transformed: pd.DataFrame = pd.DataFrame(y_transformed, + index=pd.Index(series_number)) + y_transformed: DataFrameGroupBy = y_transformed.groupby(y_transformed.index) + if self._is_uni_variant: + return None, y_transformed, sequence_lengths + return x_transformed.groupby(x_transformed.index), y_transformed, sequence_lengths else: - raise NotImplementedError \ No newline at end of file + raise NotImplementedError From 9264f896c729f01ede894bd3e691c1b319a78587 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 14 Apr 2022 15:02:21 +0200 Subject: [PATCH 211/347] feature validator --- .../data/time_series_feature_validator.py | 3 ++- .../data/time_series_forecasting_validator.py | 19 +++++++++----- autoPyTorch/datasets/time_series_dataset.py | 26 ++++++++----------- 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index cae26d211..3365b7c26 100644 --- 
a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -16,7 +16,7 @@ def __init__( logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None, ): super().__init__(logger) - self.only_contain_series_idx = True + self.only_contain_series_idx = False def fit(self, X_train: Union[pd.DataFrame, np.ndarray], @@ -53,6 +53,7 @@ def fit(self, self.num_features = 0 self.numerical_columns = [] self.categorical_columns = [] + return self X_train_ = X_train.drop(series_idx, axis=1) diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index ba017836d..2fffaa5c3 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -6,13 +6,11 @@ from typing import Optional, Tuple, List, Union, Dict import numpy as np import pandas as pd -from pandas.core.groupby.generic import DataFrameGroupBy from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError -from autoPyTorch.data.utils import DatasetCompressionSpec +from autoPyTorch.data.utils import DatasetCompressionSpec from autoPyTorch.data.tabular_validator import TabularInputValidator -from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator @@ -57,7 +55,9 @@ def fit( Args: X_train (Optional[Union[List, pd.DataFrame]]): training features, could be None for "pure" forecasting tasks y_train (Union[List, pd.DataFrame]), training targets - series_idx (Optional[Union[List[Union[str, int]], str, int]]): which columns of the data are considered as + series_idx (Optional[Union[List[Union[str, int]], str, int]]): which columns of the data are considered to + identify the + """ if isinstance(series_idx, (str, int)): @@ -163,7 +163,7 @@ def transform( self, X: Optional[Union[List, pd.DataFrame]], y: Optional[Union[List, pd.DataFrame]] = None, - ) -> Tuple[Optional[DataFrameGroupBy], DataFrameGroupBy, np.ndarray]: + ) -> Tuple[Optional[pd.DataFrame], pd.DataFrame, np.ndarray]: if not self._is_fitted: raise NotFittedError("Cannot call transform on a validator that is not fitted") @@ -226,9 +226,14 @@ def transform( x_transformed = self.feature_validator.transform(x_flat.drop[self.series_idx]) y_transformed: pd.DataFrame = pd.DataFrame(y_transformed, index=pd.Index(series_number)) - y_transformed: DataFrameGroupBy = y_transformed.groupby(y_transformed.index) if self._is_uni_variant: return None, y_transformed, sequence_lengths - return x_transformed.groupby(x_transformed.index), y_transformed, sequence_lengths + + if x_transformed.ndim == 1: + x_transformed = np.expand_dims(x_transformed, -1) + x_transformed: pd.DataFrame = pd.DataFrame(x_transformed, + index=series_number) + + return x_transformed, y_transformed, sequence_lengths else: raise NotImplementedError diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index ee11be04e..8895ede91 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -717,31 +717,29 @@ def get_test_target(self, test_indices: np.ndarray) -> np.ndarray: return y_test.reshape([-1, self.num_target]) def make_sequences_datasets(self, - X: np.ndarray, - Y: np.ndarray, + X: pd.DataFrame, + Y: pd.DataFrame, start_times_train: List[pd.DatetimeIndex], - time_features_train:Optional[Dict[pd.Timestamp, np.ndarray]]=None, - 
X_test: Optional[np.ndarray] = None, - Y_test: Optional[np.ndarray] = None, + time_features_train: Optional[Dict[pd.Timestamp, np.ndarray]] = None, + X_test: Optional[pd.DataFram] = None, + Y_test: Optional[pd.DataFram] = None, start_times_test: Optional[List[pd.DatetimeIndex]] = None, - time_features_test:Optional[Dict[pd.Timestamp, np.ndarray]]=None, + time_features_test: Optional[Dict[pd.Timestamp, np.ndarray]] = None, normalize_y: bool = True, **sequences_kwargs: Optional[Dict]) -> \ Tuple[List[TimeSeriesSequence], Tuple[List, List], Tuple[List, List]]: """ - build a series time seequences datasets + build a series time sequence datasets Args: - X: np.ndarray (N_all, N_feature) - flattened train feature array with size N_all (the sum of all the series sequences) and N_feature, - number of features - Y: np.ndarray (N_all, N_target) + X: pd.DataFrame (N_all, N_feature) + flattened train feature DataFrame with size N_all (the sum of all the series sequences) and N_feature, + number of features, X's index should contain the information identifying its series number + Y: pd.DataFrame (N_all, N_target) flattened train target array with size N_all (the sum of all the series sequences) and number of targets start_times_train: List[pd.DatetimeIndex] start time of each training series time_features_train: Dict[pd.Timestamp, np.ndarray]: time features for each possible start training times - sequence_lengths_train: List[int] - a list containing all the sequences length in the training set X_test: Optional[np.ndarray (N_all_test, N_feature)] flattened test feature array with size N_all_test (the sum of all the series sequences) and N_feature, number of features @@ -751,8 +749,6 @@ def make_sequences_datasets(self, start time for each test series time_features_test:Optional[Dict[pd.Timestamp, np.ndarray]] time features for each possible start test times. - sequence_lengths_test: Optional[List[int]] - a list containing all the sequences length in the test set normalize_y: bool if we want to normalize target vaues (normalization is conducted w.r.t. 
each sequence) sequences_kwargs: Dict From aa3f7a65fe06965bdaab9f2a667256f93f326179 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 14 Apr 2022 17:41:02 +0200 Subject: [PATCH 212/347] multi-variant datasets --- .../data/time_series_forecasting_validator.py | 3 +- autoPyTorch/datasets/time_series_dataset.py | 109 ++++++++---------- 2 files changed, 48 insertions(+), 64 deletions(-) diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 2fffaa5c3..6dd25b0d9 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -222,8 +222,9 @@ def transform( raise ValueError(f'{ser_id} does not exist in input feature X') series_number = pd.MultiIndex.from_frame(x_flat[self.series_idx]) + if not self._is_uni_variant: - x_transformed = self.feature_validator.transform(x_flat.drop[self.series_idx]) + x_transformed = self.feature_validator.transform(x_flat.drop(self.series_idx, axis=1)) y_transformed: pd.DataFrame = pd.DataFrame(y_transformed, index=pd.Index(series_number)) if self._is_uni_variant: diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 8895ede91..a35e83c24 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -71,6 +71,7 @@ def __init__(self, only_has_past_targets: bool = False, compute_mase_coefficient_value: bool = True, time_features=None, + time_features_test=None, is_test_set=False, ): """ @@ -442,13 +443,27 @@ def __init__(self, X, Y, sequence_lengths = self.validator.transform(X, Y) time_features_train = self.compute_time_features(self.start_times_train, sequence_lengths) - if X_test is not None: + if Y_test is not None: X_test, Y_test, self.sequence_lengths_tests = self.validator.transform(X_test, Y_test) time_features_test = self.compute_time_features(self.start_times_test, self.sequence_lengths_tests) else: self.sequence_lengths_tests = None time_features_test = None + y_groups = Y.groupby(Y.index) + if normalize_y: + mean = y_groups.transform("mean") + std = y_groups.transform("std") + std[std == 0] = 1. + Y = (Y[mean.columns] - mean) / std + if Y_test is not None: + y_groups_test = Y_test.groupby(Y.index) + + mean = y_groups_test.transform("mean") + std = y_groups_test.transform("std") + std[std == 0] = 1. 
+ Y_test = (Y_test[mean.columns] - mean) / std + self.shuffle = shuffle self.random_state = np.random.RandomState(seed=seed) @@ -554,9 +569,6 @@ def __init__(self, "known_future_features": known_future_features, "static_features": static_features} - self.y_train_mean = [0] * len(self.sequence_lengths_train) - self.y_train_std = [1] * len(self.sequence_lengths_train) - sequence_datasets, train_tensors, test_tensors = self.make_sequences_datasets( X=X, Y=Y, X_test=X_test, Y_test=Y_test, @@ -564,7 +576,6 @@ def __init__(self, start_times_test=self.start_times_test, time_features_train=time_features_train, time_features_test=time_features_test, - normalize_y=normalize_y, **sequences_kwargs) self.normalize_y = normalize_y @@ -721,13 +732,15 @@ def make_sequences_datasets(self, Y: pd.DataFrame, start_times_train: List[pd.DatetimeIndex], time_features_train: Optional[Dict[pd.Timestamp, np.ndarray]] = None, - X_test: Optional[pd.DataFram] = None, - Y_test: Optional[pd.DataFram] = None, + X_test: Optional[pd.DataFrame] = None, + Y_test: Optional[pd.DataFrame] = None, start_times_test: Optional[List[pd.DatetimeIndex]] = None, time_features_test: Optional[Dict[pd.Timestamp, np.ndarray]] = None, - normalize_y: bool = True, - **sequences_kwargs: Optional[Dict]) -> \ - Tuple[List[TimeSeriesSequence], Tuple[List, List], Tuple[List, List]]: + **sequences_kwargs: Optional[Dict]) -> Tuple[ + List[TimeSeriesSequence], + Tuple[Optional[pd.DataFrame], pd.DataFrame], + Optional[Tuple[pd.DataFrame, pd.DataFrame]] + ]: """ build a series time sequence datasets Args: @@ -749,8 +762,6 @@ def make_sequences_datasets(self, start time for each test series time_features_test:Optional[Dict[pd.Timestamp, np.ndarray]] time features for each possible start test times. - normalize_y: bool - if we want to normalize target vaues (normalization is conducted w.r.t. 
each sequence) sequences_kwargs: Dict additional arguments for test sets Returns: @@ -763,67 +774,39 @@ def make_sequences_datasets(self, """ sequence_datasets = [] - idx_start_train = 0 idx_start_test = 0 - seq_length_train_flat = self.sequence_lengths_train + self.n_prediction_steps - group_ids = np.arange(len(seq_length_train_flat)).repeat(seq_length_train_flat) - - for seq_idx, seq_length_train in enumerate(seq_length_train_flat): - idx_end_train = idx_start_train + seq_length_train - X_seq = X[idx_start_train: idx_end_train] - Y_seq = Y[idx_start_train: idx_end_train] - - if normalize_y: - Y_seq_mean = np.mean(Y_seq) - Y_seq_std = np.std(Y_seq) - Y_seq = (Y_seq - Y_seq_mean) / Y_seq_std - - Y[idx_start_train: idx_end_train] = Y_seq - - if X_test is not None and Y_test is not None: - seq_length_test = self.sequence_lengths_tests[seq_idx] - idx_end_test = idx_start_test + seq_length_test - - X_test_seq = X_test[idx_start_test: idx_end_test] - Y_test_seq = Y_test[idx_start_test: idx_end_test] + y_group = Y.groupby(Y.index) + if X is not None: + x_group = X.groupby(X.index) + if Y_test is not None: + y_test_group = Y_test.groupby(Y_test.index) + if X_test is not None: + x_test_group = X_test.groupby(X_test.index) - if normalize_y: - Y_test_seq_mean = np.mean(Y_test_seq) - Y_test_seq_std = np.std(Y_test_seq) - Y_seq = (Y_seq - Y_test_seq_mean) / Y_test_seq_std + for i_ser, (start_train, y) in enumerate(zip(start_times_train, y_group)): + ser_id = y[0] + y_ser = y[1].transform(np.array).values + x_ser = x_group.get_group(ser_id).transform(np.array).values if X is not None else None - Y_test[idx_start_test: idx_end_test] = Y_seq + y_test_ser = y_test_group.get_group(ser_id).transform(np.array).values if Y_test is not None else None + x_test_ser = x_test_group.get_group(ser_id).transform(np.array).values if X_test is not None else None - else: - X_test_seq = None - Y_test_seq = None - - if X_seq.size == 0: - X_seq = None - X_test_seq = None - start_time_train = start_times_train[seq_idx] + start_test = None if start_times_test is None else start_times_test[i_ser] + time_feature_test = None if time_features_test is None else time_features_test[start_test][:len(y_test_ser)] sequence = TimeSeriesSequence( - X=X_seq, - Y=Y_seq, - start_time_train=start_time_train, - X_test=X_test_seq, - Y_test=Y_test_seq, - start_time_test=None if start_times_test is None else start_times_test[seq_idx], - time_features=time_features_train[start_time_train][:len(Y_seq)], + X=x_ser, + Y=y_ser, + start_time_train=start_train, + X_test=x_test_ser, + Y_test=y_test_ser, + start_time_test=start_test, + time_features=time_features_train[start_train][:len(y_ser)], + time_features_test=time_feature_test, **sequences_kwargs) sequence_datasets.append(sequence) - idx_start_train = idx_end_train - - # self.sequence_lengths_train[seq_idx] = len(sequence) - - # X_seq_all.append(X_seq) - # Y_seq_all.append(Y_seq) - # X_test_seq_all.append(X_test_seq) - # Y_test_seq_all.append(Y_test_seq) - # train_tensors = (X_seq_all, Y_seq_all) train_tensors = (X, Y) if Y_test is None: test_tensors = None From 974f8ffee647b1bb7e6f58a5530472ca53173a17 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 14 Apr 2022 18:09:21 +0200 Subject: [PATCH 213/347] observed targets --- autoPyTorch/datasets/time_series_dataset.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index a35e83c24..737ca796c 100644 --- 
a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -86,6 +86,8 @@ def __init__(self, self.X = X self.Y = Y + + self.observed_target = ~np.isnan(self.Y) if start_time_train is None: start_time_train = pd.DatetimeIndex(pd.to_datetime(['1900-01-01']), freq=freq) self.start_time_train = start_time_train @@ -185,10 +187,14 @@ def __getitem__(self, index: int, train: bool = True) \ # In case of prediction, the targets are not provided targets = self.Y if self.only_has_past_targets: - targets_future = None + future_targets = None else: - targets_future = targets[index + 1: index + self.n_prediction_steps + 1] - targets_future = torch.from_numpy(targets_future) + future_targets = targets[index + 1: index + self.n_prediction_steps + 1] + future_targets = torch.from_numpy(future_targets) + future_targets = { + 'future_targets': future_targets, + 'future_observed_targets': self.observed_target[index + 1: index + self.n_prediction_steps + 1] + } if isinstance(past_features, np.ndarray): past_features = torch.from_numpy(past_features) @@ -207,8 +213,8 @@ def __getitem__(self, index: int, train: bool = True) \ "future_features": future_features, "static_features": self.static_features, "mase_coefficient": self.mase_coefficient, - 'past_observed_values': past_observed_values, - 'decoder_lengths': None if targets_future is None else targets_future.shape[0]}, targets_future + 'past_observed_targets': self.observed_target[:index + 1], + 'decoder_lengths': None if future_targets is None else future_targets.shape[0]}, future_targets def __len__(self) -> int: return self.Y.shape[0] if self.only_has_past_targets else self.Y.shape[0] - self.n_prediction_steps From 37dd821aa54ae54d6e1096c14f3c69dcee52e100 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 20 Apr 2022 17:13:15 +0200 Subject: [PATCH 214/347] stucture adjustment --- autoPyTorch/datasets/time_series_dataset.py | 136 ++---------------- .../preprocessing/target_preprocessing.py | 32 +++++ .../targets_preprocessing/__init__.py | 0 .../ForecastingTargetImputer.py | 83 +++++++++++ .../forecasting_target_imputation/__init__.py | 0 .../TargetMaxAbsScaler.py | 3 +- .../TargetMeanAbsScaler.py | 3 +- .../TargetMinMaxScaler.py | 3 +- .../TargetNoScaler.py | 3 +- .../TargetStandardScaler.py | 3 +- .../forecasting_target_scaling/__init__.py | 4 +- .../base_target_scaler.py | 9 +- .../forecasting_target_scaling/utils.py | 0 .../setup/network/forecasting_architecture.py | 7 +- .../setup/network/forecasting_network.py | 9 +- .../trainer/forecasting_trainer/__init__.py | 8 +- .../forecasting_base_trainer.py | 8 +- .../pipeline/time_series_forecasting.py | 2 +- 18 files changed, 148 insertions(+), 165 deletions(-) create mode 100644 autoPyTorch/pipeline/components/preprocessing/target_preprocessing.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/__init__.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_imputation/ForecastingTargetImputer.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_imputation/__init__.py rename autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/{ => targets_preprocessing}/forecasting_target_scaling/TargetMaxAbsScaler.py (83%) rename autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/{ => 
targets_preprocessing}/forecasting_target_scaling/TargetMeanAbsScaler.py (84%) rename autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/{ => targets_preprocessing}/forecasting_target_scaling/TargetMinMaxScaler.py (83%) rename autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/{ => targets_preprocessing}/forecasting_target_scaling/TargetNoScaler.py (83%) rename autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/{ => targets_preprocessing}/forecasting_target_scaling/TargetStandardScaler.py (84%) rename autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/{ => targets_preprocessing}/forecasting_target_scaling/__init__.py (97%) rename autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/{ => targets_preprocessing}/forecasting_target_scaling/base_target_scaler.py (89%) rename autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/{ => targets_preprocessing}/forecasting_target_scaling/utils.py (100%) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 737ca796c..844460bb5 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -780,7 +780,6 @@ def make_sequences_datasets(self, """ sequence_datasets = [] - idx_start_test = 0 y_group = Y.groupby(Y.index) if X is not None: @@ -822,27 +821,18 @@ def make_sequences_datasets(self, return sequence_datasets, train_tensors, test_tensors - def replace_data(self, X_train: BaseDatasetInputType, X_test: Optional[BaseDatasetInputType]) -> 'BaseDataset': + def replace_data(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame]) -> 'BaseDataset': super(TimeSeriesForecastingDataset, self).replace_data(X_train=X_train, X_test=X_test) - self.update_tensors_seqs(X_train, self.sequence_lengths_train + self.n_prediction_steps, is_train=True) if X_test is not None: - self.update_tensors_seqs(X_test, self.sequence_lengths_tests, is_train=False) - return self + X_test_group = X_test.groupby(X_test.index) + for seq, x in zip(self.datasets, X_train.groupby(X_train.index)): + ser_id = x[0] + x_ser = x[1].transform(np.array).values + seq.X = x_ser + if X_test is not None: + seq.X_test = X_test_group.get_group(ser_id).transform(np.array).values - def update_tensors_seqs(self, X: np.ndarray, sequence_lengths, is_train=True): - if X.size == 0: - return - idx_start = 0 - if is_train: - for seq, seq_length in zip(self.datasets, sequence_lengths): - idx_end = idx_start + seq_length - seq.X = X[idx_start: idx_end] - idx_start = idx_end - else: - for seq, seq_length in zip(self.datasets, sequence_lengths): - idx_end = idx_start + seq_length - seq.X_test = X[idx_start: idx_end] - idx_start = idx_end + return self def update_transform(self, transform: Optional[torchvision.transforms.Compose], train: bool = True, @@ -952,7 +942,10 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> 'time_feature_transform': self.time_feature_transform, 'time_feature_names': self.time_feature_names, 'future_feature_shapes': self.future_feature_shapes, - 'uni_variant': self.is_uni_variant}) + 'uni_variant': self.is_uni_variant, + 'targets_have_missing_values': self.train_tensors[1].isnull().values.any(), + 'features_have_missing_values': False if self.train_tensors[0] is None + else self.train_tensors[0].isnull().values.any()}) return dataset_properties def create_cross_val_splits( @@ -1101,106 +1094,3 @@ def generate_test_seqs(self) -> 
List[TimeSeriesSequence]: test_seq.is_test_set = True test_seq.only_has_past_targets = True return test_sets - - -def _check_time_series_forecasting_inputs(train: np.ndarray, - val: Optional[np.ndarray] = None) -> None: - if train.ndim != 3 or any(isinstance(i, (list, np.ndarray)) for i in train): - raise ValueError( - "The training data for time series forecasting has to be a three-dimensional tensor of shape PxLxM. or a" - "nested list") - if val is not None: - if val.ndim != 3 or any(isinstance(i, (list, np.ndarray)) for i in val): - raise ValueError( - "The validation data for time series forecasting " - "has to be a three-dimensional tensor of shape PxLxM or a nested list.") - - -class TimeSeriesDataset(BaseDataset): - """ - Common dataset for time series classification and regression data - Args: - X (np.ndarray): input training data. - Y (Union[np.ndarray, pd.Series]): training data targets. - X_test (Optional[np.ndarray]): input testing data. - Y_test (Optional[Union[np.ndarray, pd.DataFrame]]): testing data targets - resampling_strategy (Union[CrossValTypes, HoldoutValTypes]), - (default=HoldoutValTypes.holdout_validation): - strategy to split the training data. - resampling_strategy_args (Optional[Dict[str, Any]]): arguments - required for the chosen resampling strategy. If None, uses - the default values provided in DEFAULT_RESAMPLING_PARAMETERS - in ```datasets/resampling_strategy.py```. - shuffle: Whether to shuffle the data before performing splits - seed (int), (default=1): seed to be used for reproducibility. - train_transforms (Optional[torchvision.transforms.Compose]): - Additional Transforms to be applied to the training data. - val_transforms (Optional[torchvision.transforms.Compose]): - Additional Transforms to be applied to the validation/test data. - - Notes: Support for Numpy Arrays is missing Strings. - - """ - - def __init__(self, - train: TIME_SERIES_CLASSIFICATION_INPUT, - val: Optional[TIME_SERIES_CLASSIFICATION_INPUT] = None): - _check_time_series_inputs(train=train, - val=val, - task_type="time_series_classification") - super().__init__(train_tensors=train, val_tensors=val, shuffle=True) - self.cross_validators = CrossValFuncs.get_cross_validators( - CrossValTypes.stratified_k_fold_cross_validation, - CrossValTypes.k_fold_cross_validation, - CrossValTypes.shuffle_split_cross_validation, - CrossValTypes.stratified_shuffle_split_cross_validation - ) - self.holdout_validators = HoldOutFuncs.get_holdout_validators( - HoldoutValTypes.holdout_validation, - HoldoutValTypes.stratified_holdout_validation - ) - - -class TimeSeriesRegressionDataset(BaseDataset): - def __init__(self, train: Tuple[np.ndarray, np.ndarray], val: Optional[Tuple[np.ndarray, np.ndarray]] = None): - _check_time_series_inputs(train=train, - val=val, - task_type="time_series_regression") - super().__init__(train_tensors=train, val_tensors=val, shuffle=True) - self.cross_validators = CrossValFuncs.get_cross_validators( - CrossValTypes.k_fold_cross_validation, - CrossValTypes.shuffle_split_cross_validation - ) - self.holdout_validators = HoldOutFuncs.get_holdout_validators( - HoldoutValTypes.holdout_validation - ) - - -def _check_time_series_inputs(task_type: str, - train: Union[TIME_SERIES_CLASSIFICATION_INPUT, TIME_SERIES_REGRESSION_INPUT], - val: Optional[ - Union[TIME_SERIES_CLASSIFICATION_INPUT, TIME_SERIES_REGRESSION_INPUT]] = None - ) -> None: - if len(train) != 2: - raise ValueError(f"There must be exactly two training tensors for {task_type}. 
" - f"The first one containing the data and the second one containing the targets.") - if train[0].ndim != 3: - raise ValueError( - f"The training data for {task_type} has to be a three-dimensional tensor of shape NxSxM.") - if train[1].ndim != 1: - raise ValueError( - f"The training targets for {task_type} have to be of shape N." - ) - if val is not None: - if len(val) != 2: - raise ValueError( - f"There must be exactly two validation tensors for{task_type}. " - f"The first one containing the data and the second one containing the targets.") - if val[0].ndim != 3: - raise ValueError( - f"The validation data for {task_type} has to be a " - f"three-dimensional tensor of shape NxSxM.") - if val[0].ndim != 1: - raise ValueError( - f"The validation targets for {task_type} have to be of shape N." - ) diff --git a/autoPyTorch/pipeline/components/preprocessing/target_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/target_preprocessing.py new file mode 100644 index 000000000..e68d991c0 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/target_preprocessing.py @@ -0,0 +1,32 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +import pandas as pd + +from scipy.sparse import spmatrix + +import torch + +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent +from autoPyTorch.utils.common import FitRequirement +from autoPyTorch.pipeline.components.preprocessing.base_preprocessing import autoPyTorchPreprocessingComponent + + +class autoPyTorchTargetPreprocessingComponent(autoPyTorchPreprocessingComponent): + """ + Provides abstract interface for preprocessing algorithms in AutoPyTorch. 
+ """ + def __init__(self) -> None: + autoPyTorchComponent.__init__() + self.add_fit_requirements([ + FitRequirement('y_train', + (pd.DataFrame, ), + user_defined=True, dataset_property=False), + FitRequirement('backend', + (Backend, ), + user_defined=True, dataset_property=False)]) diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_imputation/ForecastingTargetImputer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_imputation/ForecastingTargetImputer.py new file mode 100644 index 000000000..9bf4f5930 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_imputation/ForecastingTargetImputer.py @@ -0,0 +1,83 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +import pandas as pd + +from scipy.sparse import spmatrix + +import torch + +from sktime.transformations.series.impute import Imputer +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent +from autoPyTorch.utils.common import FitRequirement + + +class ForecastingTargetImputer(autoPyTorchComponent): + """ + Forecasting target imputor + + Attributes: + random_state (Optional[np.random.RandomState]): + The random state to use for the imputer. + numerical_strategy (str: default='mean'): + The strategy to use for imputing numerical columns. + Can be one of ['most_frequent', 'constant_!missing!'] + """ + + def __init__( + self, + random_state: Optional[np.random.RandomState] = None, + impution_strategy: str = 'mean', + ): + super().__init__() + self.random_state = random_state + self.inputer = Imputer(method=impution_strategy, random_state=self.random_state, value=0., ) + + self.add_fit_requirements([ + FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), + FitRequirement('y_train', (pd.DataFrame, ), user_defined=True, + dataset_property=False)]) + + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> "ForecastingTargetImputer": + """ + fits the target inputor based on the given fit dictionary 'X'. + + Args: + X (Dict[str, Any]): + The fit dictionary + y (Optional[Any]): + Not Used -- to comply with API + + Returns: + self: + returns an instance of self. + """ + self.check_requirements(X, y) + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds self into the 'X' dictionary and returns it. 
+ Args: + X (Dict[str, Any]): 'X' dictionary + + Returns: + (Dict[str, Any]): the updated 'X' dictionary + """ + if X['dataset_properties']['is_small_preprocess']: + if 'X_train' in X: + X_train = X['X_train'] + else: + # Incorporate the transform to the dataset + X_train = X['backend'].load_datamanager().train_tensors[0] + + X['X_train'] = preprocess(dataset=X_train, transforms=transforms) + X.update({'y_train': self.inputer.transform(X['y_train'])}) + return X + + diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_imputation/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_imputation/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMaxAbsScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMaxAbsScaler.py similarity index 83% rename from autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMaxAbsScaler.py rename to autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMaxAbsScaler.py index 1751fce32..7e6f3e250 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMaxAbsScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMaxAbsScaler.py @@ -1,7 +1,6 @@ from typing import Any, Dict, Optional, Union -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.\ - forecasting_target_scaling.base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler class TargetMaxAbsScaler(BaseTargetScaler): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMeanAbsScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMeanAbsScaler.py similarity index 84% rename from autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMeanAbsScaler.py rename to autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMeanAbsScaler.py index 51de0c9a7..75f264fe9 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMeanAbsScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMeanAbsScaler.py @@ -1,7 +1,6 @@ from typing import Any, Dict, Optional, Union -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.\ - forecasting_target_scaling.base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler class TargetMeanAbsScaler(BaseTargetScaler): diff --git 
a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMinMaxScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMinMaxScaler.py similarity index 83% rename from autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMinMaxScaler.py rename to autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMinMaxScaler.py index 1aaf95762..9b1267680 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetMinMaxScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMinMaxScaler.py @@ -1,7 +1,6 @@ from typing import Any, Dict, Optional, Union -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.\ - forecasting_target_scaling.base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler class TargetMinMaxScaler(BaseTargetScaler): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetNoScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetNoScaler.py similarity index 83% rename from autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetNoScaler.py rename to autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetNoScaler.py index 6eb6332f6..4d80644e9 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetNoScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetNoScaler.py @@ -1,7 +1,6 @@ from typing import Any, Dict, Optional, Union -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.\ - forecasting_target_scaling.base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler class TargetNoScaler(BaseTargetScaler): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetStandardScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetStandardScaler.py similarity index 84% rename from autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetStandardScaler.py rename to autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetStandardScaler.py index 57dcda878..1b65affa1 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/TargetStandardScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetStandardScaler.py @@ -1,7 +1,6 @@ from typing import Any, Dict, Optional, Union -from 
autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.\ - forecasting_target_scaling.base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler class TargetStandardScaler(BaseTargetScaler): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/__init__.py similarity index 97% rename from autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/__init__.py rename to autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/__init__.py index 1dde467aa..3ce8cd4e1 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/__init__.py @@ -12,8 +12,8 @@ autoPyTorchComponent, find_components, ) -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling.\ - base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.\ + forecasting_target_scaling.base_target_scaler import BaseTargetScaler scaling_directory = os.path.split(__file__)[0] _scalers = find_components(__package__, diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/base_target_scaler.py similarity index 89% rename from autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py rename to autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/base_target_scaler.py index 45512296a..27c0d9c7f 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/base_target_scaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/base_target_scaler.py @@ -1,19 +1,16 @@ -from typing import Any, Dict, List, Optional, Union, Tuple +from typing import Any, Dict, Optional, Union import numpy as np -from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.pipeline import Pipeline #from sktime.transformations.panel.compose import ColumnTransformer -from sklearn.compose import ColumnTransformer import torch -from autoPyTorch.utils.common import FitRequirement, subsampler from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( autoPyTorchTimeSeriesPreprocessingComponent ) -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling\ - .utils import TargetScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import TargetScaler class BaseTargetScaler(autoPyTorchTimeSeriesPreprocessingComponent): diff --git 
a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/utils.py similarity index 100% rename from autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/forecasting_target_scaling/utils.py rename to autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/utils.py diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 27a883a79..9b1880a8a 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -1,6 +1,4 @@ -from collections import OrderedDict -from typing import Any, Dict, Optional, Union, Tuple, List -from enum import Enum +from typing import Dict, Optional, Union, Tuple, List from abc import abstractmethod @@ -13,8 +11,7 @@ TransformedDistribution, ) -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. \ - base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( EncoderNetwork, diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 9fe307221..6be8364e5 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -1,9 +1,6 @@ -from collections import OrderedDict -from typing import Any, Dict, Optional, Union, Tuple, List, Iterable +from typing import Any, Dict, Optional, Iterable from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter -from ConfigSpace.conditions import EqualsCondition import numpy as np @@ -12,8 +9,7 @@ from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. 
\ - base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( EncoderBlockInfo, ) @@ -23,7 +19,6 @@ from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent -from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent from autoPyTorch.pipeline.components.setup.network.forecasting_architecture import ( ForecastingNet, diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index 42f7b5095..11fd2cccd 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -1,7 +1,7 @@ import collections import os -from typing import Any, Dict, List, Optional, Tuple, cast +from typing import Dict, List, Optional from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer.forecasting_base_trainer import ( ForecastingBaseTrainerComponent, @@ -16,11 +16,9 @@ ) from autoPyTorch.pipeline.components.training.trainer.base_trainer import BudgetTracker -from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary -from autoPyTorch.pipeline.components.training.losses import get_loss +from autoPyTorch.utils.common import FitRequirement from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. \ - base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler from autoPyTorch.utils.common import get_device_from_fit_dictionary from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index ff80e500c..f19d59c3c 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -1,23 +1,19 @@ from abc import ABC from typing import Any, Dict, List, Optional, Tuple, Type, Union -import warnings import numpy as np import pandas as pd import torch -import torch.nn.functional as F from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler from torch.utils.tensorboard.writer import SummaryWriter from autoPyTorch.constants import REGRESSION_TASKS, FORECASTING_TASKS -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. \ - base_target_scaler import BaseTargetScaler -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling. 
\ - TargetNoScaler import TargetNoScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling.TargetNoScaler import TargetNoScaler from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNet, ForecastingDeepARNet, \ NBEATSNet, ForecastingSeq2SeqNet diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 1ba8f38eb..f22661a67 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -32,7 +32,7 @@ from autoPyTorch.pipeline.components.setup.network_initializer import ( NetworkInitializerChoice ) -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.forecasting_target_scaling import \ +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import \ TargetScalerChoice from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice from autoPyTorch.pipeline.components.setup.forecasting_training_loss import ForecastingLossChoices From 1a6e19d2663855d4c5f0b2c44b5380a7c7956c8b Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 22 Apr 2022 17:39:41 +0200 Subject: [PATCH 215/347] refactory ts tasks and preprocessing --- autoPyTorch/api/time_series_classification.py | 265 ------------------ autoPyTorch/api/time_series_regression.py | 251 ----------------- autoPyTorch/data/time_series_validator.py | 54 ---- .../preprocessing/target_preprocessing.py | 32 --- .../targets_preprocessing/__init__.py | 0 .../ForecastingTargetImputer.py | 83 ------ .../forecasting_target_imputation/__init__.py | 0 .../TargetMaxAbsScaler.py | 2 +- .../TargetMeanAbsScaler.py | 2 +- .../TargetMinMaxScaler.py | 2 +- .../TargetNoScaler.py | 2 +- .../TargetStandardScaler.py | 2 +- .../forecasting_target_scaling/__init__.py | 8 +- .../base_target_scaler.py | 9 +- .../forecasting_target_scaling/utils.py | 0 .../test_time_series_feature_validator.py | 86 ------ 16 files changed, 12 insertions(+), 786 deletions(-) delete mode 100644 autoPyTorch/api/time_series_classification.py delete mode 100644 autoPyTorch/api/time_series_regression.py delete mode 100644 autoPyTorch/data/time_series_validator.py delete mode 100644 autoPyTorch/pipeline/components/preprocessing/target_preprocessing.py delete mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/__init__.py delete mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_imputation/ForecastingTargetImputer.py delete mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_imputation/__init__.py rename autoPyTorch/pipeline/components/{preprocessing/time_series_preprocessing/targets_preprocessing => setup}/forecasting_target_scaling/TargetMaxAbsScaler.py (72%) rename autoPyTorch/pipeline/components/{preprocessing/time_series_preprocessing/targets_preprocessing => setup}/forecasting_target_scaling/TargetMeanAbsScaler.py (72%) rename autoPyTorch/pipeline/components/{preprocessing/time_series_preprocessing/targets_preprocessing => 
setup}/forecasting_target_scaling/TargetMinMaxScaler.py (72%) rename autoPyTorch/pipeline/components/{preprocessing/time_series_preprocessing/targets_preprocessing => setup}/forecasting_target_scaling/TargetNoScaler.py (72%) rename autoPyTorch/pipeline/components/{preprocessing/time_series_preprocessing/targets_preprocessing => setup}/forecasting_target_scaling/TargetStandardScaler.py (73%) rename autoPyTorch/pipeline/components/{preprocessing/time_series_preprocessing/targets_preprocessing => setup}/forecasting_target_scaling/__init__.py (92%) rename autoPyTorch/pipeline/components/{preprocessing/time_series_preprocessing/targets_preprocessing => setup}/forecasting_target_scaling/base_target_scaler.py (85%) rename autoPyTorch/pipeline/components/{preprocessing/time_series_preprocessing/targets_preprocessing => setup}/forecasting_target_scaling/utils.py (100%) delete mode 100644 test/test_data/test_time_series_feature_validator.py diff --git a/autoPyTorch/api/time_series_classification.py b/autoPyTorch/api/time_series_classification.py deleted file mode 100644 index caa5a5d0a..000000000 --- a/autoPyTorch/api/time_series_classification.py +++ /dev/null @@ -1,265 +0,0 @@ -#TODO Note: This API is still under construction! -import os -import uuid -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np - -import pandas as pd - -from autoPyTorch.api.base_task import BaseTask -from autoPyTorch.constants import ( - TASK_TYPES_TO_STRING, - TIMESERIES_CLASSIFICATION -) -from autoPyTorch.data.time_series_validator import TimeSeriesInputValidator -from autoPyTorch.datasets.base_dataset import BaseDataset -from autoPyTorch.datasets.resampling_strategy import ( - CrossValTypes, - HoldoutValTypes, -) -from autoPyTorch.datasets.time_series_dataset import TimeSeriesDataset -from autoPyTorch.pipeline.time_series_classification import TimeSeriesClassificationPipeline -from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates - - -class TimeSeriesClassificationTask(BaseTask): - """ - Time Series Classification API to the pipelines. - Args: - seed (int): seed to be used for reproducibility. - n_jobs (int), (default=1): number of consecutive processes to spawn. - logging_config (Optional[Dict]): specifies configuration - for logging, if None, it is loaded from the logging.yaml - ensemble_size (int), (default=50): Number of models added to the ensemble built by - Ensemble selection from libraries of models. - Models are drawn with replacement. - ensemble_nbest (int), (default=50): only consider the ensemble_nbest - models to build the ensemble - max_models_on_disc (int), (default=50): maximum number of models saved to disc. - Also, controls the size of the ensemble as any additional models will be deleted. - Must be greater than or equal to 1. - temporary_directory (str): folder to store configuration output and log file - output_directory (str): folder to store predictions for optional test set - delete_tmp_folder_after_terminate (bool): determines whether to delete the temporary directory, - when finished - include_components (Optional[Dict]): If None, all possible components are used. - Otherwise specifies set of components to use. - exclude_components (Optional[Dict]): If None, all possible components are used. - Otherwise specifies set of components not to use. 
Incompatible with include - components - """ - def __init__( - self, - seed: int = 1, - n_jobs: int = 1, - logging_config: Optional[Dict] = None, - ensemble_size: int = 50, - ensemble_nbest: int = 50, - max_models_on_disc: int = 50, - temporary_directory: Optional[str] = None, - output_directory: Optional[str] = None, - delete_tmp_folder_after_terminate: bool = True, - delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, - resampling_strategy_args: Optional[Dict[str, Any]] = None, - backend: Optional[Backend] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None - ): - super().__init__( - seed=seed, - n_jobs=n_jobs, - logging_config=logging_config, - ensemble_size=ensemble_size, - ensemble_nbest=ensemble_nbest, - max_models_on_disc=max_models_on_disc, - temporary_directory=temporary_directory, - output_directory=output_directory, - delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate, - delete_output_folder_after_terminate=delete_output_folder_after_terminate, - include_components=include_components, - exclude_components=exclude_components, - backend=backend, - resampling_strategy=resampling_strategy, - resampling_strategy_args=resampling_strategy_args, - search_space_updates=search_space_updates, - task_type=TASK_TYPES_TO_STRING[TIMESERIES_CLASSIFICATION], - ) - - def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]: - if not isinstance(dataset, TimeSeriesDataset): - raise ValueError("Dataset is incompatible for the given task,: {}".format( - type(dataset) - )) - return dataset.get_required_dataset_info() - - def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TimeSeriesClassificationPipeline: - return TimeSeriesClassificationPipeline(dataset_properties=dataset_properties) - - def search( - self, - optimize_metric: str, - X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - dataset_name: Optional[str] = None, - budget_type: Optional[str] = None, - budget: Optional[float] = None, - total_walltime_limit: int = 100, - func_eval_time_limit: int = 60, - traditional_per_total_budget: float = 0., - memory_limit: Optional[int] = 4096, - smac_scenario_args: Optional[Dict[str, Any]] = None, - get_smac_object_callback: Optional[Callable] = None, - all_supported_metrics: bool = True, - precision: int = 32, - disable_file_output: List = [], - load_models: bool = True, - ) -> 'BaseTask': - """ - Search for the best pipeline configuration for the given dataset. - - Fit both optimizes the machine learning models and builds an ensemble out of them. - To disable ensembling, set ensemble_size==0. - using the optimizer. - Args: - X_train, y_train, X_test, y_test: Union[np.ndarray, List, pd.DataFrame] - A pair of features (X_train) and targets (y_train) used to fit a - pipeline. Additionally, a holdout of this pairs (X_test, y_test) can - be provided to track the generalization performance of each stage. - optimize_metric (str): name of the metric that is used to - evaluate a pipeline. - budget_type (Optional[str]): - Type of budget to be used when fitting the pipeline. - Either 'epochs' or 'runtime'. 
If not provided, uses - the default in the pipeline config ('epochs') - budget (Optional[float]): - Budget to fit a single run of the pipeline. If not - provided, uses the default in the pipeline config - total_walltime_limit (int), (default=100): Time limit - in seconds for the search of appropriate models. - By increasing this value, autopytorch has a higher - chance of finding better models. - func_eval_time_limit (int), (default=60): Time limit - for a single call to the machine learning model. - Model fitting will be terminated if the machine - learning algorithm runs over the time limit. Set - this value high enough so that typical machine - learning algorithms can be fit on the training - data. - traditional_per_total_budget (float), (default=0.1): - Percent of total walltime to be allocated for - running traditional classifiers. - memory_limit (Optional[int]), (default=4096): Memory - limit in MB for the machine learning algorithm. autopytorch - will stop fitting the machine learning algorithm if it tries - to allocate more than memory_limit MB. If None is provided, - no memory limit is set. In case of multi-processing, memory_limit - will be per job. This memory limit also applies to the ensemble - creation process. - smac_scenario_args (Optional[Dict]): Additional arguments inserted - into the scenario of SMAC. See the - [SMAC documentation] (https://automl.github.io/SMAC3/master/options.html?highlight=scenario#scenario) - get_smac_object_callback (Optional[Callable]): Callback function - to create an object of class - [smac.optimizer.smbo.SMBO](https://automl.github.io/SMAC3/master/apidoc/smac.optimizer.smbo.html). - The function must accept the arguments scenario_dict, - instances, num_params, runhistory, seed and ta. This is - an advanced feature. Use only if you are familiar with - [SMAC](https://automl.github.io/SMAC3/master/index.html). - all_supported_metrics (bool), (default=True): if True, all - metrics supporting current task will be calculated - for each pipeline and results will be available via cv_results - precision (int), (default=32): Numeric precision used when loading - ensemble data. Can be either '16', '32' or '64'. - disable_file_output (Union[bool, List]): - load_models (bool), (default=True): Whether to load the - models after fitting AutoPyTorch. - - Returns: - self - - """ - if dataset_name is None: - dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) - - # we have to create a logger for at this point for the validator - self._logger = self._get_logger(dataset_name) - - # Create a validator object to make sure that the data provided by - # the user matches the autopytorch requirements - self.InputValidator = TimeSeriesInputValidator( - is_classification=True, - logger_port=self._logger_port, - ) - - # Fit a input validator to check the provided data - # Also, an encoder is fit to both train and test data, - # to prevent unseen categories during inference - self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) - - self.dataset = TimeSeriesDataset( - X=X_train, Y=y_train, - X_test=X_test, Y_test=y_test, - validator=self.InputValidator, - resampling_strategy=self.resampling_strategy, - resampling_strategy_args=self.resampling_strategy_args, - ) - - if traditional_per_total_budget > 0.: - self._logger.warning("Time series classification for now does not support traditional classifiers. " - "Setting traditional_per_total_budget to 0.") - traditional_per_total_budget = 0. 
- - return self._search( - dataset=self.dataset, - optimize_metric=optimize_metric, - budget_type=budget_type, - budget=budget, - total_walltime_limit=total_walltime_limit, - func_eval_time_limit=func_eval_time_limit, - traditional_per_total_budget=traditional_per_total_budget, - memory_limit=memory_limit, - smac_scenario_args=smac_scenario_args, - get_smac_object_callback=get_smac_object_callback, - all_supported_metrics=all_supported_metrics, - precision=precision, - disable_file_output=disable_file_output, - load_models=load_models, - ) - - def predict( - self, - X_test: np.ndarray, - batch_size: Optional[int] = None, - n_jobs: int = 1 - ) -> np.ndarray: - if self.InputValidator is None or not self.InputValidator._is_fitted: - raise ValueError("predict() is only supported after calling search. Kindly call first " - "the estimator fit() method.") - - X_test = self.InputValidator.feature_validator.transform(X_test) - predicted_probabilities = super().predict(X_test, batch_size=batch_size, - n_jobs=n_jobs) - - if self.InputValidator.target_validator.is_single_column_target(): - predicted_indexes = np.argmax(predicted_probabilities, axis=1) - else: - predicted_indexes = (predicted_probabilities > 0.5).astype(int) - - # Allow to predict in the original domain -- that is, the user is not interested - # in our encoded values - return self.InputValidator.target_validator.inverse_transform(predicted_indexes) - - def predict_proba(self, - X_test: Union[np.ndarray, pd.DataFrame, List], - batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray: - if self.InputValidator is None or not self.InputValidator._is_fitted: - raise ValueError("predict() is only supported after calling search. Kindly call first " - "the estimator fit() method.") - X_test = self.InputValidator.feature_validator.transform(X_test) - return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) diff --git a/autoPyTorch/api/time_series_regression.py b/autoPyTorch/api/time_series_regression.py deleted file mode 100644 index aefaed97b..000000000 --- a/autoPyTorch/api/time_series_regression.py +++ /dev/null @@ -1,251 +0,0 @@ -#TODO Note: This API is still under construction! -import os -import uuid -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np - -import pandas as pd - -from autoPyTorch.api.base_task import BaseTask -from autoPyTorch.constants import ( - TASK_TYPES_TO_STRING, TIMESERIES_REGRESSION -) -from autoPyTorch.data.time_series_validator import TimeSeriesInputValidator -from autoPyTorch.datasets.base_dataset import BaseDataset -from autoPyTorch.datasets.resampling_strategy import ( - CrossValTypes, - HoldoutValTypes, -) -from autoPyTorch.datasets.time_series_dataset import TimeSeriesDataset -from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline -from autoPyTorch.pipeline.time_series_regression import TimeSeriesRegressionPipeline -from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates - - -class TimeSeriesRegressionTask(BaseTask): - """ - Time Series Regression API to the pipelines. - Args: - seed (int): seed to be used for reproducibility. - n_jobs (int), (default=1): number of consecutive processes to spawn. 
- logging_config (Optional[Dict]): specifies configuration - for logging, if None, it is loaded from the logging.yaml - ensemble_size (int), (default=50): Number of models added to the ensemble built by - Ensemble selection from libraries of models. - Models are drawn with replacement. - ensemble_nbest (int), (default=50): only consider the ensemble_nbest - models to build the ensemble - max_models_on_disc (int), (default=50): maximum number of models saved to disc. - Also, controls the size of the ensemble as any additional models will be deleted. - Must be greater than or equal to 1. - temporary_directory (str): folder to store configuration output and log file - output_directory (str): folder to store predictions for optional test set - delete_tmp_folder_after_terminate (bool): determines whether to delete the temporary directory, - when finished - include_components (Optional[Dict]): If None, all possible components are used. - Otherwise specifies set of components to use. - exclude_components (Optional[Dict]): If None, all possible components are used. - Otherwise specifies set of components not to use. Incompatible with include - components - """ - - def __init__( - self, - seed: int = 1, - n_jobs: int = 1, - logging_config: Optional[Dict] = None, - ensemble_size: int = 50, - ensemble_nbest: int = 50, - max_models_on_disc: int = 50, - temporary_directory: Optional[str] = None, - output_directory: Optional[str] = None, - delete_tmp_folder_after_terminate: bool = True, - delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, - resampling_strategy_args: Optional[Dict[str, Any]] = None, - backend: Optional[Backend] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None - ): - super().__init__( - seed=seed, - n_jobs=n_jobs, - logging_config=logging_config, - ensemble_size=ensemble_size, - ensemble_nbest=ensemble_nbest, - max_models_on_disc=max_models_on_disc, - temporary_directory=temporary_directory, - output_directory=output_directory, - delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate, - delete_output_folder_after_terminate=delete_output_folder_after_terminate, - include_components=include_components, - exclude_components=exclude_components, - backend=backend, - resampling_strategy=resampling_strategy, - resampling_strategy_args=resampling_strategy_args, - search_space_updates=search_space_updates, - task_type=TASK_TYPES_TO_STRING[TIMESERIES_REGRESSION], - ) - - def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]: - if not isinstance(dataset, TimeSeriesDataset): - raise ValueError("Dataset is incompatible for the given task,: {}".format( - type(dataset) - )) - return dataset.get_required_dataset_info() - - def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TimeSeriesRegressionPipeline: - return TimeSeriesRegressionPipeline(dataset_properties=dataset_properties) - - def search(self, - optimize_metric: str, - X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - dataset_name: Optional[str] = None, - budget_type: Optional[str] = None, - budget: Optional[float] = None, - total_walltime_limit: int = 100, - 
func_eval_time_limit: int = 60, - traditional_per_total_budget: float = 0., - memory_limit: Optional[int] = 4096, - smac_scenario_args: Optional[Dict[str, Any]] = None, - get_smac_object_callback: Optional[Callable] = None, - all_supported_metrics: bool = True, - precision: int = 32, - disable_file_output: List = [], - load_models: bool = True, - ) -> 'BaseTask': - """ - Search for the best pipeline configuration for the given dataset. - - Fit both optimizes the machine learning models and builds an ensemble out of them. - To disable ensembling, set ensemble_size==0. - using the optimizer. - Args: - X_train, y_train, X_test, y_test: Union[np.ndarray, List, pd.DataFrame] - A pair of features (X_train) and targets (y_train) used to fit a - pipeline. Additionally, a holdout of this pairs (X_test, y_test) can - be provided to track the generalization performance of each stage. - optimize_metric (str): name of the metric that is used to - evaluate a pipeline. - budget_type (Optional[str]): - Type of budget to be used when fitting the pipeline. - Either 'epochs' or 'runtime'. If not provided, uses - the default in the pipeline config ('epochs') - budget (Optional[float]): - Budget to fit a single run of the pipeline. If not - provided, uses the default in the pipeline config - total_walltime_limit (int), (default=100): Time limit - in seconds for the search of appropriate models. - By increasing this value, autopytorch has a higher - chance of finding better models. - func_eval_time_limit (int), (default=60): Time limit - for a single call to the machine learning model. - Model fitting will be terminated if the machine - learning algorithm runs over the time limit. Set - this value high enough so that typical machine - learning algorithms can be fit on the training - data. - traditional_per_total_budget (float), (default=0.1): - Percent of total walltime to be allocated for - running traditional classifiers. - memory_limit (Optional[int]), (default=4096): Memory - limit in MB for the machine learning algorithm. autopytorch - will stop fitting the machine learning algorithm if it tries - to allocate more than memory_limit MB. If None is provided, - no memory limit is set. In case of multi-processing, memory_limit - will be per job. This memory limit also applies to the ensemble - creation process. - smac_scenario_args (Optional[Dict]): Additional arguments inserted - into the scenario of SMAC. See the - [SMAC documentation] (https://automl.github.io/SMAC3/master/options.html?highlight=scenario#scenario) - get_smac_object_callback (Optional[Callable]): Callback function - to create an object of class - [smac.optimizer.smbo.SMBO](https://automl.github.io/SMAC3/master/apidoc/smac.optimizer.smbo.html). - The function must accept the arguments scenario_dict, - instances, num_params, runhistory, seed and ta. This is - an advanced feature. Use only if you are familiar with - [SMAC](https://automl.github.io/SMAC3/master/index.html). - all_supported_metrics (bool), (default=True): if True, all - metrics supporting current task will be calculated - for each pipeline and results will be available via cv_results - precision (int), (default=32): Numeric precision used when loading - ensemble data. Can be either '16', '32' or '64'. - disable_file_output (Union[bool, List]): - load_models (bool), (default=True): Whether to load the - models after fitting AutoPyTorch. 
- - Returns: - self - - """ - if dataset_name is None: - dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) - - # we have to create a logger for at this point for the validator - self._logger = self._get_logger(dataset_name) - - # Create a validator object to make sure that the data provided by - # the user matches the autopytorch requirements - self.InputValidator = TimeSeriesInputValidator( - is_classification=False, - logger_port=self._logger_port, - ) - - # Fit a input validator to check the provided data - # Also, an encoder is fit to both train and test data, - # to prevent unseen categories during inference - self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) - - self.dataset = TimeSeriesDataset( - X=X_train, Y=y_train, - X_test=X_test, Y_test=y_test, - validator=self.InputValidator, - resampling_strategy=self.resampling_strategy, - resampling_strategy_args=self.resampling_strategy_args, - ) - - if traditional_per_total_budget > 0.: - self._logger.warning("Time series regression for now does not support traditional classifiers. " - "Setting traditional_per_total_budget to 0.") - traditional_per_total_budget = 0. - - return self._search( - dataset=self.dataset, - optimize_metric=optimize_metric, - budget_type=budget_type, - budget=budget, - total_walltime_limit=total_walltime_limit, - func_eval_time_limit=func_eval_time_limit, - traditional_per_total_budget=traditional_per_total_budget, - memory_limit=memory_limit, - smac_scenario_args=smac_scenario_args, - get_smac_object_callback=get_smac_object_callback, - all_supported_metrics=all_supported_metrics, - precision=precision, - disable_file_output=disable_file_output, - load_models=load_models, - ) - - def predict( - self, - X_test: np.ndarray, - batch_size: Optional[int] = None, - n_jobs: int = 1 - ) -> np.ndarray: - if self.InputValidator is None or not self.InputValidator._is_fitted: - raise ValueError("predict() is only supported after calling search. Kindly call first " - "the estimator fit() method.") - - X_test = self.InputValidator.feature_validator.transform(X_test) - predicted_values = super().predict(X_test, batch_size=batch_size, - n_jobs=n_jobs) - - # Allow to predict in the original domain -- that is, the user is not interested - # in our encoded values - return self.InputValidator.target_validator.inverse_transform(predicted_values) diff --git a/autoPyTorch/data/time_series_validator.py b/autoPyTorch/data/time_series_validator.py deleted file mode 100644 index 11073363e..000000000 --- a/autoPyTorch/data/time_series_validator.py +++ /dev/null @@ -1,54 +0,0 @@ -# -*- encoding: utf-8 -*- -import logging -import typing - -from autoPyTorch.data.base_validator import BaseInputValidator -from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator -from autoPyTorch.data.time_series_target_validator import TimeSeriesTargetValidator -from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger - - -class TimeSeriesInputValidator(BaseInputValidator): - """ - Makes sure the input data complies with Auto-PyTorch requirements. - - This class also perform checks for data integrity and flags the user - via informative errors. 
- - Attributes: - is_classification (bool): - For classification task, this flag indicates that the target data - should be encoded - feature_validator (FeatureValidator): - A FeatureValidator instance used to validate and encode feature columns to match - sklearn expectations on the data - target_validator (TargetValidator): - A TargetValidator instance used to validate and encode (in case of classification) - the target values - """ - - def __init__( - self, - is_classification: bool = False, - logger_port: typing.Optional[int] = None, - ) -> None: - self.is_classification = is_classification - self.logger_port = logger_port - if self.logger_port is not None: - self.logger: typing.Union[logging.Logger, PicklableClientLogger] = get_named_client_logger( - name='Validation', - port=self.logger_port, - ) - else: - self.logger = logging.getLogger('Validation') - - self.feature_validator = TimeSeriesFeatureValidator(logger=self.logger) - self.target_validator = TimeSeriesTargetValidator( - is_classification=self.is_classification, - logger=self.logger - ) - - self._is_fitted = False - - - diff --git a/autoPyTorch/pipeline/components/preprocessing/target_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/target_preprocessing.py deleted file mode 100644 index e68d991c0..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/target_preprocessing.py +++ /dev/null @@ -1,32 +0,0 @@ -from typing import Any, Dict, Optional, Union - -from ConfigSpace.configuration_space import ConfigurationSpace - -import numpy as np - -import pandas as pd - -from scipy.sparse import spmatrix - -import torch - -from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent -from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.pipeline.components.preprocessing.base_preprocessing import autoPyTorchPreprocessingComponent - - -class autoPyTorchTargetPreprocessingComponent(autoPyTorchPreprocessingComponent): - """ - Provides abstract interface for preprocessing algorithms in AutoPyTorch. 
- """ - def __init__(self) -> None: - autoPyTorchComponent.__init__() - self.add_fit_requirements([ - FitRequirement('y_train', - (pd.DataFrame, ), - user_defined=True, dataset_property=False), - FitRequirement('backend', - (Backend, ), - user_defined=True, dataset_property=False)]) diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_imputation/ForecastingTargetImputer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_imputation/ForecastingTargetImputer.py deleted file mode 100644 index 9bf4f5930..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_imputation/ForecastingTargetImputer.py +++ /dev/null @@ -1,83 +0,0 @@ -from typing import Any, Dict, Optional, Union - -from ConfigSpace.configuration_space import ConfigurationSpace - -import numpy as np - -import pandas as pd - -from scipy.sparse import spmatrix - -import torch - -from sktime.transformations.series.impute import Imputer -from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent -from autoPyTorch.utils.common import FitRequirement - - -class ForecastingTargetImputer(autoPyTorchComponent): - """ - Forecasting target imputor - - Attributes: - random_state (Optional[np.random.RandomState]): - The random state to use for the imputer. - numerical_strategy (str: default='mean'): - The strategy to use for imputing numerical columns. - Can be one of ['most_frequent', 'constant_!missing!'] - """ - - def __init__( - self, - random_state: Optional[np.random.RandomState] = None, - impution_strategy: str = 'mean', - ): - super().__init__() - self.random_state = random_state - self.inputer = Imputer(method=impution_strategy, random_state=self.random_state, value=0., ) - - self.add_fit_requirements([ - FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), - FitRequirement('y_train', (pd.DataFrame, ), user_defined=True, - dataset_property=False)]) - - def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> "ForecastingTargetImputer": - """ - fits the target inputor based on the given fit dictionary 'X'. - - Args: - X (Dict[str, Any]): - The fit dictionary - y (Optional[Any]): - Not Used -- to comply with API - - Returns: - self: - returns an instance of self. - """ - self.check_requirements(X, y) - return self - - def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: - """ - Adds self into the 'X' dictionary and returns it. 
- Args: - X (Dict[str, Any]): 'X' dictionary - - Returns: - (Dict[str, Any]): the updated 'X' dictionary - """ - if X['dataset_properties']['is_small_preprocess']: - if 'X_train' in X: - X_train = X['X_train'] - else: - # Incorporate the transform to the dataset - X_train = X['backend'].load_datamanager().train_tensors[0] - - X['X_train'] = preprocess(dataset=X_train, transforms=transforms) - X.update({'y_train': self.inputer.transform(X['y_train'])}) - return X - - diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_imputation/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_imputation/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMaxAbsScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMaxAbsScaler.py similarity index 72% rename from autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMaxAbsScaler.py rename to autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMaxAbsScaler.py index 7e6f3e250..e79bb95e4 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMaxAbsScaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMaxAbsScaler.py @@ -1,6 +1,6 @@ from typing import Any, Dict, Optional, Union -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler class TargetMaxAbsScaler(BaseTargetScaler): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMeanAbsScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMeanAbsScaler.py similarity index 72% rename from autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMeanAbsScaler.py rename to autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMeanAbsScaler.py index 75f264fe9..cce8cbc08 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMeanAbsScaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMeanAbsScaler.py @@ -1,6 +1,6 @@ from typing import Any, Dict, Optional, Union -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler class TargetMeanAbsScaler(BaseTargetScaler): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMinMaxScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMinMaxScaler.py similarity index 72% rename from autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMinMaxScaler.py rename to 
autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMinMaxScaler.py index 9b1267680..d345d4334 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetMinMaxScaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMinMaxScaler.py @@ -1,6 +1,6 @@ from typing import Any, Dict, Optional, Union -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler class TargetMinMaxScaler(BaseTargetScaler): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetNoScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetNoScaler.py similarity index 72% rename from autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetNoScaler.py rename to autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetNoScaler.py index 4d80644e9..1c57b1ea1 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetNoScaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetNoScaler.py @@ -1,6 +1,6 @@ from typing import Any, Dict, Optional, Union -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler class TargetNoScaler(BaseTargetScaler): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetStandardScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py similarity index 73% rename from autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetStandardScaler.py rename to autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py index 1b65affa1..f077bc730 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/TargetStandardScaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py @@ -1,6 +1,6 @@ from typing import Any, Dict, Optional, Union -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler class TargetStandardScaler(BaseTargetScaler): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/__init__.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py similarity index 92% rename from autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/__init__.py rename to autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py index 3ce8cd4e1..01b7b831e 100644 --- 
a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/__init__.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py @@ -5,15 +5,13 @@ import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ - ScalerChoice +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, autoPyTorchComponent, find_components, ) -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.\ - forecasting_target_scaling.base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler scaling_directory = os.path.split(__file__)[0] _scalers = find_components(__package__, @@ -27,7 +25,7 @@ def add_scaler(scaler: BaseTargetScaler) -> None: _addons.add_component(scaler) -class TargetScalerChoice(ScalerChoice): +class TargetScalerChoice(autoPyTorchChoice): """ Allows for dynamically choosing scaling component at runtime, not """ diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/base_target_scaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py similarity index 85% rename from autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/base_target_scaler.py rename to autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py index 27c0d9c7f..dfa3a841d 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/base_target_scaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py @@ -7,13 +7,12 @@ import torch -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( - autoPyTorchTimeSeriesPreprocessingComponent -) -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import TargetScaler +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import TargetScaler -class BaseTargetScaler(autoPyTorchTimeSeriesPreprocessingComponent): + +class BaseTargetScaler(autoPyTorchComponent): def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): super().__init__() self.random_state = random_state diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/utils.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py similarity index 100% rename from autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/targets_preprocessing/forecasting_target_scaling/utils.py rename to autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py diff --git a/test/test_data/test_time_series_feature_validator.py b/test/test_data/test_time_series_feature_validator.py deleted file mode 100644 index 5bc638946..000000000 --- 
a/test/test_data/test_time_series_feature_validator.py +++ /dev/null @@ -1,86 +0,0 @@ -import numpy as np - -import pandas as pd - -import pytest - -from scipy import sparse - -from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator - - -# Fixtures to be used in this class. By default all elements have 100 datapoints -@pytest.fixture -def input_data_featuretest(request): - if request.param == 'numpy_numericalonly_nonan': - return np.array([ - [[1.0], [2.0], [3.0]], - [[-3.0], [-2.0], [-1.0]] - ]) - else: - ValueError("Unsupported indirect fixture {}".format(request.param)) - - -# Actual checks for the features -@pytest.mark.parametrize( - 'input_data_featuretest', - ( - 'numpy_numericalonly_nonan', - ), - indirect=True -) -def test_featurevalidator_supported_types(input_data_featuretest): - validator = TimeSeriesFeatureValidator() - validator.fit(input_data_featuretest, input_data_featuretest) - transformed_X = validator.transform(input_data_featuretest) - if sparse.issparse(input_data_featuretest): - assert sparse.issparse(transformed_X) - else: - assert isinstance(transformed_X, np.ndarray) - assert np.shape(input_data_featuretest) == np.shape(transformed_X) - assert np.issubdtype(transformed_X.dtype, np.number) - assert validator._is_fitted - - -def test_featurevalidator_unsupported_numpy(): - validator = TimeSeriesFeatureValidator() - - with pytest.raises(ValueError, match="Input contains NaN, infinity or a value too large *"): - validator.fit(X_train=np.array([[[1], [2], [np.nan]], [[4], [5], [6]]])) - - -def test_features_unsupported_calls_are_raised(): - """ - Makes sure we raise a proper message to the user, - when providing not supported data input or using the validator in a way that is not - expected - """ - validator = TimeSeriesFeatureValidator() - - with pytest.raises(ValueError, match="Time series train data must be given as a numpy array, but got *"): - validator.fit( - pd.DataFrame({'x': [1.0, 2.0, 3.0]}) - ) - - with pytest.raises(ValueError, match="Time series train data must be given as a numpy array, but got *"): - validator.fit( - [1.0, 2.0, 3.0] - ) - - with pytest.raises(ValueError, match="Time series train data must be given as a numpy array, but got *"): - validator.fit({'input1': 1, 'input2': 2}) - - with pytest.raises(ValueError, match="Invalid number of dimensions for time series train data *"): - validator.fit(X_train=np.array([[1, 2, 3], [4, 5, 6]])) - - with pytest.raises(ValueError, match="Invalid number of dimensions for time series test data *"): - validator.fit(X_train=np.array([[[1], [2], [3]], [[4], [5], [6]]]), - X_test=np.array([[1, 2, 3], [4, 5, 6]])) - - with pytest.raises(ValueError, match="Time series train and test data are expected to have the same shape " - "except for the batch dimension, but got *"): - validator.fit(X_train=np.array([[[1], [2], [3]], [[4], [5], [6]]]), - X_test=np.array([[[1], [2], [3], [4]], [[4], [5], [6], [7]]])) - - with pytest.raises(ValueError, match=r"Cannot call transform on a validator that is not fit"): - validator.transform(np.array([[1, 2, 3], [4, 5, 6]])) From 075c6e652b574aacc33dab4b25d49b7eedcf22ce Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 22 Apr 2022 17:40:59 +0200 Subject: [PATCH 216/347] allow nan in targets --- .../data/time_series_forecasting_validator.py | 10 +- .../data/time_series_target_validator.py | 176 +++++++++++++++++- autoPyTorch/datasets/time_series_dataset.py | 2 +- 3 files changed, 180 insertions(+), 8 deletions(-) diff --git 
a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 6dd25b0d9..2b8b3ed80 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -1,5 +1,3 @@ -from autoPyTorch.data.time_series_validator import TimeSeriesInputValidator - # -*- encoding: utf-8 -*- import logging import warnings @@ -12,6 +10,7 @@ from autoPyTorch.data.utils import DatasetCompressionSpec from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator +from autoPyTorch.data.time_series_target_validator import TimeSeriesTargetValidator class TimeSeriesForecastingInputValidator(TabularInputValidator): @@ -20,6 +19,7 @@ class TimeSeriesForecastingInputValidator(TabularInputValidator): As a time series forecasting dataset might contain several time sequence with different length, we will transform all the data to DataFrameGroupBy whereas each group represents a series """ + def __init__(self, is_classification: bool = False, logger_port: Optional[int] = None, @@ -27,6 +27,8 @@ def __init__(self, ) -> None: super(TimeSeriesForecastingInputValidator, self).__init__(is_classification, logger_port, dataset_compression) self.feature_validator = TimeSeriesFeatureValidator(logger=self.logger) + self.target_validator = TimeSeriesTargetValidator(is_classification=self.is_classification, + logger=self.logger) self._is_uni_variant = False self.known_future_features = None self.n_prediction_steps = 1 @@ -179,7 +181,7 @@ def transform( if X is None: raise ValueError('Multi Variant dataset requires X as input!') num_features = self.feature_validator.num_features - assert len(X) == len(y), "Length of features must equal to length of targets!" + assert len(X) == len(y), "Length of features must equal to length of targets!" 
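# A minimal illustrative sketch (toy names and values, not autoPyTorch API) of the idea behind the
# surrounding transform(): variable-length target sequences are flattened into one 2-D array plus a
# per-row series index before they are handed to the target validator.
import numpy as np

y = [np.array([1.0, 2.0, 3.0]), np.array([4.0, 5.0])]           # two series with different lengths
sequence_lengths = np.array([len(seq) for seq in y])            # [3, 2]
y_flat = np.concatenate([seq.reshape(-1, 1) for seq in y])      # shape (5, 1)
series_index = np.repeat(np.arange(len(y)), sequence_lengths)   # [0, 0, 0, 1, 1]
print(y_flat.ravel(), series_index)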
for seq_idx in range(num_sequences): sequence_lengths[seq_idx] = len(y[seq_idx]) @@ -198,7 +200,7 @@ def transform( y_flat[start_idx: end_idx] = np.array(y[seq_idx]).reshape([-1, num_targets]) start_idx = end_idx - y_transformed = self.target_validator.transform(y_flat) + y_transformed: np.ndarray = self.target_validator.transform(y_flat) if y_transformed.ndim == 1: y_transformed = np.expand_dims(y_transformed, -1) diff --git a/autoPyTorch/data/time_series_target_validator.py b/autoPyTorch/data/time_series_target_validator.py index 50fd9d213..5c48cade4 100644 --- a/autoPyTorch/data/time_series_target_validator.py +++ b/autoPyTorch/data/time_series_target_validator.py @@ -1,4 +1,174 @@ -from autoPyTorch.data.tabular_target_validator import TabularTargetValidator +from typing import List, Optional, Union, cast -# just define an alias for the tabular target validator -TimeSeriesTargetValidator = TabularTargetValidator +import numpy as np + +import pandas as pd +from pandas.api.types import is_numeric_dtype + +from scipy.sparse import issparse, spmatrix + +import sklearn.utils +from sklearn import preprocessing +from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError +from sklearn.utils.multiclass import type_of_target + +from autoPyTorch.data.base_target_validator import BaseTargetValidator, SupportedTargetTypes +from autoPyTorch.utils.common import ispandas +from autoPyTorch.data.tabular_target_validator import TabularTargetValidator, ArrayType + + +def _check_and_to_array(y: SupportedTargetTypes) -> ArrayType: + """ sklearn check array will make sure we have the correct numerical features for the array """ + return sklearn.utils.check_array(y, force_all_finite=False, accept_sparse='csr', ensure_2d=False) + + +def _modify_regression_target(y: ArrayType) -> ArrayType: + # Regression targets must have numbers after a decimal point. + # Ref: https://github.com/scikit-learn/scikit-learn/issues/8952 + y_min = np.abs(np.nan_to_num(y, 1e12)).min() + offset = max(y_min, 1e-13) * 1e-13 # Sufficiently small number + if y_min > 1e12: + raise ValueError( + "The minimum value for the target labels of regression tasks must be smaller than " + f"1e12 to avoid errors caused by an overflow, but got {y_min}" + ) + + # Since it is all integer, we can just add a random small number + if isinstance(y, np.ndarray): + y = y.astype(dtype=np.float64) + offset + else: + y.data = y.data.astype(dtype=np.float64) + offset + + return y + + +class TimeSeriesTargetValidator(TabularTargetValidator): + def transform(self, y: SupportedTargetTypes) -> np.ndarray: + """ + Validates and fit a categorical encoder (if needed) to the features. + The supported data types are List, numpy arrays and pandas DataFrames. 
+ + Args: + y (SupportedTargetTypes) + A set of targets that are going to be encoded if the current task + is classification + + Returns: + np.ndarray: + The transformed array + """ + if not self._is_fitted: + raise NotFittedError("Cannot call transform on a validator that is not fitted") + + # Check the data here so we catch problems on new test data + self._check_data(y) + y = self._transform_by_encoder(y) + + # When translating a dataframe to numpy, make sure we honor the ravel requirement + if y.ndim == 2 and y.shape[1] == 1: + y = np.ravel(y) + + if not self.is_classification and "continuous" not in type_of_target(np.nan_to_num(y)): + y = _modify_regression_target(y) + + return y + + + def _check_data(self, y: SupportedTargetTypes) -> None: + """ + Perform dimensionality and data type checks on the targets, This is nearly the same as + TabularTargetValidator._check_data, however, we allow NAN values in target + + Args: + y (SupportedTargetTypes): + A set of features whose dimensionality and data type is going to be checked + """ + if not isinstance(y, (np.ndarray, pd.DataFrame, + List, pd.Series)) \ + and not issparse(y): # type: ignore[misc] + raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames," + " pd.Series, sparse data and Python Lists as targets, yet, " + "the provided input is of type {}".format( + type(y) + )) + + # Sparse data muss be numerical + # Type ignore on attribute because sparse targets have a dtype + if issparse(y) and not np.issubdtype(y.dtype.type, # type: ignore[union-attr] + np.number): + raise ValueError("When providing a sparse matrix as targets, the only supported " + "values are numerical. Please consider using a dense" + " instead." + ) + + if self.data_type is None: + self.data_type = type(y) + if self.data_type != type(y): + self.logger.warning("AutoPyTorch previously received targets of type %s " + "yet the current features have type %s. Changing the dtype " + "of inputs to an estimator might cause problems" % ( + str(self.data_type), + str(type(y)), + ), + ) + if ispandas(y): + has_nan_values = cast(pd.DataFrame, y).isnull().values.any() + if has_nan_values: + y = cast(pd.DataFrame, y).fillna(method='pad') + if issparse(y): + y = cast(spmatrix, y) + has_nan_values = not np.array_equal(y.data, y.data) + if has_nan_values: + type_y = type(y) + y = type_y(np.nan_to_num(y.todense())) + else: + # List and array like values are considered here + # np.isnan cannot work on strings, so we have to check for every element + # but NaN, are not equal to themselves: + has_nan_values = not np.array_equal(y, y) + if has_nan_values: + y = np.nan_to_num(y) + + # Pandas Series is not supported for multi-label indicator + # This format checks are done by type of target + try: + self.type_of_target = type_of_target(y) + except Exception as e: + raise ValueError("The provided data could not be interpreted by AutoPyTorch. " + "While determining the type of the targets via type_of_target " + "run into exception: {}.".format(e)) + + supported_output_types = ('binary', + 'continuous', + 'continuous-multioutput', + 'multiclass', + 'multilabel-indicator', + # Notice unknown/multiclass-multioutput are not supported + # This can only happen during testing only as estimators + # should filter out unsupported types. + ) + if self.type_of_target not in supported_output_types: + raise ValueError("Provided targets are not supported by AutoPyTorch. 
" + "Provided type is {} whereas supported types are {}.".format( + self.type_of_target, + supported_output_types + )) + + def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray: + if self.encoder is None: + return _check_and_to_array(y) + + # remove ravel warning from pandas Series + shape = np.shape(y) + if len(shape) > 1: + y = self.encoder.transform(y) + elif ispandas(y): + # The Ordinal encoder expects a 2 dimensional input. + # The targets are 1 dimensional, so reshape to match the expected shape + y = cast(pd.DataFrame, y) + y = self.encoder.transform(y.to_numpy().reshape(-1, 1)).reshape(-1) + else: + y = self.encoder.transform(np.array(y).reshape(-1, 1)).reshape(-1) + + return _check_and_to_array(y) \ No newline at end of file diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 844460bb5..e507d7897 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -619,7 +619,7 @@ def __init__(self, self.future_feature_shapes: Tuple[int, int] = (self.seq_length_min, len(known_future_features)) if len(self.train_tensors) == 2 and self.train_tensors[1] is not None: - self.output_type: str = type_of_target(self.train_tensors[1][0]) + self.output_type: str = type_of_target(self.train_tensors[1][0].fillna(method="pad")) if self.output_type in ["binary", "multiclass"]: self.output_type = "continuous" From 248711788b521fd9541adce21d6fe7c4f43cc241 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 22 Apr 2022 17:42:22 +0200 Subject: [PATCH 217/347] preprocessing for time series --- .../preprocessing/base_preprocessing.py | 57 ++++++ .../TimeSeriesTransformer.py | 145 +++++++++++++--- .../base_time_series_preprocessing.py | 42 ++++- .../imputation/TimeSeriesImputer.py | 162 ++++++++++++++++++ .../imputation/__init__.py | 0 .../imputation/base_time_series_imputer.py | 43 +++++ .../scaling/utils.py | 36 ++-- .../time_series_preprocessing/utils.py | 26 ++- .../TimeSeriesEarlyPreProcessing.py | 88 ++++++++++ .../setup/early_preprocessor/utils.py | 41 ++++- .../forecasting_target_scaling/__init__.py | 14 +- .../base_target_scaler.py | 2 +- .../setup/forecasting_target_scaling/utils.py | 11 +- .../setup/network/forecasting_architecture.py | 2 +- .../setup/network/forecasting_network.py | 2 +- .../components/training/metrics/metrics.py | 1 + .../trainer/forecasting_trainer/__init__.py | 2 +- .../forecasting_base_trainer.py | 4 +- .../pipeline/time_series_forecasting.py | 9 +- test/conftest.py | 63 ------- 20 files changed, 617 insertions(+), 133 deletions(-) create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/__init__.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/base_time_series_imputer.py create mode 100644 autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py diff --git a/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py index fb8bbdaa7..c312b88e5 100644 --- a/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py +++ b/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py @@ -68,3 +68,60 @@ def get_hyperparameter_search_space( ConfigurationSpace: The configuration space of this algorithm. 
""" return ConfigurationSpace() + + +class autoPyTorchTargetPreprocessingComponent(autoPyTorchComponent): + """ + Provides abstract interface for target preprocessing algorithms in AutoPyTorch. Most methods defined in this class + are the same as autoPyTorch.pipeline.components.preprocessing.base_preprocessing.autoPyTorchPreprocessingComponent + However, they are defined as two different classes such that its subclasses will not be identified as feature + preprocessor + """ + def __init__(self) -> None: + autoPyTorchComponent.__init__() + self.add_fit_requirements([ + FitRequirement('y_train', + (pd.DataFrame, ), + user_defined=True, dataset_property=False), + FitRequirement('backend', + (Backend, ), + user_defined=True, dataset_property=False)]) + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the fitted early_preprocessor into the 'X' dictionary and returns it. + Args: + X (Dict[str, Any]): 'X' dictionary + + Returns: + (Dict[str, Any]): the updated 'X' dictionary + """ + raise NotImplementedError() + + def __call__(self, X: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]: + """ + Makes the autoPyTorchPreprocessingComponent Callable. Calling the component + calls the transform function of the underlying early_preprocessor and + returns the transformed array. + Args: + X (Union[np.ndarray, torch.Tensor]): input data tensor + + Returns: + Union[np.ndarray, torch.Tensor]: Transformed data tensor + """ + raise NotImplementedError() + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> ConfigurationSpace: + """Return the configuration space of this classification algorithm. + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]): Describes the dataset + to work on + + Returns: + ConfigurationSpace: The configuration space of this algorithm. 
+ """ + return ConfigurationSpace() diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py index aa3c05007..87e829fc2 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py @@ -1,16 +1,20 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Tuple import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator from sklearn.pipeline import Pipeline, make_pipeline -#from sktime.transformations.panel.compose import ColumnTransformer -from sklearn.compose import ColumnTransformer - -import torch - -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import \ - autoPyTorchTimeSeriesPreprocessingComponent -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.utils import get_time_series_preprocessers +from sktime.transformations.panel.compose import ColumnTransformer + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( + autoPyTorchTimeSeriesPreprocessingComponent, + autoPyTorchTimeSeriesTargetPreprocessingComponent, +) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.utils import ( + get_time_series_preprocessers, + get_time_series_target_preprocessers, +) from autoPyTorch.utils.common import FitRequirement, subsampler @@ -34,33 +38,35 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TimeSeriesTransformer": "TabularColumnTransformer": an instance of self """ self.check_requirements(X, y) - numerical_pipeline = 'drop' - categorical_pipeline = 'drop' preprocessors = get_time_series_preprocessers(X) - - if len(X['dataset_properties']['numerical_columns']): + column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = [] + if len(preprocessors['numerical']) > 0: numerical_pipeline = make_pipeline(*preprocessors['numerical']) - if len(X['dataset_properties']['categorical_columns']): + column_transformers.append( + ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']) + ) + if len(preprocessors['categorical']) > 0: categorical_pipeline = make_pipeline(*preprocessors['categorical']) - - # as X_train is a 2d array here, we simply use ColumnTransformer from sklearn - self.preprocessor = ColumnTransformer([ - ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']), - ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])], + column_transformers.append( + ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns']) + ) + + # in case the preprocessing steps are disabled + # i.e, NoEncoder for categorical, we want to + # let the data in categorical columns pass through + self.preprocessor = ColumnTransformer( + column_transformers, remainder='passthrough' ) - - """ # Where to get the data -- Prioritize X_train if any else # get from backend if 'X_train' in X: - X_train = subsampler(X['X_train'], X['train_indices']) + X_train = X['X_train'] else: X_train = X['backend'].load_datamanager().train_tensors[0] - """ - X_train = X['backend'].load_datamanager().train_tensors[0] + self.preprocessor.fit(X_train) 
return self @@ -76,13 +82,96 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'time_series_transformer': self}) return X - def __call__(self, X: Union[np.ndarray, torch.tensor]) -> Union[np.ndarray, torch.tensor]: + def __call__(self, X: pd.DataFrame) -> pd.DataFrame: if self.preprocessor is None: raise ValueError("cant call {} without fitting the column transformer first." .format(self.__class__.__name__)) - #if len(X.shape) == 2: - # # expand batch dimension when called on a single record - # X = X[np.newaxis, ...] return self.preprocessor.transform(X) + + def get_column_transformer(self) -> ColumnTransformer: + """ + Get fitted column transformer that is wrapped around + the sklearn early_preprocessor. Can only be called if fit() + has been called on the object. + Returns: + BaseEstimator: Fitted sklearn column transformer + """ + if self.preprocessor is None: + raise AttributeError("{} can't return column transformer before transform is called" + .format(self.__class__.__name__)) + return self.preprocessor + + +class TimeSeriesTargetTransformer(autoPyTorchTimeSeriesTargetPreprocessingComponent): + def fit(self, X: Dict[str, Any], y: Any = None) -> "TimeSeriesTransformer": + """ + Creates a column transformer for the chosen tabular + preprocessors + Args: + X (Dict[str, Any]): fit dictionary + + Returns: + "TabularColumnTransformer": an instance of self + """ + self.check_requirements(X, y) + + preprocessors = get_time_series_target_preprocessers(X) + column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = [] + if len(preprocessors['target_numerical']) > 0: + numerical_pipeline = make_pipeline(*preprocessors['target_numerical']) + # TODO the last item needs to be adapted accordingly! + column_transformers.append( + ('target_numerical_pipeline', numerical_pipeline, list(range(len(preprocessors['target_numerical'])))) + ) + + # in case the preprocessing steps are disabled + # i.e, NoEncoder for categorical, we want to + # let the data in categorical columns pass through + self.preprocessor = ColumnTransformer( + column_transformers, + remainder='passthrough' + ) + + # Where to get the data -- Prioritize X_train if any else + # get from backend + if 'y_train' in X: + y_train = X['y_train'] + else: + y_train = X['backend'].load_datamanager().train_tensors[1] + + self.preprocessor.fit(y_train) + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the time series transformer to fit dictionary + Args: + X (Dict[str, Any]): fit dictionary + + Returns: + X (Dict[str, Any]): updated fit dictionary + """ + X.update({'time_series_target_transformer': self}) + return X + + def __call__(self, y: pd.DataFrame) -> pd.DataFrame: + if self.preprocessor is None: + raise ValueError("cant call {} without fitting the column transformer first." + .format(self.__class__.__name__)) + + return self.preprocessor.transform(y) + + def get_target_transformer(self) -> ColumnTransformer: + """ + Get fitted column transformer that is wrapped around + the sklearn early_preprocessor. Can only be called if fit() + has been called on the object. 
+ Returns: + BaseEstimator: Fitted sklearn column transformer + """ + if self.preprocessor is None: + raise AttributeError("{} can't return column transformer before transform is called" + .format(self.__class__.__name__)) + return self.preprocessor \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py index 0f8966ac0..cb688891c 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py @@ -2,7 +2,10 @@ from sklearn.base import BaseEstimator -from autoPyTorch.pipeline.components.preprocessing.base_preprocessing import autoPyTorchPreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.base_preprocessing import ( + autoPyTorchPreprocessingComponent, + autoPyTorchTargetPreprocessingComponent +) class autoPyTorchTimeSeriesPreprocessingComponent(autoPyTorchPreprocessingComponent): @@ -34,3 +37,40 @@ def __str__(self) -> str: """ Allow a nice understanding of what components where used """ string = self.__class__.__name__ return string + + +class autoPyTorchTimeSeriesTargetPreprocessingComponent(autoPyTorchTargetPreprocessingComponent): + """ + Provides abstract interface for time series target preprocessing algorithms in AutoPyTorch. + Currently only numerical target preprocessing is supported. + # TODO add support for categorical targets! + # TODO define inverse transformation for each inversible numerical transformation (log, deseasonalization, etc. ) + """ + """ + Provides abstract interface for time series preprocessing algorithms in AutoPyTorch. + """ + + def __init__(self) -> None: + super().__init__() + self.preprocessor: Union[Dict[str, Optional[BaseEstimator]], BaseEstimator] = dict( + numerical=None, categorical=None) + + def get_preprocessor_dict(self) -> Dict[str, BaseEstimator]: + """ + Returns early_preprocessor dictionary containing the sklearn numerical + and categorical early_preprocessor with "numerical" and "categorical" + keys. 
May contain None for a key if early_preprocessor does not + handle the datatype defined by key + + Returns: + Dict[str, BaseEstimator]: early_preprocessor dictionary + """ + if (self.preprocessor['target_numerical'] and self.preprocessor['target_categorical']) is None: + raise AttributeError("{} can't return early_preprocessor dict without fitting first" + .format(self.__class__.__name__)) + return self.preprocessor + + def __str__(self) -> str: + """ Allow a nice understanding of what components were used """ + string = self.__class__.__name__ + return string \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py new file mode 100644 index 000000000..1b23374d2 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py @@ -0,0 +1,162 @@ +from typing import Any, Dict, List, Optional + +import numpy as np +from sktime.transformations.series.impute import Imputer + +from ConfigSpace import ConfigurationSpace +from autoPyTorch.utils.common import FitRequirement + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.\ base_time_series_imputer import BaseTimeSeriesImputer + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( + autoPyTorchTimeSeriesPreprocessingComponent, + autoPyTorchTimeSeriesTargetPreprocessingComponent +) +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.utils.common import HyperparameterSearchSpace + + +class TimeSeriesFeatureImputer(BaseTimeSeriesImputer, autoPyTorchTimeSeriesPreprocessingComponent): + def __init__(self, + random_state: Optional[np.random.RandomState] = None, + imputation_strategy: str = 'mean',): + super().__init__() + self.random_state = random_state + self.imputation_strategy = imputation_strategy + self.add_fit_requirements([ + FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)]) + + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseTimeSeriesImputer: + """ + Builds the preprocessor based on the given fit dictionary 'X'. + + Args: + X (Dict[str, Any]): + The fit dictionary + y (Optional[Any]): + Not Used -- to comply with API + + Returns: + self: + returns an instance of self. + """ + # Choose an imputer for any numerical columns + numerical_columns = X['dataset_properties']['numerical_columns'] + + if isinstance(numerical_columns, List) and len(numerical_columns) > 0: + if self.imputation_strategy == 'constant_zero': + imputer = Imputer(method='constant', random_state=self.random_state, value=0) + self.preprocessor['numerical'] = imputer + else: + imputer = Imputer(method=self.imputation_strategy, random_state=self.random_state, value=0) + self.preprocessor['numerical'] = imputer + + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds self into the 'X' dictionary and returns it. + Args: + X (Dict[str, Any]): 'X' dictionary + + Returns: + (Dict[str, Any]): the updated 'X' dictionary + """ + if self.preprocessor['numerical'] is None and len(X["dataset_properties"]["numerical_columns"]) != 0: + raise ValueError("can't call transform on {} without fitting first."
+ .format(self.__class__.__name__)) + X.update({'imputer': self.preprocessor}) + return X + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + imputation_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='imputation_strategy', + value_range=("drift", "linear", "nearest", "constant_zero", "mean", "median", "bfill", "ffill"), + default_value="drift", + ), + ) -> ConfigurationSpace: + if dataset_properties.get('features_have_missing_values', False): + cs = super().get_hyperparameter_search_space(dataset_properties, imputation_strategy) + else: + cs = ConfigurationSpace() + return cs + + +class TimeSeriesTargetImputer(BaseTimeSeriesImputer, autoPyTorchTimeSeriesTargetPreprocessingComponent): + def __init__(self, + random_state: Optional[np.random.RandomState] = None, + imputation_strategy: str = 'mean',): + super().__init__() + self.random_state = random_state + self.imputation_strategy = imputation_strategy + self.add_fit_requirements([ + FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)]) + + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseTimeSeriesImputer: + """ + Builds the preprocessor based on the given fit dictionary 'X'. + + Args: + X (Dict[str, Any]): + The fit dictionary + y (Optional[Any]): + Not Used -- to comply with API + + Returns: + self: + returns an instance of self. + """ + # Forecasting tasks always have numerical outputs (TODO add support for categorical HPs) + if self.imputation_strategy == 'constant_zero': + imputer = Imputer(method='constant', random_state=self.random_state, value=0) + self.preprocessor['target_numerical'] = imputer + else: + imputer = Imputer(method=self.imputation_strategy, random_state=self.random_state, value=0) + self.preprocessor['target_numerical'] = imputer + + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds self into the 'X' dictionary and returns it. + Args: + X (Dict[str, Any]): 'X' dictionary + + Returns: + (Dict[str, Any]): the updated 'X' dictionary + """ + if self.preprocessor['target_numerical'] is None and len(X["dataset_properties"]["numerical_columns"]) != 0: + raise ValueError("can't call transform on {} without fitting first."
+ .format(self.__class__.__name__)) + X.update({'imputer': self.preprocessor}) + return X + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + imputation_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='imputation_strategy', + value_range=("linear", "nearest", "constant_zero", "bfill", "ffill"), + default_value="linear", + ), + ) -> ConfigurationSpace: + """ + Time series imputor, for the sake of speed, we only allow local imputation here (i.e., the filled value only + depends on its neighbours) + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): dataset properties + imputation_strategy: which strategy to use, its content is defined by + sktime.transformations.series.impute.Imputer + + + Returns: + + """ + if dataset_properties.get('features_have_missing_values', False): + cs = super().get_hyperparameter_search_space(dataset_properties, imputation_strategy) + else: + cs = ConfigurationSpace() + return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/base_time_series_imputer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/base_time_series_imputer.py new file mode 100644 index 000000000..f77f9d4e4 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/base_time_series_imputer.py @@ -0,0 +1,43 @@ +from typing import Any, Dict, List, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter + +import numpy as np + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter + + +class BaseTimeSeriesImputer: + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> "BaseTimeSeriesImputer": + raise NotImplementedError + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + raise NotImplementedError + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + imputation_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='imputation_strategy', + value_range=("drift", "linear", "nearest", "constant_zero", "mean", "median", "bfill", "ffill"), + default_value="drift", + ), + ) -> ConfigurationSpace: + """Get the hyperparameter search space for the Time Series Imputator + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + imputation_strategy (HyperparameterSearchSpace: default = ...) 
+ The strategy to use for imputation, its hyperparameters are defined by sktime + + Returns: + ConfigurationSpace + The space of possible configurations for a Time Series Imputor with the given + `dataset_properties` + """ + cs = ConfigurationSpace() + add_hyperparameter(cs, imputation_strategy, CategoricalHyperparameter) + return cs \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index 4c1d51716..f6479382f 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -1,6 +1,8 @@ from typing import Any, List, Callable, Optional, Union, Tuple import numpy as np +import pandas as pd +from pandas.core.groupby.generic import DataFrameGroupBy import sklearn from sklearn.base import BaseEstimator @@ -14,31 +16,45 @@ def __init__(self, mode: str): #self.loc = 0. # type: Union[np.ndarray, float] #self.scale = 1. # type: Union[np.ndarray, float] - def fit(self, X: np.ndarray, y: Any = None) -> "TimeSeriesScaler": + def fit(self, X: pd.DataFrame, y: Any = None) -> "TimeSeriesScaler": """ The transformer is transformed on the fly (for each batch) """ - # we assuem that the last two dimensions are [seq, features] if self.mode == "standard": - self.loc = np.mean(X, axis=-2, keepdims=True) - self.scale = np.std(X, axis=-2, keepdims=True) - self.scale[self.scale == 0.0] = 1.0 + X_grouped = X.groupby(X.index) + + self.loc = X_grouped.agg("mean") + self.scale = X_grouped.agg("std") + # ensure that if all the values are the same in a group, we could still normalize them correctly + self.scale.mask(self.scale == 0.0, self.loc) + self.scale[self.scale == 0] = 1. 
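# A toy sketch (made-up values) of the per-series "standard" scaling computed above: the statistics
# are aggregated per group, where the DataFrame index identifies which series each row belongs to,
# and groups with zero standard deviation fall back to a scale of 1.
import pandas as pd

X = pd.DataFrame({"value": [1.0, 2.0, 3.0, 10.0, 10.0]},
                 index=[0, 0, 0, 1, 1])                          # index = series id
grouped = X.groupby(X.index)
loc, scale = grouped.agg("mean"), grouped.agg("std")
scale[scale == 0.0] = 1.0                                        # constant series are only centered, not rescaled
X_scaled = (X - loc.loc[X.index].values) / scale.loc[X.index].values
print(X_scaled)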
elif self.mode == "min_max": - min_ = np.min(X, axis=-2, keepdims=True) - max_ = np.max(X, axis=-2, keepdims=True) + X_grouped = X.groupby(X.index) + + min_ = X_grouped.agg("min") + max_ = X_grouped.agg("max") diff_ = max_ - min_ self.loc = min_ self.scale = diff_ + self.scale.mask(self.scale == 0.0, self.loc) self.scale[self.scale == 0.0] = 1.0 elif self.mode == "max_abs": - max_abs_ = np.max(np.abs(X), axis=-2, keepdims=True) + X_abs = X.transform("abs") + max_abs_ = X_abs.groupby(X_abs.index).transform("max") max_abs_[max_abs_ == 0.0] = 1.0 self.loc = None self.scale = max_abs_ + elif self.mode == 'mean_abs': + X_abs = X.transform("abs") + X_abs = X_abs.groupby(X_abs.index) + mean_abs_ = X_abs.agg("mean") + self.loc = None + self.scale = mean_abs_.mask(mean_abs_ == 0.0, X_abs.agg("max")) + elif self.mode == "none": self.loc = None self.scale = None @@ -58,9 +74,9 @@ def transform(self, X: np.ndarray) -> Tuple[np.ndarray, ...]: ) # type: np.ndarray """ - if self.mode in ['standard', 'min_max']: + if self.mode in {"standard", "min_max"}: return (X - self.loc) / self.scale - elif self.mode == "max_abs": + elif self.mode in {"max_abs", "mean_abs"}: return X / self.scale else: return X diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py index 7072f001c..e2f64fc75 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py @@ -2,7 +2,6 @@ from sklearn.base import BaseEstimator - def get_time_series_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstimator]]: """ Expects fit_dictionary(X) to have numerical/categorical preprocessors @@ -26,3 +25,28 @@ def get_time_series_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstim preprocessor['categorical'].append(value['categorical']) return preprocessor + + +def get_time_series_target_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstimator]]: + """ + Expects fit_dictionary(X) to have target preprocessors + I leave here interface to target categorical + (fitted numerical/categorical preprocessing nodes) that will build a pipeline in the TimeSeriesTransformer. + This function parses X and extracts such components. 
+ Creates a dictionary with two keys, + target_numerical- containing list of numerical target preprocessors + target_categorical- containing list of categorical target preprocessors + Args: + X: fit dictionary + Returns: + (Dict[str, List[BaseEstimator]]): dictionary with list of target numerical and target categorical preprocessors + """ + preprocessor = dict(target_numerical=list(), target_categorical=list()) # type: Dict[str, List[BaseEstimator]] + for key, value in X.items(): + if isinstance(value, dict): + # as each preprocessor is a child of BaseEstimator + if 'target_numerical' in value and isinstance(value['target_numerical'], BaseEstimator): + preprocessor['target_numerical'].append(value['target_numerical']) + if 'target_categorical' in value and isinstance(value['target_categorical'], BaseEstimator): + preprocessor['target_categorical'].append(value['target_categorical']) + return preprocessor \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py new file mode 100644 index 000000000..b421edef3 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py @@ -0,0 +1,88 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +import pandas as pd + +from scipy.sparse import spmatrix + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.base_preprocessing import autoPyTorchTargetPreprocessingComponent +from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing +from autoPyTorch.pipeline.components.setup.early_preprocessor.utils import ( + get_preprocess_transforms, + time_series_preprocess +) +from autoPyTorch.utils.common import FitRequirement + + +class TimeSeriesEarllyPreprocessing(EarlyPreprocessing): + + def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None: + super().__init__() + self.random_state = random_state + self.add_fit_requirements([ + FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), + FitRequirement('X_train', (pd.DataFrame, ), user_defined=True, + dataset_property=False)]) + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + + transforms = get_preprocess_transforms(X) + if X['dataset_properties']['is_small_preprocess']: + if 'X_train' in X: + X_train = X['X_train'] + else: + # Incorporate the transform to the dataset + X_train = X['backend'].load_datamanager().train_tensors[0] + + X['X_train'] = time_series_preprocess(dataset=X_train, transforms=transforms) + + # We need to also save the preprocess transforms for inference + X.update({'preprocess_transforms': transforms}) + return X + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'TimeSeriesEarlyPreprocessing', + 'name': 'Time Series Early Preprocessing Node', + } + + +class TimeSeriesTargetEarlyPreprocessing(EarlyPreprocessing): + + def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None: + super().__init__() + self.random_state = random_state + self.add_fit_requirements([ + FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), + FitRequirement('y_train', (pd.DataFrame,), user_defined=True, +
dataset_property=False)]) + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + # TODO consider inverse transformation + transforms = get_preprocess_transforms(X, preprocess_type=autoPyTorchTargetPreprocessingComponent) + if X['dataset_properties']['is_small_preprocess']: + if 'y_train' in X: + y_train = X['y_train'] + else: + # Incorporate the transform to the dataset + y_train = X['backend'].load_datamanager().train_tensors[1] + + X['y_train'] = time_series_preprocess(dataset=y_train, transforms=transforms) + + # We need to also save the preprocess transforms for inference + X.update({'preprocess_target_transforms': transforms}) + return X + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'TimeSeriesTargetEarlyPreprocessing', + 'name': 'TIme Series Target Early Preprocessing Node', + } diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py index d74faffa6..1050707a7 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py @@ -1,19 +1,25 @@ import copy -from typing import Any, Dict, List +from typing import Any, Dict, List, Type, Optional, Union import numpy as np +import pandas as pd from sklearn.utils import check_array import torchvision.transforms -from autoPyTorch.pipeline.components.preprocessing.base_preprocessing import autoPyTorchPreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.base_preprocessing import ( + autoPyTorchPreprocessingComponent as aPTPre, + autoPyTorchTargetPreprocessingComponent as aPTTPre +) -def get_preprocess_transforms(X: Dict[str, Any]) -> torchvision.transforms.Compose: - candidate_transforms: List[autoPyTorchPreprocessingComponent] = list() +def get_preprocess_transforms(X: Dict[str, Any], + preprocess_type: Union[Type[aPTPre], Type[aPTTPre]] = aPTPre) \ + -> torchvision.transforms.Compose: + candidate_transforms: List[preprocess_type] = list() for key, value in X.items(): - if isinstance(value, autoPyTorchPreprocessingComponent): + if isinstance(value, preprocess_type): candidate_transforms.append(copy.deepcopy(value)) return candidate_transforms @@ -21,7 +27,6 @@ def get_preprocess_transforms(X: Dict[str, Any]) -> torchvision.transforms.Compo def preprocess(dataset: np.ndarray, transforms: torchvision.transforms.Compose, indices: List[int] = None) -> np.ndarray: - composite_transforms = torchvision.transforms.Compose(transforms) if indices is None: dataset = composite_transforms(dataset) @@ -37,3 +42,27 @@ def preprocess(dataset: np.ndarray, transforms: torchvision.transforms.Compose, ensure_2d=False, allow_nd=True, ) + + +def time_series_preprocess(dataset: pd.DataFrame, transforms: torchvision.transforms.Compose, + indices: Optional[List[int]] = None) -> pd.DataFrame: + """ + preprocess time series data (both features and targets). Dataset should be pandas DataFrame whose index identifies + which series the data belongs to. 
+ Args: + dataset (pd.DataFrame): a dataset contains multiple series, its index identifies the series number + transforms (torchvision.transforms.Compose): transformation applied to dataset + indices (Optional[List[int]]): the indices that the transformer needs to work with + + Returns: + + """ + # TODO consider Numpy implementation + composite_transforms = torchvision.transforms.Compose(transforms) + if indices is None: + dataset = composite_transforms(dataset) + else: + sub_dataset = dataset.iloc[:, indices] + sub_dataset = composite_transforms(sub_dataset) + dataset.iloc[:, indices] = sub_dataset + return dataset diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py index 01b7b831e..44950a654 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py @@ -27,7 +27,8 @@ def add_scaler(scaler: BaseTargetScaler) -> None: class TargetScalerChoice(autoPyTorchChoice): """ - Allows for dynamically choosing scaling component at runtime, not + Allows for dynamically choosing scale component at runtime, Hence we consider it as part of "setup", not + "preprocessing" """ def get_components(self) -> Dict[str, autoPyTorchComponent]: @@ -90,14 +91,3 @@ def get_hyperparameter_search_space(self, self.configuration_space = cs self.dataset_properties = dataset_properties return cs - - def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None: - """ - A mechanism in code to ensure the correctness of the fit dictionary - It recursively makes sure that the children and parent level requirements - are honored before fit. - Args: - dataset_properties (Dict[str, Any]): dictionary holding the dataset properties - - """ - super()._check_dataset_properties(dataset_properties) diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py index dfa3a841d..cd2f4093a 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py @@ -9,7 +9,7 @@ from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import TargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.utils import TargetScaler class BaseTargetScaler(autoPyTorchComponent): diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py index 2fdb33e3d..b644eb83b 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py @@ -28,6 +28,8 @@ def transform(self, loc = torch.mean(past_targets, dim=1, keepdim=True) scale = torch.std(past_targets, dim=1, keepdim=True) + offset_targets = past_targets - loc + scale = torch.where(torch.logical_or(scale == 0.0, scale == torch.nan), offset_targets[:, [-1]], scale) scale[scale == 0.0] = 1.0 if future_targets is not None: future_targets = (future_targets - loc) / scale @@ -39,7 +41,7 @@ def transform(self, diff_ = max_ - min_ loc = min_ - 1e-10 - scale = diff_ + scale = torch.where(diff_ == 0, 
past_targets[:, [-1]], diff_) scale[scale == 0.0] = 1.0 if future_targets is not None: future_targets = (future_targets - loc) / scale @@ -55,8 +57,8 @@ def transform(self, elif self.mode == 'mean_abs': mean_abs = torch.mean(torch.abs(past_targets), dim=1, keepdim=True) - mean_abs[mean_abs == 0.0] = 1.0 - scale = mean_abs + scale = torch.where(mean_abs == 0.0, past_targets[:, [-1]], mean_abs) + scale[scale == 0.0] = 1.0 if future_targets is not None: future_targets = future_targets / scale return past_targets / scale, future_targets, None, scale @@ -106,7 +108,8 @@ def transform(self, elif self.mode == "max_abs": max_abs_ = torch.max(torch.abs(valid_past_targets), dim=1, keepdim=True)[0] - scale = torch.where(max_abs_ == 0, past_targets[:, [-1]], max_abs_) + max_abs_[max_abs_ == 0.0] = 1.0 + scale = max_abs_ if future_targets is not None: future_targets = future_targets / scale return past_targets / scale, future_targets, None, scale diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 9b1880a8a..8654441b0 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -11,7 +11,7 @@ TransformedDistribution, ) -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( EncoderNetwork, diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 6be8364e5..38a9311e9 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -9,7 +9,7 @@ from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( EncoderBlockInfo, ) diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index c8e2caecd..bdff13d17 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -68,6 +68,7 @@ def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int, n_pr Returns: mase_coefficient: inverse of mase_denominator """ + past_target = np.nan_to_num(past_target) if sp >= len(past_target): # in this case, we simply consider the mean value of the entire sequence # TODO condsider if there is a better way of handling this diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py 
b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index 11fd2cccd..d53937391 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -18,7 +18,7 @@ from autoPyTorch.utils.common import FitRequirement from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler from autoPyTorch.utils.common import get_device_from_fit_dictionary from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index f19d59c3c..425fcf4b6 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -12,8 +12,8 @@ from autoPyTorch.constants import REGRESSION_TASKS, FORECASTING_TASKS -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import BaseTargetScaler -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling.TargetNoScaler import TargetNoScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetNoScaler import TargetNoScaler from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNet, ForecastingDeepARNet, \ NBEATSNet, ForecastingSeq2SeqNet diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index f22661a67..e8f666fb0 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -21,7 +21,12 @@ from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( TimeSeriesTransformer ) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer +#from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.TimeSeriesFeatureImputer import ( +# TimeSeriesFeatureImputer +#) +#from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.TimeSeriesTargetImputer import ( +# TimeSeriesTargetImputer +#) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice @@ -32,7 +37,7 @@ from autoPyTorch.pipeline.components.setup.network_initializer import ( NetworkInitializerChoice ) -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.targets_preprocessing.forecasting_target_scaling import \ +from 
autoPyTorch.pipeline.components.setup.forecasting_target_scaling import \ TargetScalerChoice from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice from autoPyTorch.pipeline.components.setup.forecasting_training_loss import ForecastingLossChoices diff --git a/test/conftest.py b/test/conftest.py index d486d6dba..7e4729fb8 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -23,9 +23,7 @@ from autoPyTorch.automl_common.common.utils.backend import create from autoPyTorch.data.tabular_validator import TabularInputValidator -from autoPyTorch.data.time_series_validator import TimeSeriesInputValidator from autoPyTorch.datasets.tabular_dataset import TabularDataset -from autoPyTorch.datasets.time_series_dataset import TimeSeriesDataset from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.pipeline import get_dataset_requirements @@ -276,42 +274,6 @@ def get_tabular_data(task): return X, y, validator - -def get_time_series_data(task): - length = 10 - sin_wave = np.sin(np.arange(length)) - cos_wave = np.cos(np.arange(length)) - sin_waves = [] - cos_waves = [] - # create a dummy dataset with 100 sin and 100 cosine waves - for i in range(200): - # add some random noise so not every sample is equal - sin_waves.append(sin_wave + np.random.randn(length) * 0.1) - cos_waves.append(cos_wave + np.random.randn(length) * 0.1) - sin_waves = np.stack(sin_waves)[..., np.newaxis] - cos_waves = np.stack(cos_waves)[..., np.newaxis] - - if task == "classification_numerical_only": - X = np.concatenate([sin_waves, cos_waves]) - y = np.array([0] * len(sin_waves) + [1] * len(cos_waves)) - - validator = TimeSeriesInputValidator(is_classification=True).fit(X.copy(), y.copy()) - - elif task == "regression_numerical_only": - X = np.concatenate([sin_waves, cos_waves]) - - # use the last value of the time series as dummy regression target - y = X[:, -1, 0] - X = X[:, :-1, :] - - validator = TimeSeriesInputValidator(is_classification=False).fit(X.copy(), y.copy()) - - else: - raise ValueError("Unsupported task {}".format(task)) - - return X, y, validator - - def get_fit_dictionary(datamanager, backend): info = datamanager.get_required_dataset_info() @@ -348,15 +310,6 @@ def get_tabular_fit_dictionary(X, y, validator, backend): return get_fit_dictionary(datamanager, backend) -def get_time_series_fit_dictionary(X, y, validator, backend): - datamanager = TimeSeriesDataset( - X=X, Y=y, - validator=validator, - X_test=X, Y_test=y, - ) - return get_fit_dictionary(datamanager, backend) - - @pytest.fixture def fit_dictionary_tabular_dummy(request, backend): if request.param == "classification": @@ -368,16 +321,6 @@ def fit_dictionary_tabular_dummy(request, backend): return get_tabular_fit_dictionary(X, y, validator, backend) -@pytest.fixture -def fit_dictionary_time_series_dummy(request, backend): - if request.param == "classification": - X, y, validator = get_time_series_data("classification_numerical_only") - elif request.param == "regression": - X, y, validator = get_time_series_data("regression_numerical_only") - else: - raise ValueError(f"Unsupported indirect fixture {request.param}") - return get_time_series_fit_dictionary(X, y, validator, backend) - @pytest.fixture def fit_dictionary_tabular(request, backend): @@ -385,12 +328,6 @@ def fit_dictionary_tabular(request, backend): return get_tabular_fit_dictionary(X, y, validator, backend) -@pytest.fixture -def fit_dictionary_time_series(request, backend): - X, y, validator = 
get_time_series_data(request.param) - return get_time_series_fit_dictionary(X, y, validator, backend) - - @pytest.fixture def dataset(request): return request.getfixturevalue(request.param) From 86e4e3c3a9b721126ccef299cb7301fbf06220b2 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 25 Apr 2022 16:30:54 +0200 Subject: [PATCH 218/347] maint --- .../data/time_series_feature_validator.py | 22 +++- .../data/time_series_forecasting_validator.py | 91 +++++++------- .../data/time_series_target_validator.py | 16 ++- autoPyTorch/datasets/time_series_dataset.py | 1 + .../scaling/base_scaler_choice.py | 114 ------------------ 5 files changed, 81 insertions(+), 163 deletions(-) delete mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler_choice.py diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index 3365b7c26..6d4bffcac 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -6,7 +6,7 @@ import sklearn.utils from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError -from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator +from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator, SupportedFeatTypes from autoPyTorch.utils.logging_ import PicklableClientLogger @@ -18,6 +18,9 @@ def __init__( super().__init__(logger) self.only_contain_series_idx = False + def get_reordered_columns(self): + return self.transformed_columns + list(set(self.column_order) - set(self.transformed_columns)) + def fit(self, X_train: Union[pd.DataFrame, np.ndarray], X_test: Union[pd.DataFrame, np.ndarray] = None, @@ -68,3 +71,20 @@ def fit(self, return self + def transform( + self, + X: SupportedFeatTypes, + index: Optional[Union[pd.Index, np.ndarray]] = None, + ) -> Union[pd.DataFrame]: + X = super(TimeSeriesFeatureValidator, self).transform(X) + if index is None: + index = np.array([0.] * len(X)) + if X.ndim == 1: + X = np.expand_dims(X, -1) + X: pd.DataFrame = pd.DataFrame(X, columns=self.get_reordered_columns(), + index=index) + return X + + + + diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 2b8b3ed80..06da40846 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -7,10 +7,13 @@ from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError +from scipy import sparse + from autoPyTorch.data.utils import DatasetCompressionSpec from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator from autoPyTorch.data.time_series_target_validator import TimeSeriesTargetValidator +from autoPyTorch.data.base_feature_validator import SupportedFeatTypes class TimeSeriesForecastingInputValidator(TabularInputValidator): @@ -18,6 +21,7 @@ class TimeSeriesForecastingInputValidator(TabularInputValidator): A validator designed for a time series forecasting dataset. 
As a time series forecasting dataset might contain several time sequence with different length, we will transform all the data to DataFrameGroupBy whereas each group represents a series + TODO for multiple output: target names and shapes """ def __init__(self, @@ -34,8 +38,8 @@ def __init__(self, self.n_prediction_steps = 1 self.start_times_train = None self.start_times_test = None - self.feature_shapes = {} - self.feature_names = [] + self.feature_shapes: Dict[str, int] = {} + self.feature_names: List[str] = [] self.series_idx = None def fit( @@ -89,15 +93,16 @@ def fit( self._is_uni_variant = True if isinstance(y_train, List): # X_train and y_train are stored as lists + y_train_stacked = self.join_series(y_train) + y_test_stacked = self.join_series(y_test) if y_test is not None else None + if self._is_uni_variant: self.feature_validator.num_features = 0 self.feature_validator.numerical_columns = [] self.feature_validator.categorical_columns = [] - if y_test is not None: - self.target_validator.fit(y_train[0], y_test[0]) - else: - self.target_validator.fit(y_train[0]) + self.target_validator.fit(y_train_stacked, y_test_stacked) + self._is_fitted = True else: self.known_future_features = known_future_features @@ -105,15 +110,16 @@ def fit( if len(X_train) != len(y_train): raise ValueError("Inconsistent number of sequences for features and targets," " {} for features and {} for targets".format(len(X_train), len(y_train), )) - + X_train_stacked = self.join_series(X_train) + X_test_stacked = self.join_series(X_test) if X_test is not None else None if X_test is not None: if len(X_test) != len(y_test): raise ValueError("Inconsistent number of test datapoints for features and targets," " {} for features and {} for targets".format(len(X_test), len(y_test), )) # TODO write a feature input validator to check X_test for known_future_features super().fit(X_train[0], y_train[0], X_test[0], y_test[0]) - self.feature_validator.fit(X_train[0], None if X_test is None else X_test[0], series_idx=series_idx) - self.target_validator.fit(y_train[0], None if y_test is None else y_test[0]) + self.feature_validator.fit(X_train_stacked, X_test_stacked, series_idx=series_idx) + self.target_validator.fit(y_train_stacked, y_test_stacked) if self.feature_validator.only_contain_series_idx: self._is_uni_variant = True @@ -126,13 +132,8 @@ def fit( if X_test is not None: self.check_input_shapes(X_test, y_test, is_training=False) - if hasattr(X_train[0], 'columns'): - features = X_train[0].columns.values.tolist() - else: - features = list(map(str, range(len(X_train[0])))) - for feature in features: - self.feature_names.append(feature) - self.feature_shapes[feature] = 1 + self.feature_names = self.feature_validator.get_reordered_columns() + self.feature_shapes = {feature_name: 1 for feature_name in self.feature_names} else: # TODO X_train and y_train are pd.DataFrame raise NotImplementedError @@ -187,33 +188,16 @@ def transform( sequence_lengths[seq_idx] = len(y[seq_idx]) sequence_lengths = np.asarray(sequence_lengths) - num_targets = self.target_validator.out_dimensionality - - num_data = np.sum(sequence_lengths) + y_stacked = self.join_series(y) - start_idx = 0 - - y_flat = np.empty([num_data, num_targets]) - - for seq_idx, seq_length in enumerate(sequence_lengths): - end_idx = start_idx + seq_length - y_flat[start_idx: end_idx] = np.array(y[seq_idx]).reshape([-1, num_targets]) - start_idx = end_idx - - y_transformed: np.ndarray = self.target_validator.transform(y_flat) - if y_transformed.ndim == 1: - y_transformed = 
np.expand_dims(y_transformed, -1) + y_transformed: pd.DataFrame = self.target_validator.transform(y_stacked) if self.series_idx is None: series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) if not self._is_uni_variant: - if isinstance(X[0], np.ndarray): - x_flat: pd.DataFrame = pd.DataFrame(np.vstack(X)) - elif isinstance(X[0], pd.DataFrame): - x_flat: pd.DataFrame = pd.concat(X) - else: - raise NotImplementedError(f'Cannot transform a List of {type(X[0])}') - x_transformed = self.feature_validator.transform(x_flat) + x_stacked = self.join_series(X) + x_transformed = self.feature_validator.transform(x_stacked, + index=series_number) else: # In this case X can only contain pd.DataFrame, see ```time_series_feature_validator.py``` @@ -226,17 +210,32 @@ def transform( series_number = pd.MultiIndex.from_frame(x_flat[self.series_idx]) if not self._is_uni_variant: - x_transformed = self.feature_validator.transform(x_flat.drop(self.series_idx, axis=1)) - y_transformed: pd.DataFrame = pd.DataFrame(y_transformed, - index=pd.Index(series_number)) + x_transformed = self.feature_validator.transform(x_flat.drop(self.series_idx, axis=1), + index=series_number) + if self._is_uni_variant: return None, y_transformed, sequence_lengths - if x_transformed.ndim == 1: - x_transformed = np.expand_dims(x_transformed, -1) - x_transformed: pd.DataFrame = pd.DataFrame(x_transformed, - index=series_number) - return x_transformed, y_transformed, sequence_lengths else: raise NotImplementedError + + @staticmethod + def join_series(input: List[SupportedFeatTypes]) -> SupportedFeatTypes: + """ + join the series into one single value + """ + if not isinstance(input, List): + raise ValueError(f'Input must be a list, but it is {type(input)}') + if isinstance(input[0], pd.DataFrame): + return pd.concat(input) + elif isinstance(input[0], sparse.spmatrix): + if len(input[0].shape) > 1: + return sparse.vstack(input) + else: + return sparse.hstack(input) + elif isinstance(input[0], (List, np.ndarray)): + return np.concatenate(input) + else: + raise NotImplementedError(f'Unsupported input type: List[{type(input[0])}]') + diff --git a/autoPyTorch/data/time_series_target_validator.py b/autoPyTorch/data/time_series_target_validator.py index 5c48cade4..384ec7cac 100644 --- a/autoPyTorch/data/time_series_target_validator.py +++ b/autoPyTorch/data/time_series_target_validator.py @@ -44,7 +44,10 @@ def _modify_regression_target(y: ArrayType) -> ArrayType: class TimeSeriesTargetValidator(TabularTargetValidator): - def transform(self, y: SupportedTargetTypes) -> np.ndarray: + def transform(self, + y: SupportedTargetTypes, + index: Optional[Union[pd.Index, np.ndarray]] = None, + ) ->pd.DataFrame: """ Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. @@ -53,9 +56,11 @@ def transform(self, y: SupportedTargetTypes) -> np.ndarray: y (SupportedTargetTypes) A set of targets that are going to be encoded if the current task is classification + index (Optional[Union[pd.Index], np.ndarray]): + index indentifying which series the data belongs to Returns: - np.ndarray: + pd.DataFrame: The transformed array """ if not self._is_fitted: @@ -72,6 +77,13 @@ def transform(self, y: SupportedTargetTypes) -> np.ndarray: if not self.is_classification and "continuous" not in type_of_target(np.nan_to_num(y)): y = _modify_regression_target(y) + if index is None: + index = np.array([0.] 
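# A minimal usage sketch of the join_series helper introduced above, assuming a
# list of numpy sequences: the per-series arrays are stacked once, and a repeated
# series index derived from the sequence lengths lets the stacked frame be grouped
# back into its original sequences. Variable names and values are illustrative only.
import numpy as np
import pandas as pd

series = [np.random.randn(5, 2), np.random.randn(8, 2)]            # two sequences, two features each
sequence_lengths = np.array([len(s) for s in series])
stacked = np.concatenate(series)                                    # what join_series does for ndarray input
series_number = np.arange(len(series)).repeat(sequence_lengths)     # 0,0,0,0,0,1,1,...
stacked_df = pd.DataFrame(stacked, index=series_number)
assert [len(group) for _, group in stacked_df.groupby(level=0)] == sequence_lengths.tolist()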
* len(y)) + if y.ndim == 1: + y = np.expand_dims(y, -1) + + y: pd.DataFrame = pd.DataFrame(y, index=index) + return y diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index e507d7897..67ca765b1 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -116,6 +116,7 @@ def __init__(self, else: self.mase_coefficient = compute_mase_coefficient(self.Y[:-n_prediction_steps], sp=self.sp, n_prediction_steps=n_prediction_steps) + else: self.mase_coefficient = 1.0 self.only_has_past_targets = only_has_past_targets diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler_choice.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler_choice.py deleted file mode 100644 index 5c5dce4cd..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler_choice.py +++ /dev/null @@ -1,114 +0,0 @@ -import os -from collections import OrderedDict -from typing import Any, Dict, List, Optional - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler - -scaling_directory = os.path.split(__file__)[0] -_scalers = find_components(__package__, - scaling_directory, - BaseScaler) - -_addons = ThirdPartyComponents(BaseScaler) - - -def add_scaler(scaler: BaseScaler) -> None: - _addons.add_component(scaler) - - -class ScalerChoice(autoPyTorchChoice): - """ - Allows for dynamically choosing scaling component at runtime - """ - - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available scaler components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all BaseScalers components available - as choices for scaling - """ - components = OrderedDict() - components.update(_scalers) - components.update(_addons.components) - return components - - def get_hyperparameter_search_space(self, - dataset_properties: Optional[Dict[str, Any]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None) -> ConfigurationSpace: - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties = dict() - - dataset_properties = {**self.dataset_properties, **dataset_properties} - - available_scalers = self.get_available_components(dataset_properties=dataset_properties, - include=include, - exclude=exclude) - - if len(available_scalers) == 0: - raise ValueError("no scalers found, please add a scaler") - - if default is None: - defaults = ['StandardScaler', 'MinMaxScaler', 'MaxAbsScaler', 'NoScaler'] - for default_ in defaults: - if default_ in available_scalers: - default = default_ - break - - # add only no scaler to choice hyperparameters in case the dataset is only categorical - if len(dataset_properties['numerical_features']) == 0: - default = 'NoScaler' - if include is not None and default not in include: - raise ValueError("Provided {} in include, however, " - "the dataset is incompatible with it".format(include)) - preprocessor = CSH.CategoricalHyperparameter('__choice__', - ['NoScaler'], - default_value=default) - 
else: - preprocessor = CSH.CategoricalHyperparameter('__choice__', - list(available_scalers.keys()), - default_value=default) - cs.add_hyperparameter(preprocessor) - - # add only child hyperparameters of early_preprocessor choices - for name in preprocessor.choices: - updates = self._get_search_space_updates(prefix=name) - config_space = available_scalers[name].get_hyperparameter_search_space(dataset_properties, # type:ignore - **updates) - parent_hyperparameter = {'parent': preprocessor, 'value': name} - cs.add_configuration_space(name, config_space, - parent_hyperparameter=parent_hyperparameter) - - self.configuration_space = cs - self.dataset_properties = dataset_properties - return cs - - def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None: - """ - A mechanism in code to ensure the correctness of the fit dictionary - It recursively makes sure that the children and parent level requirements - are honored before fit. - Args: - dataset_properties (Dict[str, Any]): dictionary holding the dataset properties - - """ - super()._check_dataset_properties(dataset_properties) - assert "numerical_features" in dataset_properties and \ - "categorical_features" in dataset_properties, \ - "Dataset properties must contain information about the type of features" From 2c9944c69a2488f3546dc34ca4c6a73a86fc9344 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 25 Apr 2022 19:12:22 +0200 Subject: [PATCH 219/347] forecasting pipeline --- .../preprocessing/base_preprocessing.py | 2 +- .../TimeSeriesTransformer.py | 1 - .../encoding/NoEncoder.py | 32 +++++ .../encoding/OneHotEncoder.py | 38 ++++++ .../encoding/__init__.py | 50 ++++++++ .../encoding/time_series_base_encoder.py | 35 ++++++ .../imputation/TimeSeriesImputer.py | 9 +- .../imputation/base_time_series_imputer.py | 6 +- .../scaling/__init__.py | 114 ++++++++++++++++++ .../base_target_scaler.py | 12 +- .../setup/forecasting_target_scaling/utils.py | 2 +- .../forecasting_decoder/MLPDecoder.py | 4 +- .../seq_encoder/__init__.py | 45 +++---- .../components/training/metrics/metrics.py | 8 +- .../pipeline/time_series_forecasting.py | 48 +++++--- 15 files changed, 346 insertions(+), 60 deletions(-) create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/__init__.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py diff --git a/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py index c312b88e5..9072e8542 100644 --- a/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py +++ b/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py @@ -78,7 +78,7 @@ class autoPyTorchTargetPreprocessingComponent(autoPyTorchComponent): preprocessor """ def __init__(self) -> None: - autoPyTorchComponent.__init__() + super().__init__() self.add_fit_requirements([ FitRequirement('y_train', (pd.DataFrame, ), diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py index 87e829fc2..9291e900b 100644 --- 
a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py @@ -66,7 +66,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TimeSeriesTransformer": X_train = X['X_train'] else: X_train = X['backend'].load_datamanager().train_tensors[0] - self.preprocessor.fit(X_train) return self diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py new file mode 100644 index 000000000..72f49183c --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py @@ -0,0 +1,32 @@ +from typing import Any, Dict, List, Optional, Union + +import numpy as np + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.NoEncoder import NoEncoder +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.time_series_base_encoder import ( + TimeSeriesBaseEncoder, +) + + +class TimeSeriesNoEncoder(TimeSeriesBaseEncoder): + def __init__(self, + random_state: Optional[Union[np.random.RandomState, int]] = None + ): + super().__init__() + self.random_state = random_state + + def fit(self, X: Dict[str, Any], y: Any = None) -> "TimeSeriesBaseEncoder": + NoEncoder.fit(self, X, y) + self.feature_shapes = X['dataset_properties']['feature_shapes'] + return self + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'TimeSeriesNoEncoder', + 'name': 'Time Series No Encoder', + 'handles_sparse': True + } diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py new file mode 100644 index 000000000..152550285 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py @@ -0,0 +1,38 @@ +from typing import Any, Dict, List, Optional, Union + +import numpy as np + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.OneHotEncoder import OneHotEncoder +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.time_series_base_encoder import ( + TimeSeriesBaseEncoder, +) + + +class TimeSeriesOneHotEncoder(TimeSeriesBaseEncoder): + def __init__(self, + random_state: Optional[Union[np.random.RandomState, int]] = None + ): + super(TimeSeriesOneHotEncoder, self).__init__() + self.random_state = random_state + + def fit(self, X: Dict[str, Any], y: Any = None) -> TimeSeriesBaseEncoder: + OneHotEncoder.fit(self, X, y) + categorical_columns = X['dataset_properties']['categorical_columns'] + n_features_cat = X['dataset_properties']['categories'] + feature_names = X['dataset_properties']['feature_names'] + if len(n_features_cat) == 0: + n_features_cat = self.preprocessor['categorical'].categories + for cat_column in categorical_columns: + self.feature_shapes[feature_names[cat_column]] = len(n_features_cat[cat_column]) + return self + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, 
BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'TimeSeriesOneHotEncoder', + 'name': 'Time Series One Hot Encoder', + 'handles_sparse': False + } diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/__init__.py new file mode 100644 index 000000000..3f71ddc81 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/__init__.py @@ -0,0 +1,50 @@ +import os +from collections import OrderedDict +from typing import Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import EncoderChoice +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.time_series_base_encoder import ( + TimeSeriesBaseEncoder +) + + +encoding_directory = os.path.split(__file__)[0] +_encoders = find_components(__package__, + encoding_directory, + TimeSeriesBaseEncoder) +_addons = ThirdPartyComponents(TimeSeriesBaseEncoder) + + +def add_encoder(encoder: TimeSeriesBaseEncoder) -> None: + _addons.add_component(encoder) + + +class TimeSeriesEncoderChoice(EncoderChoice): + """ + Allows for dynamically choosing encoding component at runtime + """ + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available encoder components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all BaseEncoder components available + as choices for encoding the categorical columns + """ + components = OrderedDict() + components.update(_encoders) + components.update(_addons.components) + return components diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py new file mode 100644 index 000000000..3b2d29b84 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py @@ -0,0 +1,35 @@ +from typing import Any, Dict, List, Tuple + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder import BaseEncoder +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( + autoPyTorchTimeSeriesPreprocessingComponent, +) +from autoPyTorch.utils.common import FitRequirement + + +class TimeSeriesBaseEncoder(autoPyTorchTimeSeriesPreprocessingComponent): + """ + Base class for encoder + """ + def __init__(self) -> None: + super(TimeSeriesBaseEncoder, self).__init__() + self.add_fit_requirements([ + FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), + FitRequirement('categories', (List,), user_defined=True, dataset_property=True), + FitRequirement('feature_names', (Tuple,), user_defined=True, dataset_property=True), + FitRequirement('feature_shapes', (Dict, ), user_defined=True, dataset_property=True), + ]) + self.feature_shapes = {} + + def transform(self, X: 
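# A small sketch of the feature_shapes bookkeeping performed by the encoders above:
# after one-hot encoding, a categorical feature occupies as many positions as it has
# categories, while numerical features keep a width of 1. The names and category
# values below are made up for illustration; the real values come from
# dataset_properties.
feature_names = ('price', 'weekday')
categories = {1: ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']}   # column 1 is categorical
feature_shapes = {name: 1 for name in feature_names}
for cat_column, cats in categories.items():
    feature_shapes[feature_names[cat_column]] = len(cats)
# feature_shapes == {'price': 1, 'weekday': 7}; the base encoder later writes this
# dictionary back into dataset_properties for the downstream network components.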
Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the self into the 'X' dictionary and returns it. + Args: + X (Dict[str, Any]): 'X' dictionary + + Returns: + (Dict[str, Any]): the updated 'X' dictionary + """ + X['dataset_properties'].update({'feature_shapes': self.feature_shapes}) + return BaseEncoder.transform(self, X) + diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py index 1b23374d2..84d4f5677 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py @@ -18,9 +18,9 @@ class TimeSeriesFeatureImputer(BaseTimeSeriesImputer, autoPyTorchTimeSeriesPreprocessingComponent): - def __int__(self, + def __init__(self, random_state: Optional[np.random.RandomState] = None, - imputation_strategy: str = 'mean',): + imputation_strategy: str = 'mean'): super().__init__() self.random_state = random_state self.imputation_strategy = imputation_strategy @@ -49,7 +49,7 @@ def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseTimeSeriesImput imputer = Imputer(method='constant', random_state=self.random_state, value=0) self.preprocessor['numerical'] = imputer else: - imputer = Imputer(method=self.numerical_strategy, random_state=self.random_state, value=0) + imputer = Imputer(method=self.imputation_strategy, random_state=self.random_state, value=0) self.preprocessor['numerical'] = imputer return self @@ -86,7 +86,7 @@ def get_hyperparameter_search_space( class TimeSeriesTargetImputer(BaseTimeSeriesImputer, autoPyTorchTimeSeriesTargetPreprocessingComponent): - def __int__(self, + def __init__(self, random_state: Optional[np.random.RandomState] = None, imputation_strategy: str = 'mean',): super().__init__() @@ -146,6 +146,7 @@ def get_hyperparameter_search_space( """ Time series imputor, for the sake of speed, we only allow local imputation here (i.e., the filled value only depends on its neighbours) + # TODO: Transformer for mean and median: df.fillna(df.groupby(df.index).agg('mean'))... 
Args: dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): dataset properties imputation_strategy: which strategy to use, its content is defined by diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/base_time_series_imputer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/base_time_series_imputer.py index f77f9d4e4..9550b534e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/base_time_series_imputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/base_time_series_imputer.py @@ -1,10 +1,8 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, Optional from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter -import numpy as np - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter @@ -40,4 +38,4 @@ def get_hyperparameter_search_space( """ cs = ConfigurationSpace() add_hyperparameter(cs, imputation_strategy, CategoricalHyperparameter) - return cs \ No newline at end of file + return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/__init__.py index e69de29bb..5c5dce4cd 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/__init__.py @@ -0,0 +1,114 @@ +import os +from collections import OrderedDict +from typing import Any, Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler + +scaling_directory = os.path.split(__file__)[0] +_scalers = find_components(__package__, + scaling_directory, + BaseScaler) + +_addons = ThirdPartyComponents(BaseScaler) + + +def add_scaler(scaler: BaseScaler) -> None: + _addons.add_component(scaler) + + +class ScalerChoice(autoPyTorchChoice): + """ + Allows for dynamically choosing scaling component at runtime + """ + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available scaler components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all BaseScalers components available + as choices for scaling + """ + components = OrderedDict() + components.update(_scalers) + components.update(_addons.components) + return components + + def get_hyperparameter_search_space(self, + dataset_properties: Optional[Dict[str, Any]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None) -> ConfigurationSpace: + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = dict() + + dataset_properties = {**self.dataset_properties, **dataset_properties} + + available_scalers = self.get_available_components(dataset_properties=dataset_properties, + include=include, + exclude=exclude) + + if 
len(available_scalers) == 0: + raise ValueError("no scalers found, please add a scaler") + + if default is None: + defaults = ['StandardScaler', 'MinMaxScaler', 'MaxAbsScaler', 'NoScaler'] + for default_ in defaults: + if default_ in available_scalers: + default = default_ + break + + # add only no scaler to choice hyperparameters in case the dataset is only categorical + if len(dataset_properties['numerical_features']) == 0: + default = 'NoScaler' + if include is not None and default not in include: + raise ValueError("Provided {} in include, however, " + "the dataset is incompatible with it".format(include)) + preprocessor = CSH.CategoricalHyperparameter('__choice__', + ['NoScaler'], + default_value=default) + else: + preprocessor = CSH.CategoricalHyperparameter('__choice__', + list(available_scalers.keys()), + default_value=default) + cs.add_hyperparameter(preprocessor) + + # add only child hyperparameters of early_preprocessor choices + for name in preprocessor.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_scalers[name].get_hyperparameter_search_space(dataset_properties, # type:ignore + **updates) + parent_hyperparameter = {'parent': preprocessor, 'value': name} + cs.add_configuration_space(name, config_space, + parent_hyperparameter=parent_hyperparameter) + + self.configuration_space = cs + self.dataset_properties = dataset_properties + return cs + + def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None: + """ + A mechanism in code to ensure the correctness of the fit dictionary + It recursively makes sure that the children and parent level requirements + are honored before fit. + Args: + dataset_properties (Dict[str, Any]): dictionary holding the dataset properties + + """ + super()._check_dataset_properties(dataset_properties) + assert "numerical_features" in dataset_properties and \ + "categorical_features" in dataset_properties, \ + "Dataset properties must contain information about the type of features" diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py index cd2f4093a..df191182c 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py @@ -3,12 +3,13 @@ import numpy as np from sklearn.pipeline import Pipeline -#from sktime.transformations.panel.compose import ColumnTransformer import torch -from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent +from ConfigSpace import ConfigurationSpace +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.utils import TargetScaler @@ -65,3 +66,10 @@ def __call__(self, past_observed_values, future_targets) return past_target, future_targets, loc, scale + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + return cs diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py index b644eb83b..5649c2817 100644 --- 
a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py @@ -15,7 +15,7 @@ class TargetScaler(BaseEstimator): def __init__(self, mode: str): self.mode = mode - def fit(self, X: Dict, y: Any = None) -> "TimeSeriesScalerBatch": + def fit(self, X: Dict, y: Any = None) -> "TargetScaler": return self def transform(self, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index 1f966f179..3bc98d148 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -163,7 +163,9 @@ def get_hyperparameter_search_space( future_feature_shapes = dataset_properties.get('future_feature_shapes', (0,)) if num_in_features[-1] != future_feature_shapes[-1]: # deepAR model cannot be applied - auto_regressive.value_range = False + auto_regressive = HyperparameterSearchSpace(hyperparameter=auto_regressive.hyperparameter, + value_range=[False], + default_value=False,) cs = ConfigurationSpace() diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index a508428ec..5ea33f371 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -410,29 +410,30 @@ def get_hyperparameter_search_space( deep_ar_hp = ':'.join([self.deepAR_decoder_prefix, self.deepAR_decoder_name, 'auto_regressive']) if deep_ar_hp in cs: deep_ar_hp = cs.get_hyperparameter(deep_ar_hp) - forbidden_deep_ar = ForbiddenEqualsClause(deep_ar_hp, True) - if min_num_blocks == 1: - if max_num_blocks > 1: - if max_num_blocks - min_num_blocks > 1: - forbidden = ForbiddenAndConjunction( - ForbiddenInClause(num_blocks, list(range(1, max_num_blocks))), + if True in deep_ar_hp.choices: + forbidden_deep_ar = ForbiddenEqualsClause(deep_ar_hp, True) + if min_num_blocks == 1: + if max_num_blocks > 1: + if max_num_blocks - min_num_blocks > 1: + forbidden = ForbiddenAndConjunction( + ForbiddenInClause(num_blocks, list(range(1, max_num_blocks))), + forbidden_deep_ar + ) + else: + forbidden = ForbiddenAndConjunction(ForbiddenEqualsClause(num_blocks, 2), forbidden_deep_ar) + cs.add_forbidden_clause(forbidden) + + forbidden_deep_ars = [] + + hps_forbidden_deep_ar = [variable_selection, use_temporal_fusion] + for hp_forbidden_deep_ar in hps_forbidden_deep_ar: + if True in hp_forbidden_deep_ar.choices: + forbidden_deep_ars.append(ForbiddenAndConjunction( + ForbiddenEqualsClause(hp_forbidden_deep_ar, True), forbidden_deep_ar - ) - else: - forbidden = ForbiddenAndConjunction(ForbiddenEqualsClause(num_blocks, 2), forbidden_deep_ar) - cs.add_forbidden_clause(forbidden) - - forbidden_deep_ars = [] - - hps_forbidden_deep_ar = [variable_selection, use_temporal_fusion] - for hp_forbidden_deep_ar in hps_forbidden_deep_ar: - if True in hp_forbidden_deep_ar.choices: - forbidden_deep_ars.append(ForbiddenAndConjunction( - 
ForbiddenEqualsClause(hp_forbidden_deep_ar, True), - forbidden_deep_ar - )) - if forbidden_deep_ars: - cs.add_forbidden_clauses(forbidden_deep_ars) + )) + if forbidden_deep_ars: + cs.add_forbidden_clauses(forbidden_deep_ars) if True in skip_connection.choices: forbidden_mlp_skip = [] diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index bdff13d17..4548dfef0 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -79,11 +79,11 @@ def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int, n_pr mase_denominator = forecasting_metrics.mean_absolute_error(past_target[sp:], past_target[:-sp], multioutput="raw_values") - if mase_denominator == 0.0: - # they will not be counter when computing MASE - return np.zeros_like(mase_denominator) - return 1.0 / np.maximum(mase_denominator, forecasting_metrics._functions.EPS) + return np.where(mase_denominator == 0.0, + np.zeros_like(mase_denominator), + 1.0 / np.maximum(mase_denominator, forecasting_metrics._functions.EPS) + ) mean_MASE_forecasting = make_metric('mean_MASE_forecasting', diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index e8f666fb0..8bc1149ef 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -21,13 +21,14 @@ from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( TimeSeriesTransformer ) -#from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.TimeSeriesFeatureImputer import ( -# TimeSeriesFeatureImputer -#) -#from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.TimeSeriesTargetImputer import ( -# TimeSeriesTargetImputer -#) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding import ( + TimeSeriesEncoderChoice +) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.TimeSeriesImputer import ( + TimeSeriesFeatureImputer, + TimeSeriesTargetImputer, +) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling import ScalerChoice from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent @@ -223,9 +224,10 @@ def _get_hyperparameter_search_space(self, if losses_non_ar: forbidden_hp_regression_loss = ForbiddenInClause(hp_loss, losses_non_ar) for hp_ar in hp_deepAR: - forbidden_hp_dist = ForbiddenEqualsClause(hp_ar, ar_forbidden) - forbidden_hp_dist = ForbiddenAndConjunction(forbidden_hp_dist, forbidden_hp_regression_loss) - forbidden_losses_all.append(forbidden_hp_dist) + if True in hp_ar.choices: + forbidden_hp_dist = ForbiddenEqualsClause(hp_ar, ar_forbidden) + forbidden_hp_dist = ForbiddenAndConjunction(forbidden_hp_dist, forbidden_hp_regression_loss) + forbidden_losses_all.append(forbidden_hp_dist) decoder_auto_regressive = cs.get_hyperparameter("network_backbone:seq_encoder:decoder_auto_regressive") forecast_strategy = cs.get_hyperparameter("loss:DistributionLoss:forecast_strategy") 
@@ -246,19 +248,22 @@ def _get_hyperparameter_search_space(self, forbidden = ['MLPEncoder'] forbidden_deepAREncoder = [forbid for forbid in forbidden if forbid in network_flat_encoder_hp.choices] for hp_ar in hp_deepAR: - forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) - forbidden_hp_mlpencoder = ForbiddenInClause(network_flat_encoder_hp, forbidden_deepAREncoder) - forbidden_hp_ar_mlp = ForbiddenAndConjunction(forbidden_hp_ar, forbidden_hp_mlpencoder) - forbidden_losses_all.append(forbidden_hp_ar_mlp) + if True in hp_ar.choices: + forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) + forbidden_hp_mlpencoder = ForbiddenInClause(network_flat_encoder_hp, forbidden_deepAREncoder) + forbidden_hp_ar_mlp = ForbiddenAndConjunction(forbidden_hp_ar, forbidden_hp_mlpencoder) + forbidden_losses_all.append(forbidden_hp_ar_mlp) forecast_strategy = cs.get_hyperparameter('loss:DistributionLoss:forecast_strategy') if 'mean' in forecast_strategy.choices: for hp_ar in hp_deepAR: - forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) - forbidden_hp_forecast_strategy = ForbiddenEqualsClause(forecast_strategy, 'mean') - forbidden_hp_ar_forecast_strategy = ForbiddenAndConjunction(forbidden_hp_ar, - forbidden_hp_forecast_strategy) - forbidden_losses_all.append(forbidden_hp_ar_forecast_strategy) + if True in hp_ar.choices: + + forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) + forbidden_hp_forecast_strategy = ForbiddenEqualsClause(forecast_strategy, 'mean') + forbidden_hp_ar_forecast_strategy = ForbiddenAndConjunction(forbidden_hp_ar, + forbidden_hp_forecast_strategy) + forbidden_losses_all.append(forbidden_hp_ar_forecast_strategy) cs.add_forbidden_clauses(forbidden_losses_all) @@ -319,13 +324,16 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L if not default_dataset_properties.get("uni_variant", False): steps.extend([("preprocessing", EarlyPreprocessing(random_state=self.random_state)), - ("imputer", SimpleImputer(random_state=self.random_state)), + ("imputer", TimeSeriesFeatureImputer(random_state=self.random_state)), ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), + ('encoding', TimeSeriesEncoderChoice(default_dataset_properties, + random_state=self.random_state)), ("time_series_transformer", TimeSeriesTransformer(random_state=self.random_state)), ]) # TODO consider the correct way of doing imputer for time series forecasting tasks. 
steps.extend([ + ("target_imputer", TimeSeriesTargetImputer(random_state=self.random_state)), ('loss', ForecastingLossChoices(default_dataset_properties, random_state=self.random_state)), ("target_scaler", TargetScalerChoice(default_dataset_properties, random_state=self.random_state)), From 7eb51396e35fde94d3dfeee51716f8a0fe09e8f4 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 26 Apr 2022 12:07:03 +0200 Subject: [PATCH 220/347] maint --- .../time_series_preprocessing/TimeSeriesTransformer.py | 4 ++-- .../imputation/TimeSeriesImputer.py | 4 ++-- .../early_preprocessor/TimeSeriesEarlyPreProcessing.py | 2 +- autoPyTorch/pipeline/time_series_forecasting.py | 8 ++++++-- requirements.txt | 2 +- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py index 9291e900b..60695eb5b 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py @@ -5,8 +5,8 @@ from sklearn.base import BaseEstimator from sklearn.pipeline import Pipeline, make_pipeline -from sktime.transformations.panel.compose import ColumnTransformer - +#from sktime.transformations.panel.compose import ColumnTransformer +from sklearn.compose import ColumnTransformer from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( autoPyTorchTimeSeriesPreprocessingComponent, autoPyTorchTimeSeriesTargetPreprocessingComponent, diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py index 84d4f5677..2d0db6d7e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py @@ -49,7 +49,7 @@ def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseTimeSeriesImput imputer = Imputer(method='constant', random_state=self.random_state, value=0) self.preprocessor['numerical'] = imputer else: - imputer = Imputer(method=self.imputation_strategy, random_state=self.random_state, value=0) + imputer = Imputer(method=self.imputation_strategy, random_state=self.random_state) self.preprocessor['numerical'] = imputer return self @@ -114,7 +114,7 @@ def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseTimeSeriesImput imputer = Imputer(method='constant', random_state=self.random_state, value=0) self.preprocessor['target_numerical'] = imputer else: - imputer = Imputer(method=self.imputation_strategy, random_state=self.random_state, value=0) + imputer = Imputer(method=self.imputation_strategy, random_state=self.random_state) self.preprocessor['target_numerical'] = imputer return self diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py index b421edef3..fe9c57df4 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py @@ -18,7 +18,7 @@ from 
autoPyTorch.utils.common import FitRequirement -class TimeSeriesEarllyPreprocessing(EarlyPreprocessing): +class TimeSeriesEarlyPreprocessing(EarlyPreprocessing): def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None: super().__init__() diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 8bc1149ef..6edf595b0 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -29,7 +29,10 @@ TimeSeriesTargetImputer, ) from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling import ScalerChoice -from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing +from autoPyTorch.pipeline.components.setup.early_preprocessor.TimeSeriesEarlyPreProcessing import ( + TimeSeriesEarlyPreprocessing, + TimeSeriesTargetEarlyPreprocessing +) from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent from autoPyTorch.pipeline.components.setup.network_embedding import NetworkEmbeddingChoice @@ -323,7 +326,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L default_dataset_properties.update(dataset_properties) if not default_dataset_properties.get("uni_variant", False): - steps.extend([("preprocessing", EarlyPreprocessing(random_state=self.random_state)), + steps.extend([("preprocessing", TimeSeriesEarlyPreprocessing(random_state=self.random_state)), ("imputer", TimeSeriesFeatureImputer(random_state=self.random_state)), ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), ('encoding', TimeSeriesEncoderChoice(default_dataset_properties, @@ -333,6 +336,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L # TODO consider the correct way of doing imputer for time series forecasting tasks. 
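# Earlier in this patch the TimeSeriesTransformer switches from sktime's panel
# ColumnTransformer to sklearn's ColumnTransformer, which operates directly on the
# flattened (stacked) per-series data produced by the validators. A minimal sketch
# of that kind of usage; the column indices and preprocessing steps are assumptions
# for illustration, not the pipeline's actual configuration.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X_flat = np.array([[1.0, np.nan],
                   [2.0, 3.0],
                   [4.0, 5.0]])                                   # stacked rows from all sequences
numerical_pipeline = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
preprocessor = ColumnTransformer(
    transformers=[('numerical_pipeline', numerical_pipeline, [0, 1])],
    remainder='passthrough')
X_transformed = preprocessor.fit_transform(X_flat)                # rows stay aligned with the series index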
steps.extend([ + ("target_preprocessing", TimeSeriesTargetEarlyPreprocessing(random_state=self.random_state)), ("target_imputer", TimeSeriesTargetImputer(random_state=self.random_state)), ('loss', ForecastingLossChoices(default_dataset_properties, random_state=self.random_state)), ("target_scaler", TargetScalerChoice(default_dataset_properties, diff --git a/requirements.txt b/requirements.txt index 1757e3727..7940e7f04 100755 --- a/requirements.txt +++ b/requirements.txt @@ -18,5 +18,5 @@ catboost lightgbm flaky tabulate -sktime>=0.8.0 +sktime>=0.11.0 From 22fc0bc26c0aeebb027122302a99e0fea47d5561 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 26 Apr 2022 15:58:57 +0200 Subject: [PATCH 221/347] embedding and maint --- .../data/time_series_feature_validator.py | 4 -- .../data/time_series_forecasting_validator.py | 4 +- autoPyTorch/datasets/time_series_dataset.py | 15 ++-- .../TimeSeriesTransformer.py | 4 +- .../TimeSeriesEarlyPreProcessing.py | 4 +- .../setup/network/forecasting_architecture.py | 68 +++++++++---------- .../setup/network/forecasting_network.py | 4 +- .../base_forecasting_encoder.py | 5 +- .../LearnedEntityEmbedding.py | 32 +++++---- .../setup/network_embedding/NoEmbedding.py | 8 ++- .../base_network_embedding.py | 30 ++++++-- .../training/data_loader/time_series_util.py | 11 +-- .../pipeline/components/training/losses.py | 8 +-- .../trainer/forecasting_trainer/__init__.py | 6 +- .../forecasting_base_trainer.py | 60 ++++++++++------ 15 files changed, 151 insertions(+), 112 deletions(-) diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index 6d4bffcac..fe3edeffb 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -84,7 +84,3 @@ def transform( X: pd.DataFrame = pd.DataFrame(X, columns=self.get_reordered_columns(), index=index) return X - - - - diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 06da40846..f1257e308 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -190,8 +190,6 @@ def transform( y_stacked = self.join_series(y) - y_transformed: pd.DataFrame = self.target_validator.transform(y_stacked) - if self.series_idx is None: series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) if not self._is_uni_variant: @@ -212,6 +210,8 @@ def transform( if not self._is_uni_variant: x_transformed = self.feature_validator.transform(x_flat.drop(self.series_idx, axis=1), index=series_number) + y_transformed: pd.DataFrame = self.target_validator.transform(y_stacked, index=series_number) + if self._is_uni_variant: return None, y_transformed, sequence_lengths diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 67ca765b1..cc082434c 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -173,6 +173,7 @@ def __getitem__(self, index: int, train: bool = True) \ ]) else: future_features = self._cached_time_features[index + 1:index + self.n_prediction_steps + 1] + if future_features is not None and future_features.shape[0] == 0: future_features = None @@ -194,7 +195,9 @@ def __getitem__(self, index: int, train: bool = True) \ future_targets = torch.from_numpy(future_targets) future_targets = { 'future_targets': future_targets, - 'future_observed_targets': 
self.observed_target[index + 1: index + self.n_prediction_steps + 1] + 'future_observed_targets': torch.from_numpy( + self.observed_target[index + 1: index + self.n_prediction_steps + 1] + ) } if isinstance(past_features, np.ndarray): @@ -206,16 +209,13 @@ def __getitem__(self, index: int, train: bool = True) \ past_target = targets[:index + 1] past_target = torch.from_numpy(past_target) - # TODO combine with imputer! - past_observed_values = torch.ones([past_target.shape[0], 1], dtype=torch.bool) - return {"past_targets": past_target, "past_features": past_features, "future_features": future_features, "static_features": self.static_features, "mase_coefficient": self.mase_coefficient, - 'past_observed_targets': self.observed_target[:index + 1], - 'decoder_lengths': None if future_targets is None else future_targets.shape[0]}, future_targets + 'past_observed_targets': torch.from_numpy(self.observed_target[:index + 1]), + 'decoder_lengths': 0 if future_targets is None else future_targets['future_targets'].shape[0]}, future_targets def __len__(self) -> int: return self.Y.shape[0] if self.only_has_past_targets else self.Y.shape[0] - self.n_prediction_steps @@ -584,7 +584,6 @@ def __init__(self, time_features_train=time_features_train, time_features_test=time_features_test, **sequences_kwargs) - self.normalize_y = normalize_y ConcatDataset.__init__(self, datasets=sequence_datasets) @@ -824,6 +823,8 @@ def make_sequences_datasets(self, def replace_data(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame]) -> 'BaseDataset': super(TimeSeriesForecastingDataset, self).replace_data(X_train=X_train, X_test=X_test) + if X_train is None: + return self if X_test is not None: X_test_group = X_test.groupby(X_test.index) for seq, x in zip(self.datasets, X_train.groupby(X_train.index)): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py index 60695eb5b..1639e7cd5 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py @@ -5,8 +5,8 @@ from sklearn.base import BaseEstimator from sklearn.pipeline import Pipeline, make_pipeline -#from sktime.transformations.panel.compose import ColumnTransformer from sklearn.compose import ColumnTransformer + from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( autoPyTorchTimeSeriesPreprocessingComponent, autoPyTorchTimeSeriesTargetPreprocessingComponent, @@ -15,7 +15,7 @@ get_time_series_preprocessers, get_time_series_target_preprocessers, ) -from autoPyTorch.utils.common import FitRequirement, subsampler +from autoPyTorch.utils.common import FitRequirement class TimeSeriesTransformer(autoPyTorchTimeSeriesPreprocessingComponent): diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py index fe9c57df4..21fdc7f5d 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py @@ -21,7 +21,7 @@ class TimeSeriesEarlyPreprocessing(EarlyPreprocessing): def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None: - 
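# The per-sample dictionaries assembled by __getitem__ above, written out with toy
# tensors for readability. The keys mirror the diff; the sizes (20 past steps,
# 3 prediction steps, 1 target) and the constant values are arbitrary examples.
import torch

past_sample = {
    'past_targets': torch.zeros(20, 1),
    'past_features': None,
    'future_features': None,
    'static_features': None,
    'mase_coefficient': 1.0,
    'past_observed_targets': torch.ones(20, 1, dtype=torch.bool),
    'decoder_lengths': 3,
}
future_sample = {
    'future_targets': torch.zeros(3, 1),
    'future_observed_targets': torch.ones(3, 1, dtype=torch.bool),
}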
super().__init__() + super(EarlyPreprocessing, self).__init__() self.random_state = random_state self.add_fit_requirements([ FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), @@ -56,7 +56,7 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT class TimeSeriesTargetEarlyPreprocessing(EarlyPreprocessing): def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None: - super().__init__() + super(EarlyPreprocessing, self).__init__() self.random_state = random_state self.add_fit_requirements([ FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 8654441b0..cded15796 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -313,7 +313,7 @@ def forward(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - past_observed_values: Optional[torch.Tensor] = None, + past_observed_targets: Optional[torch.Tensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): raise NotImplementedError @@ -349,7 +349,7 @@ def repeat_intermediate_values(self, class ForecastingNet(AbstractForecastingNet): def pre_processing(self, past_targets: torch.Tensor, - past_observed_values: torch.BoolTensor, + past_observed_targets: torch.BoolTensor, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, @@ -360,10 +360,10 @@ def pre_processing(self, if self.encoder_lagged_input: past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( past_targets[:, -self.window_size:], - past_observed_values[:, -self.window_size:] + past_observed_targets[:, -self.window_size:] ) past_targets[:, :-self.window_size] = torch.where( - past_observed_values[:, :-self.window_size], + past_observed_targets[:, :-self.window_size], self.scale_value(past_targets[:, :-self.window_size], loc, scale), past_targets[:, :-self.window_size]) x_past, self.cached_lag_mask_encoder = get_lagged_subsequences(past_targets, @@ -373,8 +373,8 @@ def pre_processing(self, else: if self.window_size < past_targets.shape[1]: past_targets = past_targets[:, -self.window_size:] - past_observed_values = past_observed_values[:, -self.window_size:] - past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_values) + past_observed_targets = past_observed_targets[:, -self.window_size:] + past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_targets) x_past = past_targets if self.network_structure.variable_selection: @@ -445,12 +445,12 @@ def forward(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - past_observed_values: Optional[torch.BoolTensor] = None, + past_observed_targets: Optional[torch.BoolTensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): x_past, x_future, x_static, loc, scale, static_context_initial_hidden, _ = self.pre_processing( past_targets=past_targets, - past_observed_values=past_observed_values, + past_observed_targets=past_observed_targets, past_features=past_features, 
future_features=future_features, static_features=static_features, @@ -468,7 +468,7 @@ def forward(self, if self.has_temporal_fusion: decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output, - past_observed_values=past_observed_values, + past_observed_values=past_observed_targets, decoder_length=self.n_prediction_steps, static_embedding=x_static ) @@ -509,13 +509,13 @@ def predict(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - past_observed_values: Optional[torch.BoolTensor] = None, + past_observed_targets: Optional[torch.BoolTensor] = None, ): net_output = self(past_targets=past_targets, past_features=past_features, future_features=future_features, static_features=static_features, - past_observed_values=past_observed_values) + past_observed_targets=past_observed_targets) return self.pred_from_net_output(net_output) @@ -568,11 +568,11 @@ def forward(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - past_observed_values: Optional[torch.BoolTensor] = None, + past_observed_targets: Optional[torch.BoolTensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): x_past, x_future, x_static, loc, scale, static_context_initial_hidden, past_targets = self.pre_processing( past_targets=past_targets, - past_observed_values=past_observed_values, + past_observed_targets=past_observed_targets, past_features=past_features, future_features=future_features, static_features=static_features, @@ -611,7 +611,7 @@ def forward(self, if self.has_temporal_fusion: decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output, - past_observed_values=past_observed_values, + past_observed_targets=past_observed_targets, decoder_length=self.n_prediction_steps, static_embedding=x_static ) @@ -663,7 +663,7 @@ def forward(self, decoder_output_all = decoder_output decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output_all, - past_observed_values=past_observed_values, + past_observed_targets=past_observed_targets, decoder_length=idx_pred + 1, static_embedding=x_static )[:, -1:] @@ -693,12 +693,12 @@ def forward(self, repeats=self.num_samples) if self.has_temporal_fusion: - intermediate_values = self.repeat_intermediate_values([encoder_output, past_observed_values], + intermediate_values = self.repeat_intermediate_values([encoder_output, past_observed_targets], is_hidden_states=[False, False], repeats=self.num_samples) encoder_output = intermediate_values[0] - past_observed_values = intermediate_values[1] + past_observed_targets = intermediate_values[1] if self.decoder_lagged_input: max_lag_seq_length = max(self.decoder_lagged_value) + 1 @@ -752,7 +752,7 @@ def forward(self, decoder_output_all = decoder_output decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output_all, - past_observed_values=past_observed_values, + past_observed_targets=past_observed_targets, decoder_length=idx_pred + 1, static_embedding=x_static, ) @@ -778,13 +778,13 @@ def predict(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - past_observed_values: Optional[torch.BoolTensor] = None, + past_observed_targets: Optional[torch.BoolTensor] = None, ): net_output = 
self(past_targets=past_targets, past_features=past_features, future_features=future_features, static_features=static_features, - past_observed_values=past_observed_values) + past_observed_targets=past_observed_targets) if self.output_type == 'regression': return self.pred_from_net_output(net_output) else: @@ -846,17 +846,17 @@ def forward(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - past_observed_values: Optional[torch.Tensor] = None, + past_observed_targets: Optional[torch.Tensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): if self.training: if self.encoder_lagged_input: past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( past_targets[:, -self.window_size:], - past_observed_values[:, -self.window_size:] + past_observed_targets[:, -self.window_size:] ) past_targets[:, :-self.window_size] = torch.where( - past_observed_values[:, :-self.window_size], + past_observed_targets[:, :-self.window_size], self.scale_value(past_targets[:, :-self.window_size], loc, scale), past_targets[:, :-self.window_size]) @@ -872,8 +872,8 @@ def forward(self, else: if self.window_size < past_targets.shape[1]: past_targets = past_targets[:, -self.window_size:] - past_observed_values = past_observed_values[:, -self.window_size:] - past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_values) + past_observed_targets = past_observed_targets[:, -self.window_size:] + past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_targets) future_targets = self.scale_value(future_targets, loc, scale) targets_all = torch.cat([past_targets, future_targets[:, :-1]], dim=1) @@ -914,11 +914,11 @@ def forward(self, if self.encoder_lagged_input: past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( past_targets[:, -self.window_size:], - past_observed_values[:, -self.window_size:], + past_observed_targets[:, -self.window_size:], ) past_targets[:, :-self.window_size] = torch.where( - past_observed_values[:, :-self.window_size], + past_observed_targets[:, :-self.window_size], self.scale_value(past_targets[:, :-self.window_size], loc, scale), past_targets[:, :-self.window_size]) @@ -929,8 +929,8 @@ def forward(self, else: if self.window_size < past_targets.shape[1]: past_targets = past_targets[:, -self.window_size:] - past_observed_values = past_observed_values[:, -self.window_size] - past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_values) + past_observed_targets = past_observed_targets[:, -self.window_size] + past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_targets) x_past = past_targets if self.network_structure.variable_selection: @@ -1070,13 +1070,13 @@ def predict(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - past_observed_values: Optional[torch.BoolTensor] = None, + past_observed_targets: Optional[torch.BoolTensor] = None, ): net_output = self(past_targets=past_targets, past_features=past_features, future_features=future_features, static_features=static_features, - past_observed_values=past_observed_values) + past_observed_targets=past_observed_targets) return net_output @@ -1089,13 +1089,13 @@ def forward(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, static_features: Optional[torch.Tensor] = None, - 
past_observed_values: Optional[torch.Tensor] = None, + past_observed_targets: Optional[torch.Tensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): if self.window_size < past_targets.shape[1]: past_targets = past_targets[:, -self.window_size:] - past_observed_values = past_observed_values[:, -self.window_size:] + past_observed_targets = past_observed_targets[:, -self.window_size:] - past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_values) + past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_targets) past_targets = past_targets.to(self.device) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 38a9311e9..946a17f76 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -143,7 +143,7 @@ def predict(self, loader: torch.utils.data.DataLoader, past_features = X_batch['past_features'] future_features = X_batch["future_features"] static_features = X_batch["static_features"] - past_observed_values = X_batch['past_observed_values'] + past_observed_targets = X_batch['past_observed_targets'] if past_targets.ndim == 2: past_targets = past_targets.unsqueeze(-1) @@ -157,7 +157,7 @@ def predict(self, loader: torch.utils.data.DataLoader, if pred_kwargs[key] is not None: pred_kwargs[key] = pred_kwargs[key].float() - pred_kwargs.update({'past_observed_values': past_observed_values}) + pred_kwargs.update({'past_observed_targets': past_observed_targets}) with torch.no_grad(): Y_batch_pred = self.network.predict(**pred_kwargs) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index f75f9930e..fa2145242 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -45,8 +45,6 @@ def __init__(self, def _required_fit_arguments(self) -> List[FitRequirement]: return [ FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), - FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, - dataset_property=False), FitRequirement('y_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, dataset_property=False), FitRequirement('uni_variant', (bool,), user_defined=False, dataset_property=True), @@ -60,7 +58,8 @@ def _required_fit_arguments(self) -> List[FitRequirement]: def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.check_requirements(X, y) - X_train = X['X_train'] + + X_train = X.get('X_train', None) y_train = X['y_train'] input_shape = X["dataset_properties"]['input_shape'] diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index 8ad6549a2..3823f75c0 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Union +from 
typing import Any, Dict, List, Optional, Union, Tuple from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -94,22 +94,26 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None, **kwarg super().__init__(random_state=random_state) self.config = kwargs - def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: - return _LearnedEntityEmbedding(config=self.config, - num_input_features=num_input_features, - num_numerical_features=num_numerical_features) + def build_embedding(self, + num_input_features: np.ndarray, + num_numerical_features: int) -> Tuple[nn.Module, List[int]]: + embedding = _LearnedEntityEmbedding(config=self.config, + num_input_features=num_input_features, + num_numerical_features=num_numerical_features) + return embedding, embedding.num_output_dimensions @staticmethod def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - min_unique_values_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter="min_unique_values_for_embedding", - value_range=(3, 7), - default_value=5, - log=True), - dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dimension_reduction", - value_range=(0, 1), - default_value=0.5), + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + min_unique_values_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="min_unique_values_for_embedding", + value_range=(3, 7), + default_value=5, + log=True), + dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="dimension_reduction", + value_range=(0, 1), + default_value=0.5), ) -> ConfigurationSpace: cs = ConfigurationSpace() add_hyperparameter(cs, min_unique_values_for_embedding, UniformIntegerHyperparameter) diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py index 028dfb77b..9dabac6ad 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional, Union +from typing import Dict, Optional, Union, Tuple, List from ConfigSpace.configuration_space import ConfigurationSpace @@ -24,8 +24,10 @@ class NoEmbedding(NetworkEmbeddingComponent): def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): super().__init__(random_state=random_state) - def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: - return _NoEmbedding() + def build_embedding(self, + num_input_features: np.ndarray, + num_numerical_features: int) -> Tuple[nn.Module, List[int]]: + return _NoEmbedding(), list(num_input_features) @staticmethod def get_hyperparameter_search_space( diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index f8698f38d..1002393b8 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,5 +1,5 @@ import copy -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union, List 
import numpy as np @@ -15,21 +15,33 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N super().__init__() self.embedding: Optional[nn.Module] = None self.random_state = random_state + self.feature_shapes: Dict[str, int] = {} def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: num_numerical_columns, num_input_features = self._get_args(X) - self.embedding = self.build_embedding( + self.embedding, num_output_features = self.build_embedding( num_input_features=num_input_features, - num_numerical_features=num_numerical_columns) + num_numerical_features=num_numerical_columns, + ) + if "feature_shapes" in X['dataset_properties']: + # forecasting tasks + feature_names = X['dataset_properties']['feature_names'] + for idx_cat, n_output_cat in enumerate(num_output_features[num_numerical_columns:]): + cat_feature_name = feature_names[idx_cat] + self.feature_shapes[cat_feature_name] = n_output_cat return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'network_embedding': self.embedding}) + if "feature_shapes" in X['dataset_properties']: + X['dataset_properties'].update({"feature_shapes": self.feature_shapes}) return X - def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: + def build_embedding(self, + num_input_features: np.ndarray, + num_numerical_features: int) -> Tuple[nn.Module, List[int]]: raise NotImplementedError def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: @@ -46,12 +58,16 @@ def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: named_transformers_['numerical_pipeline'] else: raise ValueError("Either a tabular or time_series transformer must be contained!") - num_numerical_columns = numerical_column_transformer.transform( - X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] + if hasattr(X_train, 'iloc'): + num_numerical_columns = numerical_column_transformer.transform( + X_train.iloc[:, X['dataset_properties']['numerical_columns']]).shape[1] + else: + num_numerical_columns = numerical_column_transformer.transform( + X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), dtype=np.int32) categories = X['dataset_properties']['categories'] for i, category in enumerate(categories): - num_input_features[num_numerical_columns + i, ] = len(category) + num_input_features[num_numerical_columns + i,] = len(category) return num_numerical_columns, num_input_features diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py index 6512de1e1..5d4469223 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py @@ -112,11 +112,12 @@ def __call__(self, batch, sample_interval=1, seq_minimal_length=1, padding_value elif isinstance(elem, collections.abc.Mapping): # only past targets and features needs to be transformed - return {key: self([d[key] for d in batch]) if "past" not in key else self([d[key] for d in batch], - self.sample_interval, - self.window_size, - self.target_padding_value) for key - in elem} + return { + key: self([d[key] for d in batch]) if "past" not in key else self([d[key] for d in batch], + self.sample_interval, + self.window_size, + self.target_padding_value) for key + in elem} elif elem is None: 
return None diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index 2730e9227..a241eaa7a 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -29,7 +29,7 @@ class LogProbLoss(Loss): __constants__ = ['reduction'] def __init__(self, reduction: str = 'mean') -> None: - super(LogProbLoss, self).__init__(reduction) + super(LogProbLoss, self).__init__(reduction=reduction) def forward(self, input_dist: torch.distributions.Distribution, target_tensor: torch.Tensor) -> torch.Tensor: scores = input_dist.log_prob(target_tensor) @@ -45,7 +45,7 @@ class MAPELoss(Loss): __constants__ = ['reduction'] def __init__(self, reduction: str = 'mean') -> None: - super(MAPELoss, self).__init__(reduction) + super(MAPELoss, self).__init__(reduction=reduction) def forward(self, input: torch.distributions.Distribution, target_tensor: torch.Tensor) -> torch.Tensor: # https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/model/n_beats/_network.py @@ -68,7 +68,7 @@ class MASELoss(Loss): __constants__ = ['reduction'] def __init__(self, reduction: str = 'mean') -> None: - super(MASELoss, self).__init__(reduction) + super(MASELoss, self).__init__(reduce=reduction) self._mase_coefficient = 1.0 def set_mase_coefficient(self, mase_coefficient: torch.Tensor) -> 'MASELoss': @@ -93,7 +93,7 @@ class QuantileLoss(Loss): __constants__ = ['reduction'] def __init__(self, reduction: str = 'mean', quantiles: List[float] = [0.5], loss_weights=None) -> None: - super(QuantileLoss, self).__init__(reduction) + super(QuantileLoss, self).__init__(reduction=reduction) self.quantiles = quantiles def set_quantiles(self, quantiles = List[float]): diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index d53937391..2c29c0908 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -67,6 +67,10 @@ def prepare_trainer(self, X): metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=X['additional_metrics'])) if 'optimize_metric' in X and X['optimize_metric'] not in [m.name for m in metrics]: metrics.extend(get_metrics(dataset_properties=X['dataset_properties'], names=[X['optimize_metric']])) + if hasattr(X['y_train'], "to_numpy"): + labels = X['y_train'].to_numpy()[X['backend'].load_datamanager().splits[X['split_id']][0]] + else: + labels = X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]] self.choice.prepare( model=X['network'], @@ -78,7 +82,7 @@ def prepare_trainer(self, X): metrics_during_training=X['metrics_during_training'], scheduler=X['lr_scheduler'], task_type=STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']], - labels=X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]], + labels=labels, step_interval=X['step_interval'], window_size=X['window_size'], dataset_properties=X['dataset_properties'], diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 425fcf4b6..6062e39a9 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ 
b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -58,6 +58,13 @@ def prepare( labels=labels, step_interval=step_interval ) + # Weights for the loss function + kwargs = {} + if self.weighted_loss: + kwargs = self.get_class_weights(criterion, labels) + kwargs["reduction"] = 'none' + # Setup the loss function + self.criterion = criterion(**kwargs) metric_kwargs = {"sp": dataset_properties.get("sp", 1), "n_prediction_steps": dataset_properties.get("n_prediction_steps", 1)} self.metrics_kwargs = metric_kwargs @@ -139,7 +146,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor float: the loss incurred in the prediction """ past_target = data['past_targets'].float() - past_observed_values = data['past_observed_values'] + past_observed_targets = data['past_observed_targets'] past_features = data["past_features"] if past_features is not None: @@ -151,7 +158,10 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor if static_features is not None: static_features = static_features.float() - future_targets = self.cast_targets(future_targets) + future_observed_targets = future_targets["future_observed_targets"] + future_targets_values = future_targets["future_targets"] + + future_targets_values = self.cast_targets(future_targets_values) if isinstance(self.criterion, MASELoss): self.criterion.set_mase_coefficient(data['mase_coefficient'].float().to(self.device)) @@ -161,11 +171,12 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor if isinstance(self.model, NBEATSNet): past_target = past_target[:, -self.window_size:] - past_observed_values = past_observed_values[:, -self.window_size:] + past_observed_targets = past_observed_targets[:, -self.window_size:] past_target, criterion_kwargs_past = self.data_preparation(past_target, past_target.to(self.device)) - past_target, criterion_kwargs_future = self.data_preparation(past_target, future_targets.to(self.device)) - backcast, forecast = self.model(past_targets=past_target, past_observed_values=past_observed_values) + past_target, criterion_kwargs_future = self.data_preparation(past_target, + future_targets_values.to(self.device)) + backcast, forecast = self.model(past_targets=past_target, past_observed_targets=past_observed_targets) loss_func_backcast = self.criterion_preparation(**criterion_kwargs_past) loss_func_forecast = self.criterion_preparation(**criterion_kwargs_future) @@ -179,26 +190,28 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor else: if isinstance(self.model, ForecastingDeepARNet) and self.model.encoder_bijective_seq_output: if self.window_size > past_target.shape[1]: - all_targets = torch.cat([past_target[:, 1:, ], future_targets], dim=1) + all_targets = torch.cat([past_target[:, 1:, ], future_targets_values], dim=1) else: if self.window_size == 1: - all_targets = future_targets + all_targets = future_targets_values else: - all_targets = torch.cat([past_target[:, 1 - self.window_size:, ], future_targets], dim=1) + all_targets = torch.cat([past_target[:, 1 - self.window_size:, ], + future_targets_values], dim=1) past_target, criterion_kwargs = self.data_preparation(past_target, all_targets.to(self.device)) else: - past_target, criterion_kwargs = self.data_preparation(past_target, future_targets.to(self.device)) + past_target, criterion_kwargs = self.data_preparation(past_target, + future_targets_values.to(self.device)) outputs = 
self.model(past_targets=past_target, past_features=past_features, future_features=future_features, static_features=static_features, future_targets=future_targets, - past_observed_values=past_observed_values) + past_observed_targets=past_observed_targets) loss_func = self.criterion_preparation(**criterion_kwargs) - loss = loss_func(self.criterion, outputs) + loss = torch.mean(loss_func(self.criterion, outputs) * future_observed_targets) loss.backward() self.optimizer.step() @@ -236,7 +249,7 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, with torch.no_grad(): for step, (data, future_targets) in enumerate(test_loader): past_target = data['past_targets'].float() - past_observed_values = data['past_observed_values'] + past_observed_targets = data['past_observed_targets'] past_features = data["past_features"] if past_features is not None: @@ -254,32 +267,35 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, batch_size = past_target.shape[0] - future_targets = self.cast_targets(future_targets) + future_observed_targets = future_targets["future_observed_targets"] + future_targets_values = future_targets["future_targets"] + + future_targets_values = self.cast_targets(future_targets_values) - past_target, criterion_kwargs = self.data_preparation(past_target, future_targets) + past_target, criterion_kwargs = self.data_preparation(past_target, future_targets_values) if isinstance(self.model, (ForecastingDeepARNet, ForecastingSeq2SeqNet)): outputs = self.model(past_targets=past_target, past_features=past_features, - future_targets=future_targets, + future_targets=future_targets_values, future_features=future_features, static_features=static_features, - past_observed_values=past_observed_values) + past_observed_targets=past_observed_targets) else: outputs = self.model(past_targets=past_target, past_features=past_features, future_features=future_features, static_features=static_features, - past_observed_values=past_observed_values) + past_observed_targets=past_observed_targets) # prepare - future_targets = future_targets.to(self.device) + future_targets_values = future_targets_values.to(self.device) if isinstance(outputs, list) and self.model.output_type != 'quantile': - loss = [self.criterion(output, future_targets) for output in outputs] - loss = torch.mean(torch.Tensor(loss)) + loss = [self.criterion(output, future_targets_values) for output in outputs] + loss = torch.mean(torch.Tensor(loss) * future_observed_targets) else: - loss = self.criterion(outputs, future_targets) + loss = torch.mean(self.criterion(outputs, future_targets_values) * future_observed_targets) outputs = self.model.pred_from_net_output(outputs) outputs = outputs.detach().cpu() @@ -287,7 +303,7 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, N += batch_size outputs_data.append(outputs) - targets_data.append(future_targets.detach().cpu()) + targets_data.append(future_targets_values.detach().cpu()) if writer: writer.add_scalar( From 1759fdf6c4373a5ea05d9527f2d8c9d8e2997a41 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 26 Apr 2022 18:48:43 +0200 Subject: [PATCH 222/347] move targets to the tail of the features --- autoPyTorch/datasets/time_series_dataset.py | 17 ++++---- .../setup/network/forecasting_architecture.py | 16 ++++---- .../forecasting_backbone/cells.py | 39 ++++++++++++++----- 3 files changed, 47 insertions(+), 25 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 
cc082434c..19108cfbf 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -151,8 +151,9 @@ def __getitem__(self, index: int, train: bool = True) \ past_features = self.X[:index + 1] if self.known_future_features: - future_features = self.X.iloc[index + 1: index + self.n_prediction_steps + 1, - self.known_future_features] + future_features = self.X.iloc[ + index + 1: index + self.n_prediction_steps + 1, self.known_future_features + ] else: future_features = None else: @@ -164,13 +165,14 @@ def __getitem__(self, index: int, train: bool = True) \ self.compute_time_features() if past_features: - past_features = np.hstack([self._cached_time_features[:index + 1], past_features]) + past_features = np.hstack(past_features, [self._cached_time_features[:index + 1]]) else: past_features = self._cached_time_features[:index + 1] if future_features: - future_features = np.hstack([self._cached_time_features[ - index + 1:index + self.n_prediction_steps + 1], past_features - ]) + future_features = np.hstack([ + past_features, + self._cached_time_features[index + 1:index + self.n_prediction_steps + 1] + ]) else: future_features = self._cached_time_features[index + 1:index + self.n_prediction_steps + 1] @@ -215,7 +217,8 @@ def __getitem__(self, index: int, train: bool = True) \ "static_features": self.static_features, "mase_coefficient": self.mase_coefficient, 'past_observed_targets': torch.from_numpy(self.observed_target[:index + 1]), - 'decoder_lengths': 0 if future_targets is None else future_targets['future_targets'].shape[0]}, future_targets + 'decoder_lengths': 0 if future_targets is None else future_targets['future_targets'].shape[ + 0]}, future_targets def __len__(self) -> int: return self.Y.shape[0] if self.only_has_past_targets else self.Y.shape[0] - self.n_prediction_steps diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index cded15796..919b26e84 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -429,7 +429,7 @@ def pre_processing(self, else: if past_features is not None: past_features = past_features[:, -self.window_size:] - x_past = torch.cat([x_past, past_features], dim=-1) + x_past = torch.cat([past_features, x_past], dim=-1) x_past = x_past.to(device=self.device) if future_features is not None: @@ -598,7 +598,7 @@ def forward(self, if self.network_structure.variable_selection: x_future = self.decoder_select_variable(future_targets, future_features) else: - x_future = future_targets if future_features is None else torch.cat([future_targets, future_features], + x_future = future_targets if future_features is None else torch.cat([future_features, future_targets], dim=-1) x_future = x_future.to(self.device) @@ -645,8 +645,8 @@ def forward(self, future_features=future_features[:, [idx_pred]] if future_features is not None else None ) else: - x_future = x_future if future_features is None else torch.cat([x_future, - future_features[:, [idx_pred]]], + x_future = x_future if future_features is None else torch.cat([future_features[:, [idx_pred]], + x_future], dim=-1) x_future = x_future.to(self.device) @@ -736,7 +736,7 @@ def forward(self, future_features=None if repeated_time_feat is None else repeated_time_feat[:, [idx_pred]]) else: x_future = x_future if repeated_time_feat is None else torch.cat( - [x_future, 
repeated_time_feat[:, [idx_pred], :]], dim=-1) + [repeated_time_feat[:, [idx_pred], :], x_future], dim=-1) x_future = x_future.to(self.device) @@ -890,7 +890,7 @@ def forward(self, if past_features is not None: past_features = past_features[:, -self.window_size:] features_all = torch.cat([past_features[:, 1:], future_features], dim=1) - x_input = torch.cat([targets_all, features_all], dim=-1) + x_input = torch.cat([features_all, targets_all], dim=-1) x_input = x_input.to(self.device) @@ -958,7 +958,7 @@ def forward(self, features_all = future_features else: features_all = None - x_past = x_past if features_all is None else torch.cat([x_past, features_all[:, :self.window_size]], + x_past = x_past if features_all is None else torch.cat([features_all[:, :self.window_size], x_past], dim=-1) x_past = x_past.to(self.device) @@ -1040,7 +1040,7 @@ def forward(self, else: if repeated_time_feat is not None: - x_next = torch.cat([x_next, repeated_time_feat[:, [k - 1]]], dim=-1) + x_next = torch.cat([repeated_time_feat[:, [k - 1]], x_next], dim=-1) x_next = x_next.to(self.device) encoder2decoder, _ = self.encoder(encoder_input=x_next, additional_input=[None] * self.network_structure.num_blocks, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 2355abba4..136115d7b 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -190,7 +190,7 @@ def get_attention_mask(self, past_observed_values: torch.BoolTensor, decoder_len class VariableSelector(nn.Module): def __init__(self, network_structure: NetworkStructure, - dataset_properties: Dict, + dataset_properties: Dict[str, Any], network_encoder: Dict[str, EncoderBlockInfo], auto_regressive: bool = False, feature_names: Tuple[str] = (), @@ -198,6 +198,24 @@ def __init__(self, feature_shapes: Dict[str, int] = {}, time_feature_names: Tuple[str] = (), ): + """ + Variable Selector. This model follows the implementation from + pytorch_forecasting.models.temporal_fusion_transformer.sub_modules.VariableSelectionNetwork. + However, we adjust the structure to fit the data extracted from our dataloader: we record the feature index from + each feature name and break the input features apart on the fly.
+ + The order of the input variables is as follows: + [features (from the dataset), time_features (from time feature transformers), targets] + Args: + network_structure (NetworkStructure): contains the information of the overall architecture information + dataset_properties (Dict): dataset properties + network_encoder(Dict[str, EncoderBlockInfo]): Network encoders + auto_regressive bool: if it belongs to an auto-regressive model + feature_names Tuple[str]: feature names, used to construct the selection network + known_future_features Tuple[str]: known future features + feature_shapes Dict[str, int]: shapes of each features + time_feature_names Tuple[str]: time feature names, used to complement feature_shapes + """ super().__init__() first_encoder_output_shape = network_encoder['block_1'].encoder_output_shape[-1] static_input_sizes = dataset_properties['static_features_shape'] @@ -212,15 +230,6 @@ def __init__(self, future_feature_name2tensor_idx = {} idx_tracker = 0 idx_tracker_future = 0 - if time_feature_names: - for name in time_feature_names: - feature_names2tensor_idx[name] = [idx_tracker, idx_tracker+1] - future_feature_name2tensor_idx[name] = [idx_tracker_future, idx_tracker_future + 1] - idx_tracker += 1 - idx_tracker_future += 1 - pre_scalar[name] = nn.Linear(1, self.hidden_size) - encoder_input_sizes[name] = self.hidden_size - decoder_input_sizes[name] = self.hidden_size if feature_names: for name in feature_names: @@ -237,6 +246,16 @@ def __init__(self, future_feature_name2tensor_idx[future_name] = [idx_tracker_future, idx_tracker_future + feature_shape] idx_tracker_future += feature_shape + if time_feature_names: + for name in time_feature_names: + feature_names2tensor_idx[name] = [idx_tracker, idx_tracker+1] + future_feature_name2tensor_idx[name] = [idx_tracker_future, idx_tracker_future + 1] + idx_tracker += 1 + idx_tracker_future += 1 + pre_scalar[name] = nn.Linear(1, self.hidden_size) + encoder_input_sizes[name] = self.hidden_size + decoder_input_sizes[name] = self.hidden_size + if not feature_names or not known_future_features: # Ensure that at least one feature is applied placeholder_features = 'placeholder_features' From 9652c80c75572d3987e59ad887be76a3f1d4f75e Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 26 Apr 2022 21:29:59 +0200 Subject: [PATCH 223/347] maint --- .../encoding/OneHotEncoder.py | 5 ++- .../TimeSeriesEarlyPreProcessing.py | 35 +++++++++++++++---- .../setup/early_preprocessor/utils.py | 2 ++ .../forecasting_backbone/cells.py | 5 +-- .../base_forecasting_encoder.py | 6 +++- .../base_network_embedding.py | 6 ++-- .../pipeline/time_series_forecasting.py | 6 ++-- 7 files changed, 50 insertions(+), 15 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py index 152550285..fd4e109cf 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py @@ -21,10 +21,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> TimeSeriesBaseEncoder: categorical_columns = X['dataset_properties']['categorical_columns'] n_features_cat = X['dataset_properties']['categories'] feature_names = X['dataset_properties']['feature_names'] + feature_shapes = X['dataset_properties']['feature_shapes'] + if len(n_features_cat) == 0: n_features_cat = 
self.preprocessor['categorical'].categories for cat_column in categorical_columns: - self.feature_shapes[feature_names[cat_column]] = len(n_features_cat[cat_column]) + feature_shapes[feature_names[cat_column]] = len(n_features_cat[cat_column]) + self.feature_shapes = feature_shapes return self @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py index 21fdc7f5d..9b27f950d 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py @@ -1,13 +1,9 @@ -from typing import Any, Dict, Optional, Union - -from ConfigSpace.configuration_space import ConfigurationSpace +from typing import Any, Dict, Optional, Union, Tuple, List import numpy as np import pandas as pd -from scipy.sparse import spmatrix - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.base_preprocessing import autoPyTorchTargetPreprocessingComponent from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing @@ -26,9 +22,26 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None self.add_fit_requirements([ FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), FitRequirement('X_train', (pd.DataFrame, ), user_defined=True, - dataset_property=False)]) + dataset_property=False), + FitRequirement('feature_names', (Tuple,), user_defined=True, dataset_property=True), + FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), + FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), + ]) def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + if dataset is small process, we transform the entire dataset here. 
+ Before transformation, the order of the dataset is: + [(unknown_columns), categorical_columns, numerical_columns] + While after transformation, the order of the dataset is: + [numerical_columns, categorical_columns, unknown_columns] + so we need to change feature_names and feature_shapes accordingly. + Args: + X (Dict): fit dictionary + + Returns: + X_transformed (Dict): transformed fit dictionary + """ transforms = get_preprocess_transforms(X) if X['dataset_properties']['is_small_preprocess']: @@ -39,6 +52,16 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X_train = X['backend'].load_datamanager().train_tensors[0] X['X_train'] = time_series_preprocess(dataset=X_train, transforms=transforms) + feature_names = X['dataset_properties']['feature_names'] + numerical_columns = X['dataset_properties']['numerical_columns'] + categorical_columns = X['dataset_properties']['categorical_columns'] + + # resort feature_names + new_feature_names = [feature_names[num_col] for num_col in numerical_columns] + new_feature_names += [feature_names[cat_col] for cat_col in categorical_columns] + if set(feature_names) != set(new_feature_names): + new_feature_names += list(set(feature_names) - set(new_feature_names)) + X['dataset_properties']['feature_names'] = tuple(new_feature_names) # We need to also save the preprocess transforms for inference X.update({'preprocess_transforms': transforms}) diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py index 1050707a7..0b2094ad6 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py @@ -60,7 +60,9 @@ def time_series_preprocess(dataset: pd.DataFrame, transforms: torchvision.transf # TODO consider Numpy implementation composite_transforms = torchvision.transforms.Compose(transforms) if indices is None: + index = dataset.index dataset = composite_transforms(dataset) + dataset = pd.DataFrame(dataset, index=index) else: sub_dataset = dataset.iloc[:, indices] sub_dataset = composite_transforms(sub_dataset) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 136115d7b..e6eef8b09 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -221,8 +221,9 @@ def __init__, static_input_sizes = dataset_properties['static_features_shape'] self.hidden_size = first_encoder_output_shape - assert set(feature_names) == set(feature_shapes.keys()), "feature_names and feature_shapes must have " \ - "the same variable names" + assert set(feature_names) == set(feature_shapes.keys()), f"feature_names and feature_shapes must have " \ + f"the same variable names but they are different " \ + f"at {set(feature_names) ^ set(feature_shapes.keys())}" pre_scalar = {'past_targets': nn.Linear(dataset_properties['output_shape'][-1], self.hidden_size)} encoder_input_sizes = {'past_targets': self.hidden_size} decoder_input_sizes = {} diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index fa2145242..a2b24b402 100644 ---
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -68,16 +68,20 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: if self.block_number == 1: if not X["dataset_properties"]["uni_variant"]: - if not X["dataset_properties"]["is_small_preprocess"]: + if X["dataset_properties"]["is_small_preprocess"]: + input_shape = X_train.shape[1:] + else: # get input shape by transforming first two elements of the training set transforms = torchvision.transforms.Compose(X['preprocess_transforms']) X_train = X_train[:1, np.newaxis, ...] X_train = transforms(X_train) input_shape = np.concatenate(X_train).shape[1:] + if X['transform_time_features']: n_time_feature_transform = len(X['dataset_properties']['time_feature_transform']) else: n_time_feature_transform = 0 + input_shape = (*input_shape[:-1], input_shape[-1] + n_time_feature_transform) if 'network_embedding' in X.keys(): input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 1002393b8..895eb2fa3 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -26,11 +26,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: num_numerical_features=num_numerical_columns, ) if "feature_shapes" in X['dataset_properties']: + feature_shapes = X['dataset_properties']['feature_shapes'] # forecasting tasks feature_names = X['dataset_properties']['feature_names'] for idx_cat, n_output_cat in enumerate(num_output_features[num_numerical_columns:]): - cat_feature_name = feature_names[idx_cat] - self.feature_shapes[cat_feature_name] = n_output_cat + cat_feature_name = feature_names[idx_cat + num_numerical_columns] + feature_shapes[cat_feature_name] = n_output_cat + self.feature_shapes = feature_shapes return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 6edf595b0..cb2e97d03 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -326,18 +326,18 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L default_dataset_properties.update(dataset_properties) if not default_dataset_properties.get("uni_variant", False): - steps.extend([("preprocessing", TimeSeriesEarlyPreprocessing(random_state=self.random_state)), - ("imputer", TimeSeriesFeatureImputer(random_state=self.random_state)), + steps.extend([("imputer", TimeSeriesFeatureImputer(random_state=self.random_state)), ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), ('encoding', TimeSeriesEncoderChoice(default_dataset_properties, random_state=self.random_state)), ("time_series_transformer", TimeSeriesTransformer(random_state=self.random_state)), + ("preprocessing", TimeSeriesEarlyPreprocessing(random_state=self.random_state)), ]) # TODO consider the correct way of doing imputer for time series forecasting tasks. 
steps.extend([ - ("target_preprocessing", TimeSeriesTargetEarlyPreprocessing(random_state=self.random_state)), ("target_imputer", TimeSeriesTargetImputer(random_state=self.random_state)), + ("target_preprocessing", TimeSeriesTargetEarlyPreprocessing(random_state=self.random_state)), ('loss', ForecastingLossChoices(default_dataset_properties, random_state=self.random_state)), ("target_scaler", TargetScalerChoice(default_dataset_properties, random_state=self.random_state)), From 1d8963647e32146e0478d6bfafe4fa39b779c146 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 27 Apr 2022 13:20:37 +0200 Subject: [PATCH 224/347] static features --- .../data/time_series_feature_validator.py | 25 +++++++++++------- .../data/time_series_forecasting_validator.py | 26 ++++++++++++++----- .../data/time_series_target_validator.py | 3 ++- autoPyTorch/datasets/time_series_dataset.py | 10 +++---- 4 files changed, 39 insertions(+), 25 deletions(-) diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index fe3edeffb..7a9d54576 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -12,11 +12,12 @@ class TimeSeriesFeatureValidator(TabularFeatureValidator): def __init__( - self, - logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None, + self, + logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None, ): super().__init__(logger) self.only_contain_series_idx = False + self.static_features = () def get_reordered_columns(self): return self.transformed_columns + list(set(self.column_order) - set(self.transformed_columns)) @@ -58,29 +59,33 @@ def fit(self, self.categorical_columns = [] return self - X_train_ = X_train.drop(series_idx, axis=1) + X_train = X_train.drop(series_idx, axis=1) - X_test_ = X_test.drop(series_idx, axis=1) if X_test is not None else None + X_test = X_test.drop(series_idx, axis=1) if X_test is not None else None - super().fit(X_train_, X_test_) + super().fit(X_train, X_test) else: raise NotImplementedError(f"series idx only works with pandas.DataFrame but the type of " f"X_train is {type(X_train)} ") else: super().fit(X_train, X_test) + if isinstance(X_train, np.ndarray): + X_train = pd.DataFrame(X_train, index=[0] * len(X_train)) + static_features: pd.Series = (X_train.groupby(X_train.index).nunique() <= 1).all() + self.static_features = (idx for idx in static_features.index if static_features[idx]) return self def transform( - self, - X: SupportedFeatTypes, - index: Optional[Union[pd.Index, np.ndarray]] = None, + self, + X: SupportedFeatTypes, + index: Optional[Union[pd.Index, np.ndarray]] = None, ) -> Union[pd.DataFrame]: X = super(TimeSeriesFeatureValidator, self).transform(X) if index is None: index = np.array([0.] 
* len(X)) if X.ndim == 1: X = np.expand_dims(X, -1) - X: pd.DataFrame = pd.DataFrame(X, columns=self.get_reordered_columns(), - index=index) + X: pd.DataFrame = pd.DataFrame(X, columns=self.get_reordered_columns()) + X.index = index return X diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index f1257e308..ae369f351 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -166,7 +166,7 @@ def transform( self, X: Optional[Union[List, pd.DataFrame]], y: Optional[Union[List, pd.DataFrame]] = None, - ) -> Tuple[Optional[pd.DataFrame], pd.DataFrame, np.ndarray]: + ) -> Tuple[Optional[pd.DataFrame], pd.DataFrame, List[int]]: if not self._is_fitted: raise NotFittedError("Cannot call transform on a validator that is not fitted") @@ -212,7 +212,6 @@ def transform( index=series_number) y_transformed: pd.DataFrame = self.target_validator.transform(y_stacked, index=series_number) - if self._is_uni_variant: return None, y_transformed, sequence_lengths @@ -221,21 +220,34 @@ def transform( raise NotImplementedError @staticmethod - def join_series(input: List[SupportedFeatTypes]) -> SupportedFeatTypes: + def join_series(input: List[SupportedFeatTypes], + return_seq_lengths: bool = False) -> Union[pd.DataFrame, + Tuple[pd.DataFrame, List[int]]]: """ join the series into one single value """ + num_sequences = len(input) + sequence_lengths = [0] * num_sequences + for seq_idx in range(num_sequences): + sequence_lengths[seq_idx] = len(input[seq_idx]) + series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) if not isinstance(input, List): raise ValueError(f'Input must be a list, but it is {type(input)}') if isinstance(input[0], pd.DataFrame): - return pd.concat(input) + joint_input = pd.concat(input) elif isinstance(input[0], sparse.spmatrix): if len(input[0].shape) > 1: - return sparse.vstack(input) + joint_input = sparse.vstack(input) else: - return sparse.hstack(input) + joint_input = sparse.hstack(input) elif isinstance(input[0], (List, np.ndarray)): - return np.concatenate(input) + joint_input = np.concatenate(input) else: raise NotImplementedError(f'Unsupported input type: List[{type(input[0])}]') + joint_input = pd.DataFrame(joint_input) + joint_input.index = series_number + if return_seq_lengths: + return joint_input, sequence_lengths + else: + return joint_input diff --git a/autoPyTorch/data/time_series_target_validator.py b/autoPyTorch/data/time_series_target_validator.py index 384ec7cac..34c879f0f 100644 --- a/autoPyTorch/data/time_series_target_validator.py +++ b/autoPyTorch/data/time_series_target_validator.py @@ -82,7 +82,8 @@ def transform(self, if y.ndim == 1: y = np.expand_dims(y, -1) - y: pd.DataFrame = pd.DataFrame(y, index=index) + y: pd.DataFrame = pd.DataFrame(y) + y.index = index return y diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 19108cfbf..593dbf980 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -351,7 +351,6 @@ def __init__(self, n_prediction_steps: int = 1, dataset_name: Optional[str] = None, normalize_y: bool = False, - static_features: Optional[np.ndarray] = None, ): """ :param target_variables: Optional[Union[Tuple[int], int]] used for multi-variant forecasting @@ -437,6 +436,8 @@ def __init__(self, self.start_times_train = self.validator.start_times_train self.start_times_test = 
self.validator.start_times_test + self.static_features = self.validator.feature_validator.static_features + self._transform_time_feature = False if not time_feature_transform: time_feature_transform = time_features_from_frequency_str(self.freq) @@ -577,7 +578,7 @@ def __init__(self, "n_prediction_steps": n_prediction_steps, "sp": self.seasonality, "known_future_features": known_future_features, - "static_features": static_features} + "static_features": self.static_features} sequence_datasets, train_tensors, test_tensors = self.make_sequences_datasets( X=X, Y=Y, @@ -591,7 +592,6 @@ def __init__(self, ConcatDataset.__init__(self, datasets=sequence_datasets) self.known_future_features = known_future_features - self.static_features = static_features self.seq_length_min = int(np.min(self.sequence_lengths_train)) self.seq_length_median = int(np.median(self.sequence_lengths_train)) @@ -611,10 +611,6 @@ def __init__(self, self.issparse: bool = issparse(self.train_tensors[0]) # TODO find a way to edit input shape! self.input_shape: Tuple[int, int] = (self.seq_length_min, self.num_features) - if static_features is None: - self.static_features_shape: int = 0 - else: - self.static_features_shape: int = static_features.size if known_future_features is None: self.future_feature_shapes: Tuple[int, int] = (self.seq_length_min, 0) From 282d63bc9b50c7d6bfbe66ba3a7c333a0395faef Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 27 Apr 2022 17:28:50 +0200 Subject: [PATCH 225/347] adjsut scaler to static features --- .../data/time_series_feature_validator.py | 4 +- autoPyTorch/datasets/time_series_dataset.py | 3 +- .../scaling/MaxAbsScaler.py | 33 ----- .../scaling/MinMaxScaler.py | 32 ----- .../scaling/NoScaler.py | 43 ------- .../scaling/StandardScaler.py | 32 ----- .../scaling/__init__.py | 114 ------------------ .../scaling/base_scaler.py | 56 ++++++++- .../scaling/utils.py | 96 ++++++++++----- .../setup/network/forecasting_network.py | 4 +- .../forecasting_backbone/cells.py | 1 - .../base_forecasting_decoder.py | 3 +- .../base_forecasting_encoder.py | 6 +- .../pipeline/time_series_forecasting.py | 4 +- 14 files changed, 125 insertions(+), 306 deletions(-) delete mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py delete mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py delete mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py delete mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index 7a9d54576..40f70097b 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -72,8 +72,8 @@ def fit(self, if isinstance(X_train, np.ndarray): X_train = pd.DataFrame(X_train, index=[0] * len(X_train)) static_features: pd.Series = (X_train.groupby(X_train.index).nunique() <= 1).all() - self.static_features = (idx for idx in static_features.index if static_features[idx]) - + self.static_features = tuple(idx for idx in static_features.index if static_features[idx]) + self.get_reordered_columns() return self def transform( diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 593dbf980..2f31ed508 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ 
b/autoPyTorch/datasets/time_series_dataset.py @@ -64,7 +64,7 @@ def __init__(self, start_time_test: Optional[pd.DatetimeIndex] = None, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, - static_features: Optional[np.ndarray] = None, + static_features: Tuple[Union[int, str]] = None, n_prediction_steps: int = 0, sp: int = 1, known_future_features: Optional[Tuple[str]] = None, @@ -214,7 +214,6 @@ def __getitem__(self, index: int, train: bool = True) \ return {"past_targets": past_target, "past_features": past_features, "future_features": future_features, - "static_features": self.static_features, "mase_coefficient": self.mase_coefficient, 'past_observed_targets': torch.from_numpy(self.observed_target[:index + 1]), 'decoder_lengths': 0 if future_targets is None else future_targets['future_targets'].shape[ diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py deleted file mode 100644 index 4818e20b4..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MaxAbsScaler.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import Any, Dict, Optional, Union - -import numpy as np - -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import TimeSeriesScaler - - -class MaxAbsScaler(BaseScaler): - """ - Scales numerical features into range [-1, 1] - """ - - def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): - """ - Args: - random_state (Optional[Union[np.random.RandomState, int]]): Determines random number generation - """ - super().__init__() - self.random_state = random_state - - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: - self.check_requirements(X, y) - - self.preprocessor['numerical'] = TimeSeriesScaler(mode="max_abs") - return self - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'MaxAbsScaler', - 'name': 'MaxAbsScaler' - } diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py deleted file mode 100644 index c23d8cf06..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/MinMaxScaler.py +++ /dev/null @@ -1,32 +0,0 @@ -from typing import Any, Dict, Optional, Union - -import numpy as np - -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import TimeSeriesScaler - - -class MinMaxScaler(BaseScaler): - """ - Scales numerical features into range [0, 1] - """ - def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): - """ - Args: - random_state (Optional[Union[np.random.RandomState, int]]): Determines random number generation - """ - super().__init__() - self.random_state = random_state - - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: - self.check_requirements(X, y) - - self.preprocessor["numerical"] = TimeSeriesScaler(mode="min_max") - 
return self - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'MinMaxScaler', - 'name': 'MinMaxScaler' - } diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py deleted file mode 100644 index 67a2c55fd..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/NoScaler.py +++ /dev/null @@ -1,43 +0,0 @@ -from typing import Any, Dict, Optional, Union - -import numpy as np - -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import TimeSeriesScaler - - -class NoScaler(BaseScaler): - """ - No scaling performed - """ - - def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): - """ - Args: - random_state (Optional[Union[np.random.RandomState, int]]): Determines random number generation - """ - super().__init__() - self.random_state = random_state - - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: - """ - The fit function calls the fit function of the underlying model - and returns the transformed array. - Args: - X (np.ndarray): input features - y (Optional[np.ndarray]): input labels - - Returns: - instance of self - """ - self.check_requirements(X, y) - - self.preprocessor["numerical"] = TimeSeriesScaler(mode="none") - return self - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'NoScaler', - 'name': 'NoScaler' - } diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py deleted file mode 100644 index b831e222a..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/StandardScaler.py +++ /dev/null @@ -1,32 +0,0 @@ -from typing import Any, Dict, Optional, Union - -import numpy as np - -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import TimeSeriesScaler - - -class StandardScaler(BaseScaler): - """ - Standardise numerical features by removing mean and scaling to unit variance - """ - def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): - """ - Args: - random_state (Optional[Union[np.random.RandomState, int]]): Determines random number generation - """ - super().__init__() - self.random_state = random_state - - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: - self.check_requirements(X, y) - - self.preprocessor['numerical'] = TimeSeriesScaler(mode="standard") - return self - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'StandardScaler', - 'name': 'Standard Scaler' - } diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/__init__.py index 5c5dce4cd..e69de29bb 100644 --- 
a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/__init__.py @@ -1,114 +0,0 @@ -import os -from collections import OrderedDict -from typing import Any, Dict, List, Optional - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler - -scaling_directory = os.path.split(__file__)[0] -_scalers = find_components(__package__, - scaling_directory, - BaseScaler) - -_addons = ThirdPartyComponents(BaseScaler) - - -def add_scaler(scaler: BaseScaler) -> None: - _addons.add_component(scaler) - - -class ScalerChoice(autoPyTorchChoice): - """ - Allows for dynamically choosing scaling component at runtime - """ - - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available scaler components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all BaseScalers components available - as choices for scaling - """ - components = OrderedDict() - components.update(_scalers) - components.update(_addons.components) - return components - - def get_hyperparameter_search_space(self, - dataset_properties: Optional[Dict[str, Any]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None) -> ConfigurationSpace: - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties = dict() - - dataset_properties = {**self.dataset_properties, **dataset_properties} - - available_scalers = self.get_available_components(dataset_properties=dataset_properties, - include=include, - exclude=exclude) - - if len(available_scalers) == 0: - raise ValueError("no scalers found, please add a scaler") - - if default is None: - defaults = ['StandardScaler', 'MinMaxScaler', 'MaxAbsScaler', 'NoScaler'] - for default_ in defaults: - if default_ in available_scalers: - default = default_ - break - - # add only no scaler to choice hyperparameters in case the dataset is only categorical - if len(dataset_properties['numerical_features']) == 0: - default = 'NoScaler' - if include is not None and default not in include: - raise ValueError("Provided {} in include, however, " - "the dataset is incompatible with it".format(include)) - preprocessor = CSH.CategoricalHyperparameter('__choice__', - ['NoScaler'], - default_value=default) - else: - preprocessor = CSH.CategoricalHyperparameter('__choice__', - list(available_scalers.keys()), - default_value=default) - cs.add_hyperparameter(preprocessor) - - # add only child hyperparameters of early_preprocessor choices - for name in preprocessor.choices: - updates = self._get_search_space_updates(prefix=name) - config_space = available_scalers[name].get_hyperparameter_search_space(dataset_properties, # type:ignore - **updates) - parent_hyperparameter = {'parent': preprocessor, 'value': name} - cs.add_configuration_space(name, config_space, - parent_hyperparameter=parent_hyperparameter) - - self.configuration_space = cs - self.dataset_properties = dataset_properties - return cs - - def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None: - """ - A mechanism in code to 
ensure the correctness of the fit dictionary -        It recursively makes sure that the children and parent level requirements -        are honored before fit. -        Args: -            dataset_properties (Dict[str, Any]): dictionary holding the dataset properties - -        """ -        super()._check_dataset_properties(dataset_properties) -        assert "numerical_features" in dataset_properties and \ -               "categorical_features" in dataset_properties, \ -            "Dataset properties must contain information about the type of features" diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py index 2567032b0..2b3bb7905 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py @@ -1,20 +1,42 @@ -from typing import Any, Dict, List +from typing import Any, Dict, Optional, List, Union +import numpy as np + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( autoPyTorchTimeSeriesPreprocessingComponent ) from autoPyTorch.utils.common import FitRequirement - +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import TimeSeriesScaler class BaseScaler(autoPyTorchTimeSeriesPreprocessingComponent): """ Provides abstract class interface for time series scalers in AutoPytorch """ -    def __init__(self) -> None: +    def __init__(self, +                 random_state: Optional[Union[np.random.RandomState, int]] = None, +                 scaling_mode: str = 'standard'): super().__init__() self.add_fit_requirements([ -            FitRequirement('numerical_features', (List,), user_defined=True, dataset_property=True)]) +            FitRequirement('numerical_features', (List,), user_defined=True, dataset_property=True), +            FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True) +        ]) +        self.random_state = random_state +        self.scaling_mode = scaling_mode + +    def fit(self, X: Dict[str, Any], y: Any = None) -> 'BaseScaler': +        self.check_requirements(X, y) +        dataset_is_small_preprocess = X["dataset_properties"]["is_small_preprocess"] +        static_features = X['dataset_properties'].get('static_features', ()) +        self.preprocessor['numerical'] = TimeSeriesScaler(mode=self.scaling_mode, +                                                          dataset_is_small_preprocess=dataset_is_small_preprocess, +                                                          static_features=static_features) +        return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ @@ -29,3 +51,29 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: raise ValueError(f"can not call transform on {self.__class__.__name__} without fitting first.") X.update({'scaler': self.preprocessor}) return X + +    @staticmethod +    def get_hyperparameter_search_space( +        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, +        scaling_mode: HyperparameterSearchSpace = HyperparameterSearchSpace( +            hyperparameter='scaling_mode', +            value_range=("standard", "min_max", "max_abs", "mean_abs", "none"), +            default_value="standard", +        ), +    ) -> ConfigurationSpace: +        """Get the hyperparameter search space for the time series scaler + +        Args: +            dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) +                Properties that describe the dataset +            scaling_mode (HyperparameterSearchSpace: default = ...) +                The strategy to use for scaling; the available modes are implemented by the TimeSeriesScaler in scaling/utils.py + +        Returns: +            ConfigurationSpace +                The space of possible configurations for the time series scaler with the given +                `dataset_properties` +        """ +        cs = ConfigurationSpace() +        add_hyperparameter(cs, scaling_mode, CategoricalHyperparameter) +        return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index f6479382f..62389bd0c 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -11,74 +11,106 @@ # Similar to / inspired by # https://github.com/tslearn-team/tslearn/blob/a3cf3bf/tslearn/preprocessing/preprocessing.py class TimeSeriesScaler(BaseEstimator): -    def __init__(self, mode: str): +    def __init__(self, mode: str, +                 dataset_is_small_preprocess: bool = False, +                 static_features: Tuple[Union[str, int]] = ()): self.mode = mode -        #self.loc = 0.  # type: Union[np.ndarray, float] -        #self.scale = 1.  # type: Union[np.ndarray, float] +        self.dataset_is_small_preprocess = dataset_is_small_preprocess +        self.static_features = static_features def fit(self, X: pd.DataFrame, y: Any = None) -> "TimeSeriesScaler": """ The transformer is transformed on the fly (for each batch) """ +        static_features = [static_fea for static_fea in self.static_features if static_fea in X.columns] +        self.static_features = static_features +        return self + +    def transform(self, X: pd.DataFrame) -> Tuple[np.ndarray, ...]: +        """ +        X = sklearn.utils.check_array( +            X, +            force_all_finite=True, +            ensure_2d=False, +            allow_nd=True, +            accept_sparse=False, +            accept_large_sparse=False +        )  # type: np.ndarray +        """ if self.mode == "standard": -            X_grouped = X.groupby(X.index) +            if self.dataset_is_small_preprocess: +                X_grouped = X.groupby(X.index) + +                self.loc = X_grouped.agg("mean") +                self.scale = X_grouped.agg("std") + +                # for static features, if we do normalization w.r.t. each group, then they will become the same values, +                # thus we treat them differently: normalize with the entire dataset +                self.scale[self.static_features] = self.loc[self.static_features].std() +                self.loc[self.static_features] = self.loc[self.static_features].mean() +            else: +                self.loc = X.mean() +                self.scale = X.std() -            self.loc = X_grouped.agg("mean") -            self.scale = X_grouped.agg("std")             # ensure that if all the values are the same in a group, we could still normalize them correctly self.scale.mask(self.scale == 0.0, self.loc) self.scale[self.scale == 0] = 1.
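The hunk above makes the "standard" mode compute per-series statistics, while static features (columns that are constant within each series) are normalised with statistics taken across series. A minimal sketch of that behaviour, using a made-up two-series DataFrame and a hypothetical static column named "store_size" (names and values are illustrative only, not part of the patch):

    import numpy as np
    import pandas as pd

    # two series (index = series id); "store_size" is constant within each series
    X = pd.DataFrame(
        {"value": [1.0, 2.0, 3.0, 10.0, 20.0, 30.0],
         "store_size": [5.0, 5.0, 5.0, 8.0, 8.0, 8.0]},
        index=[0, 0, 0, 1, 1, 1])
    static_features = ["store_size"]

    grouped = X.groupby(X.index)
    loc, scale = grouped.agg("mean"), grouped.agg("std")

    # a static column has zero std inside every series, so its statistics are
    # taken over the per-series values instead (mirrors the vectorised assignment above)
    for col in static_features:
        scale[col] = loc[col].std()
        loc[col] = loc[col].mean()
    scale = scale.mask(scale == 0.0, loc)
    scale[scale == 0.0] = 1.0

    # the patch relies on pandas index alignment for (X - loc) / scale;
    # an explicit reindex makes the per-series broadcast visible
    X_scaled = (X - loc.reindex(X.index).to_numpy()) / scale.reindex(X.index).to_numpy()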
+ return (X - self.loc) / self.scale + elif self.mode == "min_max": - X_grouped = X.groupby(X.index) + if self.dataset_is_small_preprocess: + X_grouped = X.groupby(X.index) + min_ = X_grouped.agg("min") + max_ = X_grouped.agg("max") - min_ = X_grouped.agg("min") - max_ = X_grouped.agg("max") + min_[self.static_features] = min_[self.static_features].min() + max_[self.static_features] = max_[self.static_features].max() + + else: + min_ = X.min() + max_ = X.max() diff_ = max_ - min_ self.loc = min_ self.scale = diff_ self.scale.mask(self.scale == 0.0, self.loc) self.scale[self.scale == 0.0] = 1.0 + return (X - self.loc) / self.scale elif self.mode == "max_abs": X_abs = X.transform("abs") - max_abs_ = X_abs.groupby(X_abs.index).transform("max") + if self.dataset_is_small_preprocess: + max_abs_ = X_abs.groupby(X_abs.index).transform("max") + max_abs_[self.static_features] = max_abs_[self.static_features].max() + else: + max_abs_ = X_abs.max() + max_abs_[max_abs_ == 0.0] = 1.0 self.loc = None self.scale = max_abs_ + return X / self.scale + elif self.mode == 'mean_abs': X_abs = X.transform("abs") - X_abs = X_abs.groupby(X_abs.index) - mean_abs_ = X_abs.agg("mean") + if self.dataset_is_small_preprocess: + X_abs = X_abs.groupby(X_abs.index) + mean_abs_ = X_abs.agg("mean") + mean_abs_[self.static_features] = mean_abs_[self.static_features].mean() + else: + mean_abs_ = X_abs.mean() self.loc = None self.scale = mean_abs_.mask(mean_abs_ == 0.0, X_abs.agg("max")) + return X / self.scale + elif self.mode == "none": self.loc = None self.scale = None - else: - raise ValueError(f"Unknown mode {self.mode} for time series scaler") - return self - def transform(self, X: np.ndarray) -> Tuple[np.ndarray, ...]: - """ - X = sklearn.utils.check_array( - X, - force_all_finite=True, - ensure_2d=False, - allow_nd=True, - accept_sparse=False, - accept_large_sparse=False - ) # type: np.ndarray - """ - - if self.mode in {"standard", "min_max"}: - return (X - self.loc) / self.scale - elif self.mode in {"max_abs", "mean_abs"}: - return X / self.scale - else: return X + else: + raise ValueError(f"Unknown mode {self.mode} for time series scaler") diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 946a17f76..9ad9f9c87 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -142,7 +142,6 @@ def predict(self, loader: torch.utils.data.DataLoader, past_targets = X_batch['past_targets'] past_features = X_batch['past_features'] future_features = X_batch["future_features"] - static_features = X_batch["static_features"] past_observed_targets = X_batch['past_observed_targets'] if past_targets.ndim == 2: @@ -150,8 +149,7 @@ def predict(self, loader: torch.utils.data.DataLoader, pred_kwargs = {"past_targets": past_targets, "past_features": past_features, - "future_features": future_features, - "static_features": static_features} + "future_features": future_features} for key in pred_kwargs.keys(): if pred_kwargs[key] is not None: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index e6eef8b09..bcfd4be5e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ 
-218,7 +218,6 @@ def __init__(self, """ super().__init__() first_encoder_output_shape = network_encoder['block_1'].encoder_output_shape[-1] - static_input_sizes = dataset_properties['static_features_shape'] self.hidden_size = first_encoder_output_shape assert set(feature_names) == set(feature_shapes.keys()), f"feature_names and feature_shapes must have " \ diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index fe2e5b917..859c7deba 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -67,7 +67,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: """ self.check_requirements(X, y) output_shape = X['dataset_properties']['output_shape'] - static_features_shape = X["dataset_properties"]["static_features_shape"] encoder_output_shape = X['network_encoder'][f'block_{self.block_number}'].encoder_output_shape @@ -92,7 +91,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: if self.block_number == network_structure.num_blocks: self.is_last_decoder = True - future_in_features = future_feature_shapes[-1] + static_features_shape + future_in_features = future_feature_shapes[-1] if variable_selection: future_in_features = X['network_encoder']['block_1'].encoder_output_shape[-1] else: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index a2b24b402..fdbb88733 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -50,7 +50,6 @@ def _required_fit_arguments(self) -> List[FitRequirement]: FitRequirement('uni_variant', (bool,), user_defined=False, dataset_property=True), FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), - FitRequirement('static_features_shape', (int,), user_defined=True, dataset_property=True), FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False), FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), FitRequirement('time_feature_transform', (Iterable,), user_defined=False, dataset_property=True) @@ -64,7 +63,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: input_shape = X["dataset_properties"]['input_shape'] output_shape = X["dataset_properties"]['output_shape'] - static_features_shape = X["dataset_properties"]["static_features_shape"] if self.block_number == 1: if not X["dataset_properties"]["uni_variant"]: @@ -91,9 +89,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: in_features = self.n_encoder_output_feature() elif self.encoder_properties().lagged_input and hasattr(self, 'lagged_value'): in_features = len(self.lagged_value) * output_shape[-1] + \ - 
input_shape[-1] + static_features_shape + n_time_feature_transform + input_shape[-1] + n_time_feature_transform else: - in_features = output_shape[-1] + input_shape[-1] + static_features_shape + n_time_feature_transform + in_features = output_shape[-1] + input_shape[-1] + n_time_feature_transform input_shape = (X['window_size'], in_features) else: diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index cb2e97d03..d96208bb2 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -28,7 +28,7 @@ TimeSeriesFeatureImputer, TimeSeriesTargetImputer, ) -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling import ScalerChoice +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler from autoPyTorch.pipeline.components.setup.early_preprocessor.TimeSeriesEarlyPreProcessing import ( TimeSeriesEarlyPreprocessing, TimeSeriesTargetEarlyPreprocessing @@ -327,7 +327,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L if not default_dataset_properties.get("uni_variant", False): steps.extend([("imputer", TimeSeriesFeatureImputer(random_state=self.random_state)), - ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), + ("scaler", BaseScaler(random_state=self.random_state)), ('encoding', TimeSeriesEncoderChoice(default_dataset_properties, random_state=self.random_state)), ("time_series_transformer", TimeSeriesTransformer(random_state=self.random_state)), From fb8b805b6de3b38f2d16186bcd5c7cb595837442 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 27 Apr 2022 18:07:58 +0200 Subject: [PATCH 226/347] remove static features from forward dict --- .../setup/network/forecasting_architecture.py | 26 ------------------- .../forecasting_base_trainer.py | 9 ------- 2 files changed, 35 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 919b26e84..331f84013 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -312,7 +312,6 @@ def forward(self, future_targets: Optional[torch.Tensor] = None, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.Tensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): @@ -327,7 +326,6 @@ def predict(self, past_targets: torch.Tensor, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None ): raise NotImplementedError @@ -352,7 +350,6 @@ def pre_processing(self, past_observed_targets: torch.BoolTensor, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None, length_past: int = 0, length_future: int = 0, variable_selector_kwargs: Dict = {}, @@ -418,7 +415,6 @@ def pre_processing(self, x_past, x_future, x_static, static_context_initial_hidden = self.variable_selector( x_past=x_past, x_future=x_future, - x_static=static_features, batch_size=batch_size, length_past=length_past, length_future=length_future, @@ -434,8 +430,6 @@ def pre_processing(self, x_past = 
x_past.to(device=self.device) if future_features is not None: future_features = future_features.to(self.device) - if static_features is not None: - static_features = static_features.to(self.device) x_past = self.embedding(x_past) # TODO embedding for future features! return x_past, future_features, static_features, loc, scale, None, past_targets @@ -444,7 +438,6 @@ def forward(self, future_targets: Optional[torch.Tensor] = None, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.BoolTensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): @@ -453,7 +446,6 @@ def forward(self, past_observed_targets=past_observed_targets, past_features=past_features, future_features=future_features, - static_features=static_features, length_past=min(self.window_size, past_targets.shape[1]), length_future=self.n_prediction_steps ) @@ -508,13 +500,11 @@ def predict(self, past_targets: torch.Tensor, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.BoolTensor] = None, ): net_output = self(past_targets=past_targets, past_features=past_features, future_features=future_features, - static_features=static_features, past_observed_targets=past_observed_targets) return self.pred_from_net_output(net_output) @@ -567,7 +557,6 @@ def forward(self, future_targets: Optional[torch.Tensor] = None, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.BoolTensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): x_past, x_future, x_static, loc, scale, static_context_initial_hidden, past_targets = self.pre_processing( @@ -575,7 +564,6 @@ def forward(self, past_observed_targets=past_observed_targets, past_features=past_features, future_features=future_features, - static_features=static_features, length_past=min(self.window_size, past_targets.shape[1]), length_future=0, variable_selector_kwargs={'cache_static_contex': True} @@ -709,10 +697,6 @@ def forward(self, repeated_predicted_target = repeated_past_target[:, [-1]] repeated_past_target = repeated_past_target[:, :-1, ] - repeated_static_feat = static_features.repeat_interleave( - repeats=self.num_samples, dim=0 - ).unsqueeze(dim=1) if static_features is not None else None - repeated_time_feat = future_features.repeat_interleave( repeats=self.num_samples, dim=0 ) if future_features is not None else None @@ -777,13 +761,11 @@ def predict(self, past_targets: torch.Tensor, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.BoolTensor] = None, ): net_output = self(past_targets=past_targets, past_features=past_features, future_features=future_features, - static_features=static_features, past_observed_targets=past_observed_targets) if self.output_type == 'regression': return self.pred_from_net_output(net_output) @@ -845,7 +827,6 @@ def forward(self, future_targets: Optional[torch.Tensor] = None, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.Tensor] = None, decoder_observed_values: Optional[torch.Tensor] = 
None, ): if self.training: @@ -1008,10 +989,6 @@ def forward(self, repeats=self.num_samples, dim=0).squeeze(1) - repeated_static_feat = static_features.repeat_interleave( - repeats=self.num_samples, dim=0 - ).unsqueeze(dim=1) if static_features is not None else None - if future_features is not None: time_feature = future_features[:, 1:] else: @@ -1069,13 +1046,11 @@ def predict(self, past_targets: torch.Tensor, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.BoolTensor] = None, ): net_output = self(past_targets=past_targets, past_features=past_features, future_features=future_features, - static_features=static_features, past_observed_targets=past_observed_targets) return net_output @@ -1088,7 +1063,6 @@ def forward(self, future_targets: Optional[torch.Tensor] = None, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, - static_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.Tensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): if self.window_size < past_targets.shape[1]: diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 6062e39a9..137eee2fc 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -154,9 +154,6 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor future_features = data['future_features'] if future_features is not None: future_features = future_features.float() - static_features = data['static_features'] - if static_features is not None: - static_features = static_features.float() future_observed_targets = future_targets["future_observed_targets"] future_targets_values = future_targets["future_targets"] @@ -205,7 +202,6 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor outputs = self.model(past_targets=past_target, past_features=past_features, future_features=future_features, - static_features=static_features, future_targets=future_targets, past_observed_targets=past_observed_targets) @@ -257,9 +253,6 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, future_features = data['future_features'] if future_features is not None: future_features = future_features.float() - static_features = data['static_features'] - if static_features is not None: - static_features = static_features.float() mase_coefficients.append(data['mase_coefficient']) if isinstance(self.criterion, MASELoss): @@ -279,13 +272,11 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, past_features=past_features, future_targets=future_targets_values, future_features=future_features, - static_features=static_features, past_observed_targets=past_observed_targets) else: outputs = self.model(past_targets=past_target, past_features=past_features, future_features=future_features, - static_features=static_features, past_observed_targets=past_observed_targets) # prepare From 533f12d70e2c92b76b1503b12e99347127278301 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 27 Apr 2022 20:26:53 +0200 Subject: [PATCH 227/347] test transform --- 
.../data/time_series_forecasting_validator.py | 28 ++++++++--------- autoPyTorch/datasets/time_series_dataset.py | 16 +++++----- ...time_series_forecasting_train_evaluator.py | 1 - .../setup/network/forecasting_architecture.py | 31 +++++++++++++------ .../setup/network/forecasting_network.py | 4 ++- .../forecasting_backbone/cells.py | 25 ++++++++++----- .../time_series_forecasting_data_loader.py | 29 ++++++++++++++--- .../forecasting_base_trainer.py | 8 +++-- .../pipeline/time_series_forecasting.py | 5 ++- 9 files changed, 98 insertions(+), 49 deletions(-) diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index ae369f351..ecba891fa 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -220,30 +220,30 @@ def transform( raise NotImplementedError @staticmethod - def join_series(input: List[SupportedFeatTypes], + def join_series(X: List[SupportedFeatTypes], return_seq_lengths: bool = False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, List[int]]]: """ join the series into one single value """ - num_sequences = len(input) + num_sequences = len(X) sequence_lengths = [0] * num_sequences for seq_idx in range(num_sequences): - sequence_lengths[seq_idx] = len(input[seq_idx]) + sequence_lengths[seq_idx] = len(X[seq_idx]) series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) - if not isinstance(input, List): - raise ValueError(f'Input must be a list, but it is {type(input)}') - if isinstance(input[0], pd.DataFrame): - joint_input = pd.concat(input) - elif isinstance(input[0], sparse.spmatrix): - if len(input[0].shape) > 1: - joint_input = sparse.vstack(input) + if not isinstance(X, List): + raise ValueError(f'Input must be a list, but it is {type(X)}') + if isinstance(X[0], pd.DataFrame): + joint_input = pd.concat(X) + elif isinstance(X[0], sparse.spmatrix): + if len(X[0].shape) > 1: + joint_input = sparse.vstack(X) else: - joint_input = sparse.hstack(input) - elif isinstance(input[0], (List, np.ndarray)): - joint_input = np.concatenate(input) + joint_input = sparse.hstack(X) + elif isinstance(X[0], (List, np.ndarray)): + joint_input = np.concatenate(X) else: - raise NotImplementedError(f'Unsupported input type: List[{type(input[0])}]') + raise NotImplementedError(f'Unsupported input type: List[{type(X[0])}]') joint_input = pd.DataFrame(joint_input) joint_input.index = series_number diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 2f31ed508..3708313d0 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -54,13 +54,13 @@ class TimeSeriesSequence(Dataset): def __init__(self, - X: Optional[Union[np.ndarray, pd.DataFrame]], - Y: Union[np.ndarray, pd.Series], + X: Optional[np.ndarray], + Y: Union[np.ndarray], start_time_train: Optional[pd.DatetimeIndex] = None, freq: str = '1Y', time_feature_transform: List[TimeFeature] = [], - X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, - Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, + X_test: Optional[np.ndarray] = None, + Y_test: Optional[np.ndarray] = None, start_time_test: Optional[pd.DatetimeIndex] = None, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, @@ -164,13 +164,13 @@ def __getitem__(self, index: int, train: bool = True) \ if self.time_feature_transform: 
self.compute_time_features() - if past_features: - past_features = np.hstack(past_features, [self._cached_time_features[:index + 1]]) + if past_features is not None: + past_features = np.hstack([past_features, self._cached_time_features[:index + 1]]) else: past_features = self._cached_time_features[:index + 1] - if future_features: + if future_features is not None: future_features = np.hstack([ - past_features, + future_features, self._cached_time_features[index + 1:index + self.n_prediction_steps + 1] ]) else: diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 444a7b3e8..7ee18b830 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -106,7 +106,6 @@ def fit_predict_and_loss(self) -> None: self.Y_optimization = self.datamanager.get_test_target(test_split) - # self.Y_actual_train = self.y_train[train_split] y_train_pred, y_opt_pred, y_valid_pred, y_test_pred = self._fit_and_predict(pipeline, split_id, train_indices=train_split, diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 331f84013..e63bb011a 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -163,6 +163,7 @@ def __init__(self, feature_names: Optional[Tuple[str]] = (), known_future_features: Optional[Tuple[str]] = (), feature_shapes: Optional[Dict[str, int]] = (), + static_features: Tuple[Union[str, int]] = (), time_feature_names: Optional[Tuple[str]] = (), output_type: str = 'regression', forecast_strategy: Optional[str] = 'mean', @@ -182,12 +183,10 @@ def __init__(self, network_head (nn.Module): network head, maps the output of decoder to the final output dataset_properties (Dict): dataset properties auto_regressive (bool): if the overall model is auto-regressive model - encoder_properties (Dict): encoder properties - decoder_properties: (Dict): decoder properties output_type (str): the form that the network outputs. It could be regression, distribution and - (TODO) quantile + quantile forecast_strategy (str): only valid if output_type is distribution or quantile, how the network transforms - its output to predicted values, could be mean or sample + its output to predicted values, could be mean or sample num_samples (int): only valid if output_type is not regression and forecast_strategy is sample. this indicates the number of the points to sample when doing prediction aggregation (str): how the samples are aggregated. We could take their mean or median values. 
@@ -205,6 +204,7 @@ def __init__(self, feature_names=feature_names, known_future_features=known_future_features, feature_shapes=feature_shapes, + static_features=static_features, time_feature_names=time_feature_names, ) self.lazy_modules.append(self.variable_selector) @@ -376,6 +376,7 @@ def pre_processing(self, if self.network_structure.variable_selection: batch_size = x_past.shape[0] + x_static = {} if length_past > 0: if past_features is not None: past_features = past_features[:, -self.window_size:].to(self.device) @@ -384,7 +385,12 @@ def pre_processing(self, if past_features is not None: for feature_name in self.variable_selector.feature_names: tensor_idx = self.variable_selector.feature_names2tensor_idx[feature_name] - x_past[feature_name] = past_features[:, :, tensor_idx[0]: tensor_idx[1]] + if feature_name not in self.variable_selector.static_features: + x_past[feature_name] = past_features[:, :, tensor_idx[0]: tensor_idx[1]] + else: + static_feature = past_features[:, [0], tensor_idx[0]: tensor_idx[1]] + static_feature = static_feature.repeat(1, length_past + length_future, 1) + x_static[feature_name] = static_feature if hasattr(self.variable_selector, 'placeholder_features'): for placehold in self.variable_selector.placeholder_features: @@ -408,13 +414,21 @@ def pre_processing(self, if future_features is not None: for feature_name in self.variable_selector.known_future_features: tensor_idx = self.variable_selector.future_feature_name2tensor_idx[feature_name] - x_future[feature_name] = future_features[:, :, tensor_idx[0]: tensor_idx[1]] + if feature_name not in self.variable_selector.static_features: + x_future[feature_name] = future_features[:, :, tensor_idx[0]: tensor_idx[1]] + else: + if length_past == 0: + static_feature = future_features[:, [0], tensor_idx[0]: tensor_idx[1]] + static_feature = static_feature.repeat(1, length_past + length_future, 1) + x_static[feature_name] = static_feature + else: x_future = None x_past, x_future, x_static, static_context_initial_hidden = self.variable_selector( x_past=x_past, x_future=x_future, + x_static=x_static, batch_size=batch_size, length_past=length_past, length_future=length_future, @@ -431,7 +445,7 @@ def pre_processing(self, if future_features is not None: future_features = future_features.to(self.device) x_past = self.embedding(x_past) # TODO embedding for future features! 
- return x_past, future_features, static_features, loc, scale, None, past_targets + return x_past, future_features, None, loc, scale, None, past_targets def forward(self, past_targets: torch.Tensor, @@ -726,7 +740,7 @@ def forward(self, decoder_output = self.decoder(x_future, encoder_output=encoder2decoder, - pos_idx=(x_past.shape[1]+idx_pred, x_past.shape[1] + idx_pred+1), + pos_idx=(x_past.shape[1] + idx_pred, x_past.shape[1] + idx_pred + 1), cache_intermediate_state=True, incremental_update=idx_pred > 0) if self.has_temporal_fusion: @@ -841,7 +855,6 @@ def forward(self, self.scale_value(past_targets[:, :-self.window_size], loc, scale), past_targets[:, :-self.window_size]) - future_targets = self.scale_value(future_targets, loc, scale) targets_all = torch.cat([past_targets, future_targets[:, :-1]], dim=1) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 9ad9f9c87..8fac068e3 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Iterable +from typing import Any, Dict, Optional, Iterable, Tuple from ConfigSpace.configuration_space import ConfigurationSpace @@ -54,6 +54,7 @@ def _required_fit_requirements(self): FitRequirement("feature_names", (Iterable,), user_defined=False, dataset_property=True), FitRequirement("feature_shapes", (Iterable,), user_defined=False, dataset_property=True), FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), + FitRequirement('static_features', (Tuple,), user_defined=True, dataset_property=False), FitRequirement('time_feature_names', (Iterable,), user_defined=False, dataset_property=True) ] @@ -92,6 +93,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: feature_shapes=feature_shapes, known_future_features=known_future_features, time_feature_names=time_feature_names, + static_features=X['dataset_properties']['static_features'] ) if net_output_type == 'distribution': dist_forecasting_strategy = X['dist_forecasting_strategy'] # type: DisForecastingStrategy diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index bcfd4be5e..3f337e905 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -1,6 +1,4 @@ -from pytorch_forecasting.utils import create_mask - -from typing import Any, Dict, Optional, List, Tuple, Set +from typing import Any, Dict, Optional, List, Tuple, Set, Union import torch from torch import nn @@ -196,6 +194,7 @@ def __init__(self, feature_names: Tuple[str] = (), known_future_features: Tuple[str] = tuple(), feature_shapes: Dict[str, int] = {}, + static_features: Tuple[Union[str, int]] = (), time_feature_names: Tuple[str] = (), ): """ @@ -226,11 +225,17 @@ def __init__(self, pre_scalar = {'past_targets': nn.Linear(dataset_properties['output_shape'][-1], self.hidden_size)} encoder_input_sizes = {'past_targets': self.hidden_size} decoder_input_sizes = {} - feature_names2tensor_idx = {} future_feature_name2tensor_idx = {} + feature_names2tensor_idx = {} idx_tracker = 0 idx_tracker_future = 0 + static_features = set(static_features) + 
static_features_input_size = {} + + # static_features should always be known beforehand + known_future_features = tuple(set(known_future_features) | static_features) + if feature_names: for name in feature_names: feature_shape = feature_shapes[name] @@ -240,6 +245,8 @@ def __init__(self, encoder_input_sizes[name] = self.hidden_size if name in known_future_features: decoder_input_sizes[name] = self.hidden_size + if name in static_features: + static_features_input_size[name] = self.hidden_size for future_name in known_future_features: feature_shape = feature_shapes[future_name] @@ -294,12 +301,13 @@ def __init__(self, if not dataset_properties['uni_variant']: # TODO self.static_variable_selection = VariableSelectionNetwork( - input_sizes=static_input_sizes, + input_sizes=static_features_input_size, hidden_size=self.hidden_size, input_embedding_flags={}, dropout=network_structure.grn_dropout_rate, ) - self.static_input_sizes = static_input_sizes + self.static_input_sizes = static_features_input_size + self.static_features = static_features self.auto_regressive = auto_regressive @@ -379,7 +387,7 @@ def device(self, device: torch.device): def forward(self, x_past: Optional[Dict[str, torch.Tensor]], x_future: Optional[Dict[str, torch.Tensor]], - x_static: Optional[Dict[str, torch.Tensor]] = None, + x_static: Optional[Dict[str, torch.Tensor]], length_past: int = 0, length_future: int = 0, batch_size: int = 0, @@ -391,8 +399,9 @@ def forward(self, if length_past == 0 and length_future == 0: raise ValueError("Either length_past or length_future must be given!") timesteps = length_past + length_future + if not use_cached_static_contex: - if self.static_input_sizes > 0: + if len(self.static_input_sizes) > 0: static_embedding, _ = self.static_variable_selection(x_static) else: model_dtype = next(iter(x_past.values())).dtype if length_past > 0 else next( diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 8ba9e0b28..34ebb3263 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -7,6 +7,7 @@ from ConfigSpace.conditions import EqualsCondition import numpy as np +import pandas as pd import torch from torch.utils.data.sampler import SubsetRandomSampler @@ -83,6 +84,7 @@ def __init__(self, self.transform_time_features = transform_time_features self.freq = "1Y" self.time_feature_transform = [] + self.dataset_columns = [] def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: """ @@ -98,7 +100,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.check_requirements(X, y) # Incorporate the transform to the dataset - datamanager = X['backend'].load_datamanager() # type: TimeSeriesForcecastingDataset + datamanager: TimeSeriesForecastingDataset = X['backend'].load_datamanager() self.n_prediction_steps = datamanager.n_prediction_steps if self.backcast: @@ -119,6 +121,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: max_lagged_value = max(X['dataset_properties'].get('lagged_value', [np.inf])) max_lagged_value += self.window_size + self.n_prediction_steps + self.dataset_columns = datamanager.feature_names self.padding_collector = PadSequenceCollector(self.window_size, sample_interval, padding_value, max_lagged_value) @@ 
-270,7 +273,7 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform # candidate_transformations.extend(X['preprocess_transforms']) candidate_transformations.append(ExpandTransformTimeSeries()) - if "test" in mode or not X['dataset_properties']['is_small_preprocess']: + if mode == 'test' or not X['dataset_properties']['is_small_preprocess']: if "preprocess_transforms" in X: candidate_transformations.extend(X['preprocess_transforms']) @@ -288,7 +291,20 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd if isinstance(X, TimeSeriesSequence): X = [X] if isinstance(X, List): - for x_seq in X: + if self.dataset_small_preprocess and not self._is_uni_variant: + num_sequences = len(X) + sequence_lengths = [0] * num_sequences + for seq_idx, x_seq in enumerate(X): + sequence_lengths[seq_idx] = len(x_seq.X) + x_all = pd.DataFrame(np.concatenate([x_seq.X for x_seq in X]), columns=self.dataset_columns) + series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) + x_all.index = series_number + + x_all = pd.DataFrame(self.test_transform(x_all)) + x_all.index = series_number + x_all = x_all.groupby(x_all.index) + + for i, x_seq in enumerate(X): if not isinstance(x_seq, TimeSeriesSequence): raise NotImplementedError('Test Set must be a TimeSeriesSequence or a' ' list of time series objects!') @@ -296,7 +312,9 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd # WE need to recompute the cached time features (However, this should not happen) x_seq._cached_time_features = None - x_seq.update_transform(self.test_transform, train=False) + if self.dataset_small_preprocess and not self._is_uni_variant: + x_seq.X = x_all.get_group(i).transform(np.array).values + x_seq.update_attribute(freq=self.freq, transform_time_features=self.transform_time_features, time_feature_transform=self.time_feature_transform, @@ -307,7 +325,8 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd x_seq.compute_time_features() x_seq.freq = self.freq - x_seq.update_transform(self.test_transform, train=False) + if not self.dataset_small_preprocess: + x_seq.update_transform(self.test_transform, train=False) else: raise NotImplementedError('Unsupported data type for time series data loader!') diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 137eee2fc..40233ce64 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -87,6 +87,8 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, float: training loss Dict[str, float]: scores for each desired metric """ + import time + time_start = time.time() loss_sum = 0.0 N = 0 self.model.train() @@ -117,6 +119,8 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, self._scheduler_step(step_interval=StepIntervalUnit.epoch, loss=loss_sum / N) + print(f'Time Used for training: {time.time() - time_start}') + if self.metrics_during_training: return loss_sum / N, self.compute_metrics(outputs_data, targets_data) else: @@ -132,7 +136,7 @@ def cast_targets(self, targets: torch.Tensor) -> torch.Tensor: targets = targets.long() return targets - def train_step(self, data: Dict[str, torch.Tensor], 
future_targets: torch.Tensor) \ + def train_step(self, data: Dict[str, torch.Tensor], future_targets: Dict[str, torch.Tensor]) \ -> Tuple[float, torch.Tensor]: """ Allows to train 1 step of gradient descent, given a batch of train/labels @@ -202,7 +206,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: torch.Tensor outputs = self.model(past_targets=past_target, past_features=past_features, future_features=future_features, - future_targets=future_targets, + future_targets=future_targets_values, past_observed_targets=past_observed_targets) loss_func = self.criterion_preparation(**criterion_kwargs) diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index d96208bb2..a44ab548d 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -400,7 +400,9 @@ def _get_estimator_hyperparameter_name(self) -> str: """ return "time_series_forecasting" - def predict(self, X: Union[Dict[str, np.ndarray], pd.DataFrame], batch_size: Optional[int] = None) -> np.ndarray: + def predict(self, + X: Union[Dict[str, np.ndarray], pd.DataFrame], + batch_size: Optional[int] = None) -> np.ndarray: """Predict the output using the selected model. Args: @@ -408,6 +410,7 @@ def predict(self, X: Union[Dict[str, np.ndarray], pd.DataFrame], batch_size: Opt batch_size (Optional[int]): batch_size controls whether the pipeline will be called on small chunks of the data. Useful when calling the predict method on the whole array X results in a MemoryError. + transform_X (bool): if we want to transform Returns: np.ndarray: the predicted values given input X From f8be97c86d4dfa8d6708e1805d203652d8f93a47 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 28 Apr 2022 18:46:25 +0200 Subject: [PATCH 228/347] maint --- autoPyTorch/api/time_series_forecasting.py | 19 +++-- autoPyTorch/datasets/time_series_dataset.py | 65 ++++++++++++----- .../setup/network/forecasting_architecture.py | 19 ++--- .../forecasting_backbone/cells.py | 14 ++-- .../LearnedEntityEmbedding.py | 71 +++++++++++++++++-- .../setup/network_embedding/NoEmbedding.py | 3 + .../time_series_forecasting_data_loader.py | 57 +++++++++------ 7 files changed, 180 insertions(+), 68 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 99a886beb..e70083f47 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -20,7 +20,7 @@ get_dataset_compression_mapping, ) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -220,7 +220,7 @@ def _get_dataset_input_validator( # Also, an encoder is fit to both train and test data, # to prevent unseen categories during inference input_validator.fit(X_train=X_train, y_train=y_train, start_times_train=start_times_train, - X_test=X_test, y_test=y_test, start_times_test=start_times_test) + X_test=X_test, y_test=y_test, start_times_test=start_times_test) dataset = TimeSeriesForecastingDataset( X=X_train, Y=y_train, @@ -452,22 +452,19 @@ def 
search( def predict( self, - X_test: Optional[Union[Union[List[np.ndarray]], pd.DataFrame, Dict]] = None, + X_test: Optional[List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]], pd.DataFrame] = None, batch_size: Optional[int] = None, n_jobs: int = 1, - past_targets: Optional[List[np.ndarray]] = None, + targets_tests: Optional[List[np.ndarray]] = None, + start_times: List[pd.DatetimeIndex] = [] ) -> np.ndarray: """ target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, (used for multi-variable prediction), indicates which value needs to be predicted """ - if not self.dataset.is_uni_variant: - if past_targets is None: - if not isinstance(X_test, Dict) or "past_targets" not in X_test: - raise ValueError("Past Targets must be given") - else: - X_test = {"features": X_test, - "past_targets": past_targets} + if not isinstance(X_test[0], TimeSeriesSequence): + # Validate and construct TimeSeriesSequence TODO + pass flattened_res = super(TimeSeriesForecastingTask, self).predict(X_test, batch_size, n_jobs) if self.dataset.num_target == 1: return flattened_res.reshape([-1, self.dataset.n_prediction_steps]) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 3708313d0..9097a3b99 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -52,6 +52,28 @@ TIME_SERIES_CLASSIFICATION_INPUT = Tuple[np.ndarray, np.ndarray] +def extract_feature_index(feature_shapes: Dict[str, int], + feature_names: Tuple[str], + queried_features: Tuple[str]) -> Tuple[int]: + """ + extract the index of a set of queried_features from the extracted feature_shapes + Args: + feature_shapes (dict): feature_shapes recoding the shape of each features + feature_names (List[str]): names of the features + queried_features (Tuple[str]): names of the features that we expect their index + + Returns: + feature_index (Tuple[int]): + """ + df_range = pd.DataFrame(feature_shapes, columns=feature_names, index=[0]) + df_range_end = df_range.cumsum(axis=1) + df_range = pd.concat([df_range_end - df_range, df_range_end]) + value_ranges = df_range[list(queried_features)].T.values + feature_index: List[int] = sum([list(range(*value_r)) for value_r in value_ranges], []) + feature_index.sort() + return tuple(feature_index) + + class TimeSeriesSequence(Dataset): def __init__(self, X: Optional[np.ndarray], @@ -64,10 +86,9 @@ def __init__(self, start_time_test: Optional[pd.DatetimeIndex] = None, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, - static_features: Tuple[Union[int, str]] = None, n_prediction_steps: int = 0, sp: int = 1, - known_future_features: Optional[Tuple[str]] = None, + known_future_features_index: Optional[List[int]] = None, only_has_past_targets: bool = False, compute_mase_coefficient_value: bool = True, time_features=None, @@ -100,7 +121,6 @@ def __init__(self, self.start_time_test = start_time_test self.time_feature_transform = time_feature_transform - self.static_features = static_features self.freq = freq @@ -120,7 +140,7 @@ def __init__(self, else: self.mase_coefficient = 1.0 self.only_has_past_targets = only_has_past_targets - self.known_future_features = known_future_features + self.known_future_features_index = known_future_features_index self.transform_time_features = False self._cached_time_features: Optional[np.ndarray] = time_features @@ -145,15 +165,14 @@ def __getitem__(self, index: int, train: bool = 
True) \ index = self.__len__() + index if self.X is not None: - if hasattr(self.X, 'loc'): + if hasattr(self.X, 'iloc'): past_features = self.X.iloc[:index + 1] else: past_features = self.X[:index + 1] - if self.known_future_features: - future_features = self.X.iloc[ - index + 1: index + self.n_prediction_steps + 1, self.known_future_features - ] + if self.known_future_features_index: + future_features = self.X[index + 1: index + self.n_prediction_steps + 1, + self.known_future_features_index] else: future_features = None else: @@ -169,10 +188,15 @@ def __getitem__(self, index: int, train: bool = True) \ else: past_features = self._cached_time_features[:index + 1] if future_features is not None: - future_features = np.hstack([ - future_features, - self._cached_time_features[index + 1:index + self.n_prediction_steps + 1] - ]) + try: + future_features = np.hstack([ + future_features, + self._cached_time_features[index + 1:index + self.n_prediction_steps + 1] + ]) + except Exception: + import pdb + + pdb.set_trace() else: future_features = self._cached_time_features[index + 1:index + self.n_prediction_steps + 1] @@ -302,8 +326,7 @@ def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": train_transforms=self.train_transform, val_transforms=self.val_transform, n_prediction_steps=self.n_prediction_steps, - static_features=self.static_features, - known_future_features=self.known_future_features, + known_future_features_index=self.known_future_features_index, sp=self.sp, only_has_past_targets=True, compute_mase_coefficient_value=False, @@ -568,6 +591,9 @@ def __init__(self, if known_future_features is None: known_future_features = tuple() + known_future_features_index = extract_feature_index(self.feature_shapes, + self.feature_names, + queried_features=known_future_features) # initialize datasets sequences_kwargs = {"freq": self.freq, @@ -576,8 +602,7 @@ def __init__(self, "val_transforms": self.val_transform, "n_prediction_steps": n_prediction_steps, "sp": self.seasonality, - "known_future_features": known_future_features, - "static_features": self.static_features} + "known_future_features_index": known_future_features_index} sequence_datasets, train_tensors, test_tensors = self.make_sequences_datasets( X=X, Y=Y, @@ -819,7 +844,10 @@ def make_sequences_datasets(self, return sequence_datasets, train_tensors, test_tensors - def replace_data(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame]) -> 'BaseDataset': + def replace_data(self, + X_train: pd.DataFrame, + X_test: Optional[pd.DataFrame], + known_future_features_index: List[int] = []) -> 'BaseDataset': super(TimeSeriesForecastingDataset, self).replace_data(X_train=X_train, X_test=X_test) if X_train is None: return self @@ -831,6 +859,7 @@ def replace_data(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame]) -> seq.X = x_ser if X_test is not None: seq.X_test = X_test_group.get_group(ser_id).transform(np.array).values + seq.known_future_features_index = known_future_features_index return self diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index e63bb011a..907412f9f 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -12,6 +12,7 @@ ) from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler +from 
autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( EncoderNetwork, @@ -194,6 +195,11 @@ def __init__(self, super().__init__() self.network_structure = network_structure self.embedding = network_embedding + if len(known_future_features) > 0: + known_future_features_idx = [feature_names.index(kff) for kff in known_future_features] + self.embedding_future = self.embedding.get_partial_models(known_future_features_idx) + else: + self.embedding_future = _NoEmbedding() # modules that generate tensors while doing forward pass self.lazy_modules = [] if network_structure.variable_selection: @@ -379,7 +385,7 @@ def pre_processing(self, x_static = {} if length_past > 0: if past_features is not None: - past_features = past_features[:, -self.window_size:].to(self.device) + past_features = self.embedding(past_features[:, -self.window_size:].to(self.device)) x_past = {'past_targets': x_past.to(device=self.device)} if past_features is not None: @@ -388,8 +394,7 @@ def pre_processing(self, if feature_name not in self.variable_selector.static_features: x_past[feature_name] = past_features[:, :, tensor_idx[0]: tensor_idx[1]] else: - static_feature = past_features[:, [0], tensor_idx[0]: tensor_idx[1]] - static_feature = static_feature.repeat(1, length_past + length_future, 1) + static_feature = past_features[:, 0, tensor_idx[0]: tensor_idx[1]] x_static[feature_name] = static_feature if hasattr(self.variable_selector, 'placeholder_features'): @@ -401,7 +406,7 @@ def pre_processing(self, x_past = None if length_future > 0: if future_features is not None: - future_features = future_features.to(self.device) + future_features = self.embedding_future(future_features.to(self.device)) x_future = {} if hasattr(self.variable_selector, 'placeholder_features'): for placehold in self.variable_selector.placeholder_features: @@ -418,8 +423,7 @@ def pre_processing(self, x_future[feature_name] = future_features[:, :, tensor_idx[0]: tensor_idx[1]] else: if length_past == 0: - static_feature = future_features[:, [0], tensor_idx[0]: tensor_idx[1]] - static_feature = static_feature.repeat(1, length_past + length_future, 1) + static_feature = future_features[:, 0, tensor_idx[0]: tensor_idx[1]] x_static[feature_name] = static_feature else: @@ -444,7 +448,6 @@ def pre_processing(self, x_past = x_past.to(device=self.device) if future_features is not None: future_features = future_features.to(self.device) - x_past = self.embedding(x_past) # TODO embedding for future features! 
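For illustration, the idea behind the partial embedding wired up above for the decoder inputs can be sketched as follows: the full embedding covers all past features, while the known-future features are routed through only their own embedding layers. The toy module and shapes below are assumptions made for the example, not the library's actual classes.

    import torch
    import torch.nn as nn

    class PerFeatureEmbedding(nn.Module):
        """Toy stand-in for an entity embedding: one linear layer per input feature."""

        def __init__(self, layers: nn.ModuleList):
            super().__init__()
            self.layers = layers

        def get_partial_models(self, subset_features):
            # Reuse only the layers belonging to the requested feature subset,
            # analogous to the embedding_future created for known-future features above.
            return PerFeatureEmbedding(nn.ModuleList([self.layers[i] for i in subset_features]))

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # x: [batch, time, n_features]; embed every feature column and concatenate.
            return torch.cat([layer(x[..., [i]]) for i, layer in enumerate(self.layers)], dim=-1)

    embedding = PerFeatureEmbedding(nn.ModuleList([nn.Linear(1, 3) for _ in range(4)]))
    embedding_future = embedding.get_partial_models([1, 3])  # features also known in the future

    past = torch.randn(8, 20, 4)    # all features are observed in the past
    future = torch.randn(8, 5, 2)   # only the known-future columns exist for the horizon
    print(embedding(past).shape, embedding_future(future).shape)  # (8, 20, 12) and (8, 5, 6)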
return x_past, future_features, None, loc, scale, None, past_targets def forward(self, @@ -544,7 +547,7 @@ def decoder_select_variable(self, future_targets: torch.tensor, future_features: length_future = future_targets.shape[1] future_targets = future_targets.to(self.device) if future_features is not None: - future_features = future_features.to(self.device) + future_features = self.embedding_future(future_features.to(self.device)) x_future = {} if hasattr(self.variable_selector, 'placeholder_features'): for placeholder in self.variable_selector.placeholder_features: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 3f337e905..5e1bee7be 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -234,7 +234,7 @@ def __init__(self, static_features_input_size = {} # static_features should always be known beforehand - known_future_features = tuple(set(known_future_features) | static_features) + known_future_features = tuple(known_future_features) if feature_names: for name in feature_names: @@ -242,17 +242,19 @@ def __init__(self, feature_names2tensor_idx[name] = [idx_tracker, idx_tracker + feature_shape] idx_tracker += feature_shape pre_scalar[name] = nn.Linear(feature_shape, self.hidden_size) - encoder_input_sizes[name] = self.hidden_size - if name in known_future_features: - decoder_input_sizes[name] = self.hidden_size if name in static_features: static_features_input_size[name] = self.hidden_size + else: + encoder_input_sizes[name] = self.hidden_size + if name in known_future_features: + decoder_input_sizes[name] = self.hidden_size for future_name in known_future_features: feature_shape = feature_shapes[future_name] future_feature_name2tensor_idx[future_name] = [idx_tracker_future, idx_tracker_future + feature_shape] idx_tracker_future += feature_shape + if time_feature_names: for name in time_feature_names: feature_names2tensor_idx[name] = [idx_tracker, idx_tracker+1] @@ -294,17 +296,17 @@ def __init__(self, pre_scalar.update({'future_prediction': nn.Linear(dataset_properties['output_shape'][-1], self.hidden_size)}) decoder_input_sizes.update({'future_prediction': self.hidden_size}) - self.pre_scalars = {nn.ModuleDict(pre_scalar)} + self.pre_scalars = nn.ModuleDict(pre_scalar) self._device = torch.device('cpu') if not dataset_properties['uni_variant']: - # TODO self.static_variable_selection = VariableSelectionNetwork( input_sizes=static_features_input_size, hidden_size=self.hidden_size, input_embedding_flags={}, dropout=network_structure.grn_dropout_rate, + prescalers=self.pre_scalars ) self.static_input_sizes = static_features_input_size self.static_features = static_features diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index 3823f75c0..7ae0dd894 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -19,13 +19,19 @@ class _LearnedEntityEmbedding(nn.Module): """ Learned entity embedding module for categorical features""" - def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_numerical_features: int): + def __init__(self, + config: 
Dict[str, Any], + num_input_features: np.ndarray, + num_numerical_features: int): """ Args: config (Dict[str, Any]): The configuration sampled by the hyperparameter optimizer num_input_features (np.ndarray): column wise information of number of output columns after transformation for each categorical column and 0 for numerical columns num_numerical_features (int): number of numerical features in X + num_output_dimensions Optional[List[int]]: number of output dimensions, this is applied to quickly + construct a new Embedding network + ee_layers (Optional[nn.Module]) """ super().__init__() self.config = config @@ -48,10 +54,39 @@ def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_n self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in zip(self.num_output_dimensions, self.embed_features, self.num_input_features)] + self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) self.ee_layers = self._create_ee_layers() + def get_partial_models(self, subset_features: List[int]) -> "_LearnedEntityEmbedding": + """ + extract a partial models that only works on a subset of the data that ought to be passed to the embedding + network, this function is implemented for time series forecasting tasks where the known future features is only + a subset of the known past features + Args: + subset_features: a set of index identifying which features will pass through the partial model + + Returns: + partial_model (_LearnedEntityEmbedding) a new partial model + """ + num_input_features = self.num_input_features[subset_features] + num_numerical_features = sum([sf < self.num_numerical for sf in subset_features]) + + num_output_dimensions = [self.num_output_dimensions[sf] for sf in subset_features] + embed_features = [self.embed_features[sf] for sf in subset_features] + + ee_layers = [] + ee_layer_tracker = 0 + for sf in subset_features: + if self.embed_features[sf]: + ee_layers.append(self.ee_layers[ee_layer_tracker]) + ee_layer_tracker += 1 + ee_layers = nn.ModuleList(ee_layers) + + return PartialLearnedEntityEmbedding(num_input_features, num_numerical_features, embed_features, + num_output_dimensions, ee_layers) + def forward(self, x: torch.Tensor) -> torch.Tensor: # pass the columns of each categorical feature through entity embedding layer # before passing it through the model @@ -64,15 +99,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x_pointer += 1 continue if x_pointer > last_concat: - concat_seq.append(x[:, last_concat: x_pointer]) - categorical_feature_slice = x[:, x_pointer: x_pointer + num_in] + concat_seq.append(x[..., last_concat: x_pointer]) + categorical_feature_slice = x[..., x_pointer: x_pointer + num_in] concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice)) layer_pointer += 1 x_pointer += num_in last_concat = x_pointer - concat_seq.append(x[:, last_concat:]) - return torch.cat(concat_seq, dim=1) + concat_seq.append(x[..., last_concat:]) + return torch.cat(concat_seq, dim=-1) def _create_ee_layers(self) -> nn.ModuleList: # entity embeding layers are Linear Layers @@ -85,6 +120,31 @@ def _create_ee_layers(self) -> nn.ModuleList: return layers +class PartialLearnedEntityEmbedding(_LearnedEntityEmbedding): + def __init__(self, + num_input_features: np.ndarray, + num_numerical_features: int, + embed_features: List[bool], + num_output_dimensions: Optional[List[int]], + ee_layers: nn.Module + ): + super(_LearnedEntityEmbedding, self).__init__() + self.num_numerical = num_numerical_features + # 
list of number of categories of categorical data + # or 0 for numerical data + self.num_input_features = num_input_features + categorical_features = self.num_input_features > 0 + + self.num_categorical_features = self.num_input_features[categorical_features] + + self.embed_features = embed_features + + self.num_output_dimensions = num_output_dimensions + self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) + + self.ee_layers = ee_layers + + class LearnedEntityEmbedding(NetworkEmbeddingComponent): """ Class to learn an embedding for categorical hyperparameters. @@ -97,6 +157,7 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None, **kwarg def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> Tuple[nn.Module, List[int]]: + embedding = _LearnedEntityEmbedding(config=self.config, num_input_features=num_input_features, num_numerical_features=num_numerical_features) diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py index 9dabac6ad..31afb3d51 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py @@ -12,6 +12,9 @@ class _NoEmbedding(nn.Module): + def get_partial_models(self, subset_features: List[int]) -> "_NoEmbedding": + return self + def forward(self, x: torch.Tensor) -> torch.Tensor: return x diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 34ebb3263..1710ddba0 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Union, Sequence, List, Iterator, Sized +from typing import Any, Dict, Optional, Union, Tuple, List import warnings from functools import partial @@ -14,8 +14,12 @@ import torchvision -from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence +from autoPyTorch.datasets.time_series_dataset import ( + TimeSeriesForecastingDataset, + TimeSeriesSequence, + extract_feature_index) from autoPyTorch.utils.common import ( + FitRequirement, HyperparameterSearchSpace, custom_collate_fn, add_hyperparameter, @@ -77,8 +81,7 @@ def __init__(self, self.num_batches_per_epoch = num_batches_per_epoch if num_batches_per_epoch is not None else np.inf self.padding_collector = None - self.static_features = None - self.known_future_features = None + self.known_future_features_index = None self._is_uni_variant = False self.transform_time_features = transform_time_features @@ -86,6 +89,11 @@ def __init__(self, self.time_feature_transform = [] self.dataset_columns = [] + self.add_fit_requirements( + [FitRequirement("known_future_features", (Tuple,), user_defined=True, dataset_property=True), + FitRequirement("feature_shapes", (Dict,), user_defined=True, dataset_property=True), + FitRequirement("feature_names", (Tuple, ), user_defined=True, dataset_property=True)]) + def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: """ Fits a component by using an input dictionary with pre-requisites @@ -106,14 +114,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: if 
self.backcast: self.window_size = self.backcast_period * self.n_prediction_steps - self.static_features = datamanager.static_features - self.known_future_features = datamanager.known_future_features - # this value corresponds to budget type resolution sample_interval = X.get('sample_interval', 1) padding_value = X.get('required_padding_value', 0.0) - if sample_interval > 1: # for lower resolution, window_size should be smaller self.window_size = (self.window_size - 1) // sample_interval + 1 @@ -123,7 +127,15 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.dataset_columns = datamanager.feature_names - self.padding_collector = PadSequenceCollector(self.window_size, sample_interval, padding_value, max_lagged_value) + known_future_features_index = extract_feature_index( + feature_shapes=X['dataset_properties']['feature_shapes'], + feature_names=X['dataset_properties']['feature_names'], + queried_features=X['dataset_properties']['known_future_features'] + ) + self.known_future_features_index = tuple(known_future_features_index) + + self.padding_collector = PadSequenceCollector(self.window_size, sample_interval, padding_value, + max_lagged_value) # this value corresponds to budget type num_sequence fraction_seq = X.get('fraction_seq', 1.0) @@ -143,11 +155,14 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.val_transform, train=False, ) + datamanager.transform_time_features = self.transform_time_features if X['dataset_properties']["is_small_preprocess"]: # This parameter indicates that the data has been pre-processed for speed # Overwrite the datamanager with the pre-processes data - datamanager.replace_data(X['X_train'], X['X_test'] if 'X_test' in X else None) + datamanager.replace_data(X['X_train'], + X['X_test'] if 'X_test' in X else None, + known_future_features_index=known_future_features_index) self.dataset_small_preprocess = True else: self.dataset_small_preprocess = False @@ -183,7 +198,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: # create masks for masking seq_idx_inactivate = np.where(self.random_state.rand(seq_train_length.size) > fraction_seq)[0] if len(seq_idx_inactivate) == seq_train_length.size: - seq_idx_inactivate = self.random_state.choice(seq_idx_inactivate, len(seq_idx_inactivate)-1, replace=False ) + seq_idx_inactivate = self.random_state.choice(seq_idx_inactivate, len(seq_idx_inactivate) - 1, + replace=False) # this budget will reduce the number of samples inside each sequence, e.g., the samples becomes more sparse """ @@ -314,13 +330,14 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd if self.dataset_small_preprocess and not self._is_uni_variant: x_seq.X = x_all.get_group(i).transform(np.array).values - - x_seq.update_attribute(freq=self.freq, - transform_time_features=self.transform_time_features, - time_feature_transform=self.time_feature_transform, - static_features=self.static_features, - known_future_features=self.known_future_features, - ) + update_dict = {"known_future_features_index": self.known_future_features_index} + else: + update_dict = {} + update_dict.update(dict(freq=self.freq, + transform_time_features=self.transform_time_features, + time_feature_transform=self.time_feature_transform, )) + + x_seq.update_attribute(**update_dict) if self.transform_time_features: x_seq.compute_time_features() @@ -435,8 +452,8 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, if seq_length_max 
<= window_size.value_range[0]: warnings.warn('The base window_size is larger than the maximal sequence length in the dataset,' 'we simply set it as a constant value with maximal sequence length') - window_size = HyperparameterSearchSpace(hyperparameter=window_size.hyperparameter, - value_range=(seq_length_max, ), + window_size = HyperparameterSearchSpace(hyperparameter=window_size.hyperparameter, + value_range=(seq_length_max,), default_value=seq_length_max) window_size = get_hyperparameter(window_size, Constant) else: From e8c9071399ce2ef6e519696d468d4f3bc2431a56 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 28 Apr 2022 20:58:23 +0200 Subject: [PATCH 229/347] test sets --- .../data/time_series_forecasting_validator.py | 11 ------ autoPyTorch/datasets/time_series_dataset.py | 37 ++++++------------- 2 files changed, 11 insertions(+), 37 deletions(-) diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index ecba891fa..685a88f9e 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -50,7 +50,6 @@ def fit( X_test: Optional[Union[List, pd.DataFrame]] = None, y_test: Optional[Union[List, pd.DataFrame]] = None, start_times_train: Optional[List[pd.DatetimeIndex]] = None, - start_times_test: Optional[List[pd.DatetimeIndex]] = None, freq: str = '1Y', n_prediction_steps: int = 1, known_future_features: Optional[List[Union[int, str]]] = None, @@ -71,23 +70,13 @@ def fit( self.series_idx = series_idx self.n_prediction_steps = n_prediction_steps - if y_test is not None and bool(start_times_test) != bool(start_times_train): - warnings.warn('One of start_times_test or start_times_train is missing! This might result in the ' - 'risk of not proper evaluated ') - if start_times_train is None: start_times_train = [pd.DatetimeIndex(pd.to_datetime(['2000-01-01']), freq=freq)] * len(y_train) else: assert len(start_times_train) == len(y_train), 'start_times_train must have the same length as y_train!' - if y_test is not None: - if start_times_test is None: - start_times_test = [pd.DatetimeIndex(pd.to_datetime(['1900-01-01']), freq=freq)] * len(y_test) - else: - assert len(start_times_train) == len(y_train), 'start_times_train must have the same length as y_train!' 
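As a rough worked example of the window-size handling in the data loader above (the function name here is only for illustration): a lower-resolution budget with sample_interval > 1 shrinks the effective window so that it still spans roughly the same period, and the search-space fallback turns window_size into a constant when even the longest sequence is shorter than the smallest allowed value.

    def effective_window_size(window_size: int, sample_interval: int = 1) -> int:
        # Same arithmetic as in the loader's fit(): a coarser sampling interval needs
        # fewer steps to cover the same time span.
        if sample_interval > 1:
            return (window_size - 1) // sample_interval + 1
        return window_size

    print(effective_window_size(50, 1))  # 50
    print(effective_window_size(50, 4))  # 13, roughly 50 / 4 rounded up
    print(effective_window_size(50, 8))  # 7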
self.start_times_train = start_times_train - self.start_times_test = start_times_test if X_train is None: self._is_uni_variant = True diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 9097a3b99..ef7ac4d6d 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -83,7 +83,6 @@ def __init__(self, time_feature_transform: List[TimeFeature] = [], X_test: Optional[np.ndarray] = None, Y_test: Optional[np.ndarray] = None, - start_time_test: Optional[pd.DatetimeIndex] = None, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, n_prediction_steps: int = 0, @@ -92,7 +91,6 @@ def __init__(self, only_has_past_targets: bool = False, compute_mase_coefficient_value: bool = True, time_features=None, - time_features_test=None, is_test_set=False, ): """ @@ -118,7 +116,6 @@ def __init__(self, self.X_test = X_test self.Y_tet = Y_test - self.start_time_test = start_time_test self.time_feature_transform = time_feature_transform @@ -352,10 +349,10 @@ class TimeSeriesForecastingDataset(BaseDataset, ConcatDataset): cumulative_sizes: List[int] def __init__(self, - X: Optional[Union[np.ndarray, List[List]]], - Y: Union[np.ndarray, pd.Series], - X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, - Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, + X: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]], + Y: Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]], + X_test: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, + Y_test: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, start_times_train: Optional[List[pd.DatetimeIndex]] = None, start_times_test: Optional[List[pd.DatetimeIndex]] = None, known_future_features: Optional[Tuple[str]] = None, @@ -478,10 +475,8 @@ def __init__(self, if Y_test is not None: X_test, Y_test, self.sequence_lengths_tests = self.validator.transform(X_test, Y_test) - time_features_test = self.compute_time_features(self.start_times_test, self.sequence_lengths_tests) else: self.sequence_lengths_tests = None - time_features_test = None y_groups = Y.groupby(Y.index) if normalize_y: @@ -604,13 +599,12 @@ def __init__(self, "sp": self.seasonality, "known_future_features_index": known_future_features_index} - sequence_datasets, train_tensors, test_tensors = self.make_sequences_datasets( + sequence_datasets, train_tensors = self.make_sequences_datasets( X=X, Y=Y, X_test=X_test, Y_test=Y_test, start_times_train=self.start_times_train, start_times_test=self.start_times_test, time_features_train=time_features_train, - time_features_test=time_features_test, **sequences_kwargs) self.normalize_y = normalize_y @@ -628,7 +622,7 @@ def __init__(self, self.train_tensors = train_tensors - self.test_tensors = test_tensors + self.test_tensors = None # Test tensor is not applied to forecasting tasks self.val_tensors = None self.task_type: Optional[str] = None @@ -764,11 +758,9 @@ def make_sequences_datasets(self, X_test: Optional[pd.DataFrame] = None, Y_test: Optional[pd.DataFrame] = None, start_times_test: Optional[List[pd.DatetimeIndex]] = None, - time_features_test: Optional[Dict[pd.Timestamp, np.ndarray]] = None, **sequences_kwargs: Optional[Dict]) -> Tuple[ List[TimeSeriesSequence], - Tuple[Optional[pd.DataFrame], pd.DataFrame], - Optional[Tuple[pd.DataFrame, pd.DataFrame]] + Tuple[Optional[pd.DataFrame], pd.DataFrame] ]: """ build a 
series time sequence datasets @@ -798,8 +790,6 @@ def make_sequences_datasets(self, a train_tensors: Tuple[List[np.ndarray], List[np.ndarray]] training tensors - test_tensors: Option[Tuple List[np.ndarray, List[np.ndarray]] - test tensors """ sequence_datasets = [] @@ -821,7 +811,6 @@ def make_sequences_datasets(self, x_test_ser = x_test_group.get_group(ser_id).transform(np.array).values if X_test is not None else None start_test = None if start_times_test is None else start_times_test[i_ser] - time_feature_test = None if time_features_test is None else time_features_test[start_test][:len(y_test_ser)] sequence = TimeSeriesSequence( X=x_ser, @@ -829,20 +818,13 @@ def make_sequences_datasets(self, start_time_train=start_train, X_test=x_test_ser, Y_test=y_test_ser, - start_time_test=start_test, time_features=time_features_train[start_train][:len(y_ser)], - time_features_test=time_feature_test, **sequences_kwargs) sequence_datasets.append(sequence) train_tensors = (X, Y) - if Y_test is None: - test_tensors = None - else: - # test_tensors = (X_test_seq_all, Y_test_seq_all) - test_tensors = (X_test, Y_test) - return sequence_datasets, train_tensors, test_tensors + return sequence_datasets, train_tensors def replace_data(self, X_train: pd.DataFrame, @@ -1122,4 +1104,7 @@ def generate_test_seqs(self) -> List[TimeSeriesSequence]: for test_seq in test_sets: test_seq.is_test_set = True test_seq.only_has_past_targets = True + if len(self.known_future_features) > 0 and test_seq.X_test is None: + raise ValueError("If future features are required, X_test must be given!") + test_seq.X = np.concatenate([test_seq.X, test_seq.X_test]) return test_sets From 27790156367dd2030fee8cbb2fcccbf54d7bb502 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 29 Apr 2022 13:00:14 +0200 Subject: [PATCH 230/347] adjust dataset to allow future known features --- autoPyTorch/api/time_series_forecasting.py | 22 +- .../data/time_series_forecasting_validator.py | 109 +++-- autoPyTorch/datasets/time_series_dataset.py | 407 +++++++++++------- 3 files changed, 342 insertions(+), 196 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index e70083f47..dca66b3d3 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -455,7 +455,8 @@ def predict( X_test: Optional[List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]], pd.DataFrame] = None, batch_size: Optional[int] = None, n_jobs: int = 1, - targets_tests: Optional[List[np.ndarray]] = None, + past_targets: Optional[List[np.ndarray]] = None, + future_targets: Optional[List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]], pd.DataFrame] = None, start_times: List[pd.DatetimeIndex] = [] ) -> np.ndarray: """ @@ -463,9 +464,20 @@ def predict( (used for multi-variable prediction), indicates which value needs to be predicted """ if not isinstance(X_test[0], TimeSeriesSequence): - # Validate and construct TimeSeriesSequence TODO - pass + # Validate and construct TimeSeriesSequence + X_test, _ = self.dataset.transform_data_into_time_series_sequence(X=X_test, + Y=past_targets, + X_test=future_targets, + start_times=start_times, + is_test_set=True + ) flattened_res = super(TimeSeriesForecastingTask, self).predict(X_test, batch_size, n_jobs) if self.dataset.num_target == 1: - return flattened_res.reshape([-1, self.dataset.n_prediction_steps]) - return flattened_res.reshape([-1, self.dataset.n_prediction_steps, self.dataset.num_target]) + forecasting = 
flattened_res.reshape([-1, self.dataset.n_prediction_steps]) + else: + forecasting = flattened_res.reshape([-1, self.dataset.n_prediction_steps, self.dataset.num_target]) + if self.dataset.normalize_y: + mean = np.repeat(self.dataset.y_mean.values(), self.dataset.n_prediction_steps) + std = np.repeat(self.dataset.y_std.values(), self.dataset.n_prediction_steps) + return forecasting * std + mean + return forecasting diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 685a88f9e..56b8b1cd1 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -155,58 +155,87 @@ def transform( self, X: Optional[Union[List, pd.DataFrame]], y: Optional[Union[List, pd.DataFrame]] = None, - ) -> Tuple[Optional[pd.DataFrame], pd.DataFrame, List[int]]: + validate_for_future_features: bool = False + ) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame], List[int]]: + """ + transform the data with the fitted validator + Args: + validate_for_future_features: bool + if the validator is applied to transform future features (for test sets), in this case we only validate + X + """ if not self._is_fitted: raise NotFittedError("Cannot call transform on a validator that is not fitted") - - if y is None: - raise ValueError('Targets must be given!') - - if isinstance(y, List): - num_sequences = len(y) - sequence_lengths = [0] * num_sequences - if self._is_uni_variant: - num_features = 0 + if validate_for_future_features: + if X is None: + return None, None, [] + if isinstance(X, List): + num_sequences = len(X) + sequence_lengths = [0] * num_sequences + for seq_idx in range(num_sequences): + sequence_lengths[seq_idx] = len(X[seq_idx]) + sequence_lengths = np.asarray(sequence_lengths) + x_transformed, _ = self._transform_X(X, sequence_lengths) + return x_transformed, None, sequence_lengths else: - if X is None: - raise ValueError('Multi Variant dataset requires X as input!') - num_features = self.feature_validator.num_features - assert len(X) == len(y), "Length of features must equal to length of targets!" + raise NotImplementedError - for seq_idx in range(num_sequences): - sequence_lengths[seq_idx] = len(y[seq_idx]) - sequence_lengths = np.asarray(sequence_lengths) - - y_stacked = self.join_series(y) + else: + if y is None: + raise ValueError('Targets must be given!') - if self.series_idx is None: - series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) + if isinstance(y, List): + num_sequences = len(y) + sequence_lengths = [0] * num_sequences if not self._is_uni_variant: - x_stacked = self.join_series(X) - x_transformed = self.feature_validator.transform(x_stacked, - index=series_number) + if X is None: + raise ValueError('Multi Variant dataset requires X as input!') + assert len(X) == len(y), "Length of features must equal to length of targets!" 
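A small numpy sketch of roughly what the predict() post-processing above does, assuming two target series, a horizon of three steps and per-series target normalisation; all concrete numbers are invented for the example.

    import numpy as np

    n_prediction_steps = 3
    flattened_res = np.array([0.1, 0.2, 0.3, -0.5, 0.0, 0.5])  # normalised forecasts, two series stacked

    forecasting = flattened_res.reshape([-1, n_prediction_steps])  # shape (2, 3): one row per series

    # Undo the per-series normalisation: each series' mean/std is repeated over its horizon.
    y_mean = np.array([10.0, 100.0])
    y_std = np.array([2.0, 5.0])
    mean = np.repeat(y_mean, n_prediction_steps).reshape([-1, n_prediction_steps])
    std = np.repeat(y_std, n_prediction_steps).reshape([-1, n_prediction_steps])

    print(forecasting * std + mean)
    # [[ 10.2  10.4  10.6]
    #  [ 97.5 100.  102.5]]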
- else: - # In this case X can only contain pd.DataFrame, see ```time_series_feature_validator.py``` - x_flat = pd.concat(X) - x_columns = x_flat.columns - for ser_id in self.series_idx: - if ser_id not in x_columns: - raise ValueError(f'{ser_id} does not exist in input feature X') + for seq_idx in range(num_sequences): + sequence_lengths[seq_idx] = len(y[seq_idx]) + sequence_lengths = np.asarray(sequence_lengths) - series_number = pd.MultiIndex.from_frame(x_flat[self.series_idx]) + y_stacked = self.join_series(y) - if not self._is_uni_variant: - x_transformed = self.feature_validator.transform(x_flat.drop(self.series_idx, axis=1), - index=series_number) - y_transformed: pd.DataFrame = self.target_validator.transform(y_stacked, index=series_number) + x_transformed, series_number = self._transform_X(X, sequence_lengths) + y_transformed: pd.DataFrame = self.target_validator.transform(y_stacked, index=series_number) - if self._is_uni_variant: - return None, y_transformed, sequence_lengths + if self._is_uni_variant: + return None, y_transformed, sequence_lengths - return x_transformed, y_transformed, sequence_lengths + return x_transformed, y_transformed, sequence_lengths + else: + raise NotImplementedError + + def _transform_X(self, + X: Optional[Union[List, pd.DataFrame]], + sequence_lengths: np.ndarray) -> Tuple[pd.DataFrame, Union[np.ndarray, pd.Index]]: + if self.series_idx is None: + series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) + if not self._is_uni_variant: + x_stacked = self.join_series(X) + x_transformed = self.feature_validator.transform(x_stacked, + index=series_number) + else: + x_transformed = None else: - raise NotImplementedError + # In this case X can only contain pd.DataFrame, see ```time_series_feature_validator.py``` + x_stacked = pd.concat(X) + x_columns = x_stacked.columns + for ser_id in self.series_idx: + if ser_id not in x_columns: + raise ValueError(f'{ser_id} does not exist in input feature X') + + series_number = pd.MultiIndex.from_frame(x_stacked[self.series_idx]) + + if not self._is_uni_variant: + x_transformed = self.feature_validator.transform(x_stacked.drop(self.series_idx, axis=1), + index=series_number) + else: + x_transformed = None + + return x_transformed, series_number @staticmethod def join_series(X: List[SupportedFeatTypes], diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index ef7ac4d6d..35f3d7ac0 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -1,5 +1,6 @@ import os -from typing import Any, Dict, List, Optional, Tuple, Union, cast, Set +from typing import Any, Dict, List, Optional, Tuple, Union, cast +from numbers import Real import uuid import bisect import copy @@ -8,7 +9,6 @@ import numpy as np import pandas as pd -from pandas._libs.tslibs import to_offset from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from scipy.sparse import issparse @@ -23,7 +23,7 @@ TASK_TYPES_TO_STRING, TIMESERIES_FORECASTING, ) -from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetInputType, type_of_target +from autoPyTorch.datasets.base_dataset import BaseDataset, type_of_target from autoPyTorch.datasets.resampling_strategy import ( CrossValFuncs, CrossValTypes, @@ -88,7 +88,6 @@ def __init__(self, n_prediction_steps: int = 0, sp: int = 1, known_future_features_index: Optional[List[int]] = None, - only_has_past_targets: bool = False, compute_mase_coefficient_value: bool = True, time_features=None, 
is_test_set=False, @@ -127,7 +126,7 @@ def __init__(self, self.val_transform = val_transforms self.sp = sp if compute_mase_coefficient_value: - if only_has_past_targets: + if is_test_set: self.mase_coefficient = compute_mase_coefficient(self.Y, sp=self.sp, n_prediction_steps=n_prediction_steps) else: @@ -136,7 +135,6 @@ def __init__(self, else: self.mase_coefficient = 1.0 - self.only_has_past_targets = only_has_past_targets self.known_future_features_index = known_future_features_index self.transform_time_features = False @@ -372,14 +370,11 @@ def __init__(self, normalize_y: bool = False, ): """ - :param target_variables: Optional[Union[Tuple[int], int]] used for multi-variant forecasting tasks, the target_variables indicates which values in X corresponds to Y. - TODO add supports on X for pandas and target variables can be str or Tuple[str] :param freq: Optional[Union[str, int]] frequency of the series sequences, used to determine the (possible) period :param lagged_value: lagged values applied to RNN and Transformer that allows them to use previous data :param n_prediction_steps: The number of steps you want to forecast into the future - :param shift_input_data: bool if the input X and targets needs to be shifted to be aligned: such that the data until X[t] is applied to predict the value y[t+n_prediction_steps] :param normalize_y: bool @@ -468,15 +463,34 @@ def __init__(self, self.time_feature_transform = time_feature_transform self.time_feature_names = tuple([f'time_feature_{t.__class__.__name__}' for t in self.time_feature_transform]) - # Time features are lazily generated, we do not count them as either numerical_columns or categorical columns + # We also need to be able to transform the data, be it for pre-processing + # or for augmentation + self.train_transform = train_transforms + self.val_transform = val_transforms + + if known_future_features is None: + known_future_features = tuple() + known_future_features_index = extract_feature_index(self.feature_shapes, + self.feature_names, + queried_features=known_future_features) + + # initialize datasets + self.sequences_builder_kwargs = {"freq": self.freq, + "time_feature_transform": self.time_feature_transform, + "train_transforms": self.train_transform, + "val_transforms": self.val_transform, + "n_prediction_steps": n_prediction_steps, + "sp": self.seasonality, + "known_future_features_index": known_future_features_index} X, Y, sequence_lengths = self.validator.transform(X, Y) - time_features_train = self.compute_time_features(self.start_times_train, sequence_lengths) + time_features_train = self.compute_time_features(self.start_times_train, + sequence_lengths, + self.freq, + self.time_feature_transform) if Y_test is not None: - X_test, Y_test, self.sequence_lengths_tests = self.validator.transform(X_test, Y_test) - else: - self.sequence_lengths_tests = None + X_test, Y_test, _ = self.validator.transform(X_test, Y_test) y_groups = Y.groupby(Y.index) if normalize_y: @@ -492,125 +506,29 @@ def __init__(self, std[std == 0] = 1. 
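For the extract_feature_index helper used above to resolve known_future_features, a worked example with made-up feature shapes may help: after preprocessing, every feature owns a contiguous block of columns in the flattened feature matrix, and the helper returns the sorted column indices of the queried features. The function below is a simplified re-implementation for illustration, not the one from the patch.

    def extract_feature_index_sketch(feature_shapes, feature_names, queried_features):
        # Walk over the features in order, tracking where each block of columns starts.
        index = []
        start = 0
        for name in feature_names:
            width = feature_shapes[name]
            if name in queried_features:
                index.extend(range(start, start + width))
            start += width
        return tuple(sorted(index))

    feature_names = ('price', 'weekday', 'store_id')             # invented example features
    feature_shapes = {'price': 1, 'weekday': 7, 'store_id': 4}   # e.g. widths after one-hot encoding

    # 'weekday' occupies columns 1..7 and 'store_id' columns 8..11 of the flattened features.
    print(extract_feature_index_sketch(feature_shapes, feature_names, ('weekday', 'store_id')))
    # (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)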
Y_test = (Y_test[mean.columns] - mean) / std - self.shuffle = shuffle - self.random_state = np.random.RandomState(seed=seed) - - # check if dataset could be split with cross validation - minimal_seq_length = np.min(sequence_lengths) - n_prediction_steps - if isinstance(resampling_strategy, CrossValTypes): - num_splits = DEFAULT_RESAMPLING_PARAMETERS[resampling_strategy].get( - 'num_splits', None) - if resampling_strategy_args is not None: - num_splits = resampling_strategy_args.get('num_split', num_splits) - - if resampling_strategy != CrossValTypes.time_series_ts_cross_validation: - while minimal_seq_length - n_prediction_steps * num_splits <= 0: - num_splits -= 1 - - if num_splits >= 2: - resampling_strategy = CrossValTypes.time_series_cross_validation - if resampling_strategy_args is None: - resampling_strategy_args = {'num_splits': num_splits} - else: - resampling_strategy_args.update({'num_splits': num_splits}) - else: - warnings.warn('The dataset is not suitable for cross validation, we will apply holdout instead') - - resampling_strategy = HoldoutValTypes.time_series_hold_out_validation - resampling_strategy_args = None - else: - seasonality_h_value = int( - np.round((self.n_prediction_steps // int(self.freq_value) + 1) * self.freq_value)) - - while minimal_seq_length < (num_splits - 1) * freq_value + seasonality_h_value - n_prediction_steps: - if num_splits <= 2: - break - num_splits -= 1 - if resampling_strategy_args is None: - resampling_strategy_args = {'num_splits': num_splits} - else: - resampling_strategy_args.update({'num_splits': num_splits}) - - num_seqs = len(sequence_lengths) - - if resampling_strategy_args is not None and "n_repeat" not in resampling_strategy_args: - n_repeat = resampling_strategy_args["n_repeat"] - else: - n_repeat = None - if (num_seqs < 100 and minimal_seq_length > 10 * n_prediction_steps) or \ - minimal_seq_length > 50 * n_prediction_steps: - if n_repeat is None: - if num_seqs < 100: - n_repeat = int(np.ceil(100.0 / num_seqs)) - else: - n_repeat = int(np.round(minimal_seq_length / (50 * n_prediction_steps))) - - if resampling_strategy == CrossValTypes.time_series_cross_validation: - n_repeat = min(n_repeat, minimal_seq_length // (5 * n_prediction_steps * num_splits)) - elif resampling_strategy == CrossValTypes.time_series_ts_cross_validation: - seasonality_h_value = int(np.round( - (self.n_prediction_steps * n_repeat // int(self.freq_value) + 1) * self.freq_value) - ) - - while minimal_seq_length // 5 < (num_splits - 1) * n_repeat * freq_value \ - + seasonality_h_value - n_repeat * n_prediction_steps: - n_repeat -= 1 - seasonality_h_value = int(np.round( - (self.n_prediction_steps * n_repeat // int(self.freq_value) + 1) * self.freq_value) - ) - elif resampling_strategy == HoldoutValTypes.time_series_hold_out_validation: - n_repeat = min(n_repeat, minimal_seq_length // (5 * n_prediction_steps) - 1) - - else: - n_repeat = 1 - - n_repeat = max(n_repeat, 1) - if n_repeat is None: - n_repeat = 1 - - if resampling_strategy_args is None: - resampling_strategy_args = {'n_repeat': n_repeat} - else: - resampling_strategy_args.update({'n_repeat': n_repeat}) - - self.resampling_strategy = resampling_strategy - self.resampling_strategy_args = resampling_strategy_args - - # We also need to be able to transform the data, be it for pre-processing - # or for augmentation - self.train_transform = train_transforms - self.val_transform = val_transforms - - self.num_sequences = len(Y) - self.sequence_lengths_train = np.asarray(sequence_lengths) - n_prediction_steps - - 
if known_future_features is None: - known_future_features = tuple() - known_future_features_index = extract_feature_index(self.feature_shapes, - self.feature_names, - queried_features=known_future_features) - - # initialize datasets - sequences_kwargs = {"freq": self.freq, - "time_feature_transform": self.time_feature_transform, - "train_transforms": self.train_transform, - "val_transforms": self.val_transform, - "n_prediction_steps": n_prediction_steps, - "sp": self.seasonality, - "known_future_features_index": known_future_features_index} - sequence_datasets, train_tensors = self.make_sequences_datasets( X=X, Y=Y, X_test=X_test, Y_test=Y_test, start_times_train=self.start_times_train, start_times_test=self.start_times_test, time_features_train=time_features_train, - **sequences_kwargs) + **self.sequences_builder_kwargs) + self.normalize_y = normalize_y + sequence_datasets, train_tensors = self.transform_data_into_time_series_sequence(X, + Y, X_test, + Y_test, + + self.normalize_y) + ConcatDataset.__init__(self, datasets=sequence_datasets) + self.known_future_features = known_future_features + self.num_sequences = len(Y) + self.sequence_lengths_train = np.asarray(sequence_lengths) - n_prediction_steps + self.seq_length_min = int(np.min(self.sequence_lengths_train)) self.seq_length_median = int(np.median(self.sequence_lengths_train)) self.seq_length_max = int(np.max(self.sequence_lengths_train)) @@ -627,7 +545,7 @@ def __init__(self, self.task_type: Optional[str] = None self.issparse: bool = issparse(self.train_tensors[0]) - # TODO find a way to edit input shape! + self.input_shape: Tuple[int, int] = (self.seq_length_min, self.num_features) if known_future_features is None: @@ -659,12 +577,26 @@ def __init__(self, self.numerical_features: List[int] = self.numerical_columns self.categorical_features: List[int] = self.categorical_columns - if isinstance(resampling_strategy, CrossValTypes): - self.cross_validators = CrossValFuncs.get_cross_validators(resampling_strategy) + self.shuffle = shuffle + self.random_state = np.random.RandomState(seed=seed) + + resampling_strategy, resampling_strategy_args = self.get_split_strategy( + sequence_lengths=sequence_lengths, + n_prediction_steps=n_prediction_steps, + freq_value=self.freq_valueq, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args + ) + + self.resampling_strategy = resampling_strategy + self.resampling_strategy_args = resampling_strategy_args + + if isinstance(self.resampling_strategy, CrossValTypes): + self.cross_validators = CrossValFuncs.get_cross_validators(self.resampling_strategy) else: self.cross_validators = CrossValFuncs.get_cross_validators(CrossValTypes.time_series_cross_validation) - if isinstance(resampling_strategy, HoldoutValTypes): - self.holdout_validators = HoldOutFuncs.get_holdout_validators(resampling_strategy) + if isinstance(self.resampling_strategy, HoldoutValTypes): + self.holdout_validators = HoldOutFuncs.get_holdout_validators(self.resampling_strategy) else: self.holdout_validators = HoldOutFuncs.get_holdout_validators( @@ -681,9 +613,11 @@ def __init__(self, self.lagged_value = lagged_value - def compute_time_features(self, - start_times: List[pd.DatetimeIndex], - seq_lengths: List[int]) -> Dict[pd.DatetimeIndex, np.ndarray]: + @staticmethod + def compute_time_features(start_times: List[pd.DatetimeIndex], + seq_lengths: List[int], + freq: Union[str, pd.DateOffset], + time_feature_transform: List[TimeFeature]) -> Dict[pd.DatetimeIndex, np.ndarray]: """ compute the max 
series length for each start_time and compute their corresponding time_features. As lots of series in a dataset share the same start time, we could only compute the features for longest possible series @@ -698,14 +632,14 @@ def compute_time_features(self, try: date_info = pd.date_range(start=start_t, periods=max_l, - freq=self.freq) + freq=freq) series_time_features[start_t] = np.vstack( [transform(date_info).to_numpy(float) if not isinstance(transform, ConstantTransform) else transform(date_info) - for transform in self.time_feature_transform] + for transform in time_feature_transform] ).T except OutOfBoundsDatetime as e: - series_time_features[start_t] = np.zeros([max_l, len(self.time_feature_transform)]) + series_time_features[start_t] = np.zeros([max_l, len(time_feature_transform)]) return series_time_features def _get_dataset_indices(self, idx: int, only_dataset_idx: bool = False) -> Union[int, Tuple[int, int]]: @@ -750,14 +684,85 @@ def get_test_target(self, test_indices: np.ndarray) -> np.ndarray: return y_test.reshape([-1, self.num_target]) - def make_sequences_datasets(self, - X: pd.DataFrame, + def transform_data_into_time_series_sequence(self, + X: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]], + Y: Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]], + start_times: List[pd.DatetimeIndex], + X_test: Optional[ + Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, + Y_test: Optional[ + Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, + is_test_set: bool = False): + """ + Transform the raw data into a list of TimeSeriesSequence that can be processed by AutoPyTorch Time Series + build a series time sequence datasets + Args: + X: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] + features, if is_test_set is True, then its length of + Y: pd.DataFrame (N_all, N_target) + flattened train target array with size N_all (the sum of all the series sequences) and number of targets + start_times: List[pd.DatetimeIndex] + start time of each training series + time_features_train: Dict[pd.Timestamp, np.ndarray]: + time features for each possible start training times + X_test: Optional[np.ndarray (N_all_test, N_feature)] + flattened test feature array with size N_all_test (the sum of all the series sequences) and N_feature, + number of features + Y_test: np.ndarray (N_all_test, N_target) + flattened test target array with size N_all (the sum of all the series sequences) and number of targets + is_test_set: Optional[List[pd.DatetimeIndex]] + if the genereated sequecne used for test + sequences_kwargs: Dict + additional arguments for test sets + Returns: + sequence_datasets : List[TimeSeriesSequence] + a list of datasets + train_tensors: Tuple[List[np.ndarray], List[np.ndarray]] + training tensors + """ + dataset_with_future_features = is_test_set and X is not None and len(self.known_future_features) > 0 + if dataset_with_future_features and X_test is None: + raise ValueError('When constructing test sets and known future features exist, X_test must be given!') + X, Y, sequence_lengths = self.validator.transform(X, Y) + time_features = self.compute_time_features(start_times, + sequence_lengths, + self.freq, + self.time_feature_transform) + + if Y_test is not None: + X_test, Y_test, _ = self.validator.transform(X_test, Y_test, + validate_for_future_features=dataset_with_future_features) + + y_groups = Y.groupby(Y.index) + if self.normalize_y: + mean = y_groups.agg("mean") + std = y_groups.agg("std") + std[std == 0] = 1. 
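The per-series target scaling above groups the flattened target frame by its series index and normalises each series with its own statistics; a minimal pandas sketch with invented values:

    import pandas as pd

    # Flattened targets of two series; the index encodes which series a row belongs to.
    Y = pd.DataFrame({'target': [1.0, 2.0, 3.0, 10.0, 10.0, 10.0]}, index=[0, 0, 0, 1, 1, 1])

    y_groups = Y.groupby(Y.index)
    mean = y_groups.agg("mean")
    std = y_groups.agg("std")
    std[std == 0] = 1.  # a constant series would otherwise divide by zero

    # Align the statistics back to the rows of each series before normalising.
    Y_normalised = (Y - mean.loc[Y.index].values) / std.loc[Y.index].values
    print(Y_normalised)  # series 0 becomes [-1, 0, 1], the constant series 1 becomes [0, 0, 0]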
+ Y = (Y[mean] - mean) / std + self.y_mean = mean + self.y_std = std + if Y_test is not None: + Y_test = (Y_test[mean.columns] - mean) / std + + sequence_datasets, train_tensors = self.make_sequences_datasets( + X=X, Y=Y, + X_test=X_test, Y_test=Y_test, + start_times=start_times, + time_features=time_features, + is_test_set=is_test_set, + dataset_with_future_features=dataset_with_future_features, + **self.sequences_builder_kwargs) + return sequence_datasets, train_tensors + + @staticmethod + def make_sequences_datasets(X: pd.DataFrame, Y: pd.DataFrame, - start_times_train: List[pd.DatetimeIndex], - time_features_train: Optional[Dict[pd.Timestamp, np.ndarray]] = None, + start_times: List[pd.DatetimeIndex], + time_features: Optional[Dict[pd.Timestamp, np.ndarray]] = None, X_test: Optional[pd.DataFrame] = None, Y_test: Optional[pd.DataFrame] = None, - start_times_test: Optional[List[pd.DatetimeIndex]] = None, + is_test_set: bool = False, + dataset_with_future_features: bool = False, **sequences_kwargs: Optional[Dict]) -> Tuple[ List[TimeSeriesSequence], Tuple[Optional[pd.DataFrame], pd.DataFrame] @@ -770,24 +775,24 @@ def make_sequences_datasets(self, number of features, X's index should contain the information identifying its series number Y: pd.DataFrame (N_all, N_target) flattened train target array with size N_all (the sum of all the series sequences) and number of targets - start_times_train: List[pd.DatetimeIndex] + start_times: List[pd.DatetimeIndex] start time of each training series - time_features_train: Dict[pd.Timestamp, np.ndarray]: + time_features: Dict[pd.Timestamp, np.ndarray]: time features for each possible start training times X_test: Optional[np.ndarray (N_all_test, N_feature)] flattened test feature array with size N_all_test (the sum of all the series sequences) and N_feature, number of features Y_test: np.ndarray (N_all_test, N_target) flattened test target array with size N_all (the sum of all the series sequences) and number of targets - start_times_test: Optional[List[pd.DatetimeIndex]] - start time for each test series - time_features_test:Optional[Dict[pd.Timestamp, np.ndarray]] - time features for each possible start test times. 
+ is_test_set (bool): + if the generated sequence used for test + dataset_with_future_features (bool): + if we want to create a dataset with future features (that contained in X) sequences_kwargs: Dict additional arguments for test sets Returns: sequence_datasets : List[TimeSeriesSequence] - a + a list of datasets train_tensors: Tuple[List[np.ndarray], List[np.ndarray]] training tensors @@ -802,7 +807,7 @@ def make_sequences_datasets(self, if X_test is not None: x_test_group = X_test.groupby(X_test.index) - for i_ser, (start_train, y) in enumerate(zip(start_times_train, y_group)): + for i_ser, (start_train, y) in enumerate(zip(start_times, y_group)): ser_id = y[0] y_ser = y[1].transform(np.array).values x_ser = x_group.get_group(ser_id).transform(np.array).values if X is not None else None @@ -810,7 +815,9 @@ def make_sequences_datasets(self, y_test_ser = y_test_group.get_group(ser_id).transform(np.array).values if Y_test is not None else None x_test_ser = x_test_group.get_group(ser_id).transform(np.array).values if X_test is not None else None - start_test = None if start_times_test is None else start_times_test[i_ser] + if dataset_with_future_features: + x_ser = np.concatenate([x_ser, x_test_ser]) + x_test_ser = None sequence = TimeSeriesSequence( X=x_ser, @@ -818,7 +825,8 @@ def make_sequences_datasets(self, start_time_train=start_train, X_test=x_test_ser, Y_test=y_test_ser, - time_features=time_features_train[start_train][:len(y_ser)], + time_features=time_features[start_train][:len(y_ser)], + is_test_set=is_test_set, **sequences_kwargs) sequence_datasets.append(sequence) @@ -959,6 +967,106 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> else self.train_tensors[0].isnull().values.any()}) return dataset_properties + @staticmethod + def get_split_strategy(sequence_lengths: List[int], + n_prediction_steps: int, + freq_value: Real, + resampling_strategy: Optional[Union[ + CrossValTypes, HoldoutValTypes]] = HoldoutValTypes.time_series_hold_out_validation, + resampling_strategy_args: Optional[Dict[str, Any]] = None, ) -> \ + Tuple[Union[CrossValTypes, HoldoutValTypes], Optional[Dict[str, Any]]]: + """ + Determines the most possible sampling strategy for the datasets: the lengths of each sequence might not be long + enough to support cross-validation split, thus we need to carefully compute the number of folds + Args: + sequence_lengths (List[int]): lengths of each sequence + n_prediction_steps (int): forecasting horizon + freq_value (Real): period of the dataset, determined by its sampling frequency + resampling_strategy(Optional[Union[CrossValTypes, HoldoutValTypes]]): resampling strategy to be checked + resampling_strategy_args (Optional[Dict[str, Any]]): resampling strategy arguments to be checked + Returns: + resampling_strategy(Optional[Union[CrossValTypes, HoldoutValTypes]]): resampling strategy + resampling_strategy_args (Optional[Dict[str, Any]]): resampling strategy arguments + """ + # check if dataset could be split with cross validation + minimal_seq_length = np.min(sequence_lengths) - n_prediction_steps + if isinstance(resampling_strategy, CrossValTypes): + num_splits = DEFAULT_RESAMPLING_PARAMETERS[resampling_strategy].get( + 'num_splits', None) + if resampling_strategy_args is not None: + num_splits = resampling_strategy_args.get('num_split', num_splits) + + if resampling_strategy != CrossValTypes.time_series_ts_cross_validation: + while minimal_seq_length - n_prediction_steps * num_splits <= 0: + num_splits -= 1 + + if num_splits >= 2: + 
resampling_strategy = CrossValTypes.time_series_cross_validation + if resampling_strategy_args is None: + resampling_strategy_args = {'num_splits': num_splits} + else: + resampling_strategy_args.update({'num_splits': num_splits}) + else: + warnings.warn('The dataset is not suitable for cross validation, we will apply holdout instead') + + resampling_strategy = HoldoutValTypes.time_series_hold_out_validation + resampling_strategy_args = None + else: + seasonality_h_value = int( + np.round((n_prediction_steps // int(freq_value) + 1) * freq_value)) + + while minimal_seq_length < (num_splits - 1) * freq_value + seasonality_h_value - n_prediction_steps: + if num_splits <= 2: + break + num_splits -= 1 + if resampling_strategy_args is None: + resampling_strategy_args = {'num_splits': num_splits} + else: + resampling_strategy_args.update({'num_splits': num_splits}) + + num_seqs = len(sequence_lengths) + + if resampling_strategy_args is not None and "n_repeat" not in resampling_strategy_args: + n_repeat = resampling_strategy_args["n_repeat"] + else: + n_repeat = None + if (num_seqs < 100 and minimal_seq_length > 10 * n_prediction_steps) or \ + minimal_seq_length > 50 * n_prediction_steps: + if n_repeat is None: + if num_seqs < 100: + n_repeat = int(np.ceil(100.0 / num_seqs)) + else: + n_repeat = int(np.round(minimal_seq_length / (50 * n_prediction_steps))) + + if resampling_strategy == CrossValTypes.time_series_cross_validation: + n_repeat = min(n_repeat, minimal_seq_length // (5 * n_prediction_steps * num_splits)) + elif resampling_strategy == CrossValTypes.time_series_ts_cross_validation: + seasonality_h_value = int(np.round( + (n_prediction_steps * n_repeat // int(freq_value) + 1) * freq_value) + ) + + while minimal_seq_length // 5 < (num_splits - 1) * n_repeat * freq_value \ + + seasonality_h_value - n_repeat * n_prediction_steps: + n_repeat -= 1 + seasonality_h_value = int(np.round( + (n_prediction_steps * n_repeat // int(freq_value) + 1) * freq_value) + ) + elif resampling_strategy == HoldoutValTypes.time_series_hold_out_validation: + n_repeat = min(n_repeat, minimal_seq_length // (5 * n_prediction_steps) - 1) + + else: + n_repeat = 1 + + n_repeat = max(n_repeat, 1) + if n_repeat is None: + n_repeat = 1 + + if resampling_strategy_args is None: + resampling_strategy_args = {'n_repeat': n_repeat} + else: + resampling_strategy_args.update({'n_repeat': n_repeat}) + return resampling_strategy, resampling_strategy_args + def create_cross_val_splits( self, cross_val_type: CrossValTypes, @@ -1077,8 +1185,6 @@ def create_refit_split( Returns: (Tuple[np.ndarray, np.ndarray]): Tuple containing (train_indices, val_indices) """ - kwargs = {"n_prediction_steps": self.n_prediction_steps} - splits = [[() for _ in range(len(self.datasets))] for _ in range(2)] idx_start = 0 for idx_seq, dataset in enumerate(self.datasets): @@ -1103,7 +1209,6 @@ def generate_test_seqs(self) -> List[TimeSeriesSequence]: test_sets = copy.deepcopy(self.datasets) for test_seq in test_sets: test_seq.is_test_set = True - test_seq.only_has_past_targets = True if len(self.known_future_features) > 0 and test_seq.X_test is None: raise ValueError("If future features are required, X_test must be given!") test_seq.X = np.concatenate([test_seq.X, test_seq.X_test]) From 1a1fe68efb21c1e76cdafef614a34f9a4e017ee1 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 29 Apr 2022 14:43:17 +0200 Subject: [PATCH 231/347] maint --- autoPyTorch/api/time_series_forecasting.py | 2 +- .../data/time_series_forecasting_validator.py | 14 ++-- 
autoPyTorch/datasets/time_series_dataset.py | 74 ++++++------------- 3 files changed, 31 insertions(+), 59 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index dca66b3d3..915157838 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -465,7 +465,7 @@ def predict( """ if not isinstance(X_test[0], TimeSeriesSequence): # Validate and construct TimeSeriesSequence - X_test, _ = self.dataset.transform_data_into_time_series_sequence(X=X_test, + X_test, _, _ = self.dataset.transform_data_into_time_series_sequence(X=X_test, Y=past_targets, X_test=future_targets, start_times=start_times, diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 56b8b1cd1..130728c94 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -36,8 +36,7 @@ def __init__(self, self._is_uni_variant = False self.known_future_features = None self.n_prediction_steps = 1 - self.start_times_train = None - self.start_times_test = None + self.start_times = None self.feature_shapes: Dict[str, int] = {} self.feature_names: List[str] = [] self.series_idx = None @@ -49,7 +48,7 @@ def fit( series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, X_test: Optional[Union[List, pd.DataFrame]] = None, y_test: Optional[Union[List, pd.DataFrame]] = None, - start_times_train: Optional[List[pd.DatetimeIndex]] = None, + start_times: Optional[List[pd.DatetimeIndex]] = None, freq: str = '1Y', n_prediction_steps: int = 1, known_future_features: Optional[List[Union[int, str]]] = None, @@ -70,13 +69,12 @@ def fit( self.series_idx = series_idx self.n_prediction_steps = n_prediction_steps - if start_times_train is None: - start_times_train = [pd.DatetimeIndex(pd.to_datetime(['2000-01-01']), freq=freq)] * len(y_train) + if start_times is None: + start_times = [pd.DatetimeIndex(pd.to_datetime(['2000-01-01']), freq=freq)] * len(y_train) else: - assert len(start_times_train) == len(y_train), 'start_times_train must have the same length as y_train!' + assert len(start_times) == len(y_train), 'start_times_train must have the same length as y_train!' 
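When no start times are passed to the validator, every series is assigned the same placeholder start date, as in this minimal sketch (mirroring the default used above; n_series is illustrative):

    import pandas as pd

    freq = "1Y"
    n_series = 3
    start_times = [pd.DatetimeIndex(pd.to_datetime(["2000-01-01"]), freq=freq)] * n_series
    assert len(start_times) == n_series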
- - self.start_times_train = start_times_train + self.start_times = start_times if X_train is None: self._is_uni_variant = True diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 35f3d7ac0..81ef0ad32 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -209,7 +209,7 @@ def __getitem__(self, index: int, train: bool = True) \ # In case of prediction, the targets are not provided targets = self.Y - if self.only_has_past_targets: + if self.is_test_set: future_targets = None else: future_targets = targets[index + 1: index + self.n_prediction_steps + 1] @@ -239,7 +239,7 @@ def __getitem__(self, index: int, train: bool = True) \ 0]}, future_targets def __len__(self) -> int: - return self.Y.shape[0] if self.only_has_past_targets else self.Y.shape[0] - self.n_prediction_steps + return self.Y.shape[0] if self.is_test_set else self.Y.shape[0] - self.n_prediction_steps def compute_time_features(self, ): if self._cached_time_features is None: @@ -297,8 +297,8 @@ def update_transform(self, transform: Optional[torchvision.transforms.Compose], return self def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": - if self.only_has_past_targets: - raise ValueError("get_val_seq_set is not supported for the sequence that only has past targets!") + if self.is_test_set: + raise ValueError("get_val_seq_set is not supported for the test sequences!") if index < 0: index = self.__len__() + index if index == self.__len__() - 1: @@ -328,8 +328,8 @@ def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": time_features=cached_time_features) def get_test_target(self, test_idx: int): - if self.only_has_past_targets: - raise ValueError("get_test_target is not supported for the sequence that only has past targets!") + if self.is_test_set: + raise ValueError("get_test_target is not supported for test sequences!") if test_idx < 0: test_idx = self.__len__() + test_idx Y_future = self.Y[test_idx + 1: test_idx + self.n_prediction_steps + 1] @@ -351,8 +351,7 @@ def __init__(self, Y: Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]], X_test: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, Y_test: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, - start_times_train: Optional[List[pd.DatetimeIndex]] = None, - start_times_test: Optional[List[pd.DatetimeIndex]] = None, + start_times: Optional[List[pd.DatetimeIndex]] = None, known_future_features: Optional[Tuple[str]] = None, time_feature_transform: Optional[List[TimeFeature]] = None, freq: Optional[Union[str, int, List[int]]] = None, @@ -383,6 +382,7 @@ def __init__(self, header's configspace can be built beforehand. :param static_features: statistic features, invariant across different """ + # Preprocess time series data information assert X is not Y, "Training and Test data needs to belong two different object!!!" 
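A training sample of a TimeSeriesSequence is sliced so that everything up to the sampled index is observed history and the next n_prediction_steps values are the targets; test sequences (is_test_set=True) return no future targets. A rough sketch of just the target slicing (the real __getitem__ also handles features, transforms and cached time features):

    import numpy as np

    def slice_targets(y: np.ndarray, index: int, n_prediction_steps: int, is_test_set: bool):
        past_targets = y[:index + 1]
        if is_test_set:
            future_targets = None  # nothing to predict against at inference time
        else:
            future_targets = y[index + 1: index + n_prediction_steps + 1]
        return past_targets, future_targets

    past, future = slice_targets(np.arange(10), index=6, n_prediction_steps=3, is_test_set=False)
    # past -> [0 .. 6], future -> [7, 8, 9]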
if freq is None: @@ -415,12 +415,13 @@ def __init__(self, self.freq: Optional[str] = freq self.freq_value: Optional[int] = freq_value + self.n_prediction_steps = n_prediction_steps + self.dataset_name = dataset_name if self.dataset_name is None: self.dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) - - self.n_prediction_steps = n_prediction_steps + # Data Validation if validator is None: validator = TimeSeriesForecastingInputValidator(is_classification=False) self.validator: TimeSeriesForecastingInputValidator = validator @@ -431,7 +432,7 @@ def __init__(self, if not self.validator._is_fitted: self.validator.fit(X_train=X, y_train=Y, X_test=X_test, y_test=Y_test, - start_times_train=start_times_train, start_times_test=start_times_test, + start_times=start_times, start_times_test=start_times_test, n_prediction_steps=n_prediction_steps) self.is_uni_variant = self.validator._is_uni_variant @@ -447,8 +448,7 @@ def __init__(self, self.feature_shapes = self.validator.feature_shapes self.feature_names = tuple(self.validator.feature_names) - self.start_times_train = self.validator.start_times_train - self.start_times_test = self.validator.start_times_test + self.start_times = self.validator.start_times self.static_features = self.validator.feature_validator.static_features @@ -468,6 +468,7 @@ def __init__(self, self.train_transform = train_transforms self.val_transform = val_transforms + # Construct time series sequences if known_future_features is None: known_future_features = tuple() known_future_features_index = extract_feature_index(self.feature_shapes, @@ -483,44 +484,15 @@ def __init__(self, "sp": self.seasonality, "known_future_features_index": known_future_features_index} - X, Y, sequence_lengths = self.validator.transform(X, Y) - time_features_train = self.compute_time_features(self.start_times_train, - sequence_lengths, - self.freq, - self.time_feature_transform) - - if Y_test is not None: - X_test, Y_test, _ = self.validator.transform(X_test, Y_test) - - y_groups = Y.groupby(Y.index) - if normalize_y: - mean = y_groups.transform("mean") - std = y_groups.transform("std") - std[std == 0] = 1. - Y = (Y[mean.columns] - mean) / std - if Y_test is not None: - y_groups_test = Y_test.groupby(Y.index) - - mean = y_groups_test.transform("mean") - std = y_groups_test.transform("std") - std[std == 0] = 1. 
- Y_test = (Y_test[mean.columns] - mean) / std - - sequence_datasets, train_tensors = self.make_sequences_datasets( - X=X, Y=Y, - X_test=X_test, Y_test=Y_test, - start_times_train=self.start_times_train, - start_times_test=self.start_times_test, - time_features_train=time_features_train, - **self.sequences_builder_kwargs) - self.normalize_y = normalize_y - sequence_datasets, train_tensors = self.transform_data_into_time_series_sequence(X, - Y, X_test, - Y_test, + sequence_datasets, train_tensors, sequence_lengths = self.transform_data_into_time_series_sequence( + X, Y, + start_times=start_times, + X_test=X_test, + Y_test=Y_test, ) - self.normalize_y) + Y = train_tensors[1] ConcatDataset.__init__(self, datasets=sequence_datasets) @@ -548,6 +520,7 @@ def __init__(self, self.input_shape: Tuple[int, int] = (self.seq_length_min, self.num_features) + # process known future features if known_future_features is None: self.future_feature_shapes: Tuple[int, int] = (self.seq_length_min, 0) else: @@ -572,6 +545,7 @@ def __init__(self, # TODO: Look for a criteria to define small enough to preprocess self.is_small_preprocess = True + # dataset split self.task_type = TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING] self.numerical_features: List[int] = self.numerical_columns @@ -583,7 +557,7 @@ def __init__(self, resampling_strategy, resampling_strategy_args = self.get_split_strategy( sequence_lengths=sequence_lengths, n_prediction_steps=n_prediction_steps, - freq_value=self.freq_valueq, + freq_value=self.freq_value, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args ) @@ -752,7 +726,7 @@ def transform_data_into_time_series_sequence(self, is_test_set=is_test_set, dataset_with_future_features=dataset_with_future_features, **self.sequences_builder_kwargs) - return sequence_datasets, train_tensors + return sequence_datasets, train_tensors, sequence_lengths @staticmethod def make_sequences_datasets(X: pd.DataFrame, From f4ad355e50782abc6c15894f50ecc6618af0abb7 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 29 Apr 2022 17:22:23 +0200 Subject: [PATCH 232/347] maint --- autoPyTorch/datasets/time_series_dataset.py | 2 +- .../setup/network/forecasting_architecture.py | 20 +++-- .../base_forecasting_decoder.py | 8 +- .../base_forecasting_encoder.py | 5 +- .../flat_encoder/NBEATSEncoder.py | 18 ++++- .../flat_encoder/__init__.py | 77 ++++++++++++++++++- .../seq_encoder/TransformerEncoder.py | 1 - .../forecasting_base_trainer.py | 14 ++-- 8 files changed, 121 insertions(+), 24 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 81ef0ad32..e46e88e29 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -496,7 +496,7 @@ def __init__(self, ConcatDataset.__init__(self, datasets=sequence_datasets) - self.known_future_features = known_future_features + self.known_future_features = tuple(known_future_features) self.num_sequences = len(Y) self.sequence_lengths_train = np.asarray(sequence_lengths) - n_prediction_steps diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 907412f9f..40d84c6ad 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -443,11 +443,11 @@ def pre_processing(self, else: if past_features is not None: 
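Before being fed to the encoder, past targets and past features are truncated to the last window_size steps and concatenated along the feature axis (targets first, then features, matching the patched concatenation below). A shape-only sketch with made-up dimensions:

    import torch

    def build_encoder_input(x_past: torch.Tensor, past_features: torch.Tensor, window_size: int) -> torch.Tensor:
        # keep only the most recent `window_size` observations
        x_past = x_past[:, -window_size:]
        past_features = past_features[:, -window_size:]
        return torch.cat([x_past, past_features], dim=-1)

    x = build_encoder_input(torch.randn(8, 50, 1), torch.randn(8, 50, 3), window_size=20)
    assert x.shape == (8, 20, 4)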
past_features = past_features[:, -self.window_size:] - x_past = torch.cat([past_features, x_past], dim=-1) + x_past = torch.cat([x_past, past_features], dim=-1) - x_past = x_past.to(device=self.device) + x_past = self.embedding(x_past.to(device=self.device)) if future_features is not None: - future_features = future_features.to(self.device) + future_features = self.embedding_future(future_features.to(self.device)) return x_past, future_features, None, loc, scale, None, past_targets def forward(self, @@ -605,7 +605,7 @@ def forward(self, else: x_future = future_targets if future_features is None else torch.cat([future_features, future_targets], dim=-1) - x_future = x_future.to(self.device) + x_future = self.embedding_future(x_future.to(self.device)) encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) @@ -650,11 +650,14 @@ def forward(self, future_features=future_features[:, [idx_pred]] if future_features is not None else None ) else: - x_future = x_future if future_features is None else torch.cat([future_features[:, [idx_pred]], - x_future], + x_future = x_future if future_features is None else torch.cat([x_future, + future_features[:, [idx_pred]], + ], dim=-1) x_future = x_future.to(self.device) + x_future = self.embedding_future(x_future) + decoder_output = self.decoder(x_future, encoder_output=encoder2decoder, pos_idx=(x_past.shape[1] + idx_pred, x_past.shape[1] + idx_pred + 1), @@ -741,6 +744,8 @@ def forward(self, x_future = x_future.to(self.device) + x_future = self.embedding_future(x_future) + decoder_output = self.decoder(x_future, encoder_output=encoder2decoder, pos_idx=(x_past.shape[1] + idx_pred, x_past.shape[1] + idx_pred + 1), @@ -1035,6 +1040,9 @@ def forward(self, if repeated_time_feat is not None: x_next = torch.cat([repeated_time_feat[:, [k - 1]], x_next], dim=-1) x_next = x_next.to(self.device) + + x_next = self.embedding(x_next) + encoder2decoder, _ = self.encoder(encoder_input=x_next, additional_input=[None] * self.network_structure.num_blocks, output_seq=False, cache_intermediate_state=True, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 859c7deba..ef7461285 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -39,7 +39,8 @@ def __init__(self, def _required_fit_requirements(self) -> List[FitRequirement]: return [ FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), - FitRequirement('future_feature_shapes', (Tuple,), user_defined=False, dataset_property=True), + FitRequirement('known_future_features', (Tuple,), user_defined=False, dataset_property=True), + FitRequirement('feature_shapes', (Dict,), user_defined=False, dataset_property=True), FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), FitRequirement('network_encoder', (OrderedDict,), user_defined=False, dataset_property=False), FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False), @@ -81,7 +82,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: variable_selection = network_structure.variable_selection if 
'n_decoder_output_features' not in X: - future_feature_shapes = X['dataset_properties']['future_feature_shapes'] + future_features = X['dataset_properties']['known_future_features'] + feature_shapes = X['dataset_properties']['feature_shapes'] + future_in_features = sum([feature_shapes[fu_feat] for fu_feat in future_features]).item() if X['transform_time_features']: n_time_feature_transform = len(X['dataset_properties']['time_feature_transform']) @@ -91,7 +94,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: if self.block_number == network_structure.num_blocks: self.is_last_decoder = True - future_in_features = future_feature_shapes[-1] if variable_selection: future_in_features = X['network_encoder']['block_1'].encoder_output_shape[-1] else: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index fdbb88733..7701988e2 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -89,14 +89,15 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: in_features = self.n_encoder_output_feature() elif self.encoder_properties().lagged_input and hasattr(self, 'lagged_value'): in_features = len(self.lagged_value) * output_shape[-1] + \ - input_shape[-1] + n_time_feature_transform + input_shape[-1] else: - in_features = output_shape[-1] + input_shape[-1] + n_time_feature_transform + in_features = output_shape[-1] + input_shape[-1] input_shape = (X['window_size'], in_features) else: input_shape = X['encoder_output_shape'] + self.encoder = self.build_encoder( input_shape=input_shape, ) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py index 3e70a6f29..6fea17b70 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py @@ -7,6 +7,7 @@ from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( BaseForecastingEncoder, EncoderProperties ) +from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.utils.common import FitRequirement @@ -40,8 +41,23 @@ def _required_fit_arguments(self) -> List[FitRequirement]: return requirements_list def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + self.check_requirements(X, y) self.window_size = X["window_size"] - return super().fit(X, y) + + input_shape = X["dataset_properties"]['input_shape'] + # n-BEATS only requires targets as its input + # TODO add support for multi-variant + output_shape = X["dataset_properties"]['output_shape'] + + self.encoder = self.build_encoder( + input_shape=output_shape, + ) + + 
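The decoder's future input width is simply the sum of the shapes of the features that are known in the future; for example (feature names are illustrative):

    feature_shapes = {"price": 1, "promo": 1, "weekday_onehot": 7}
    known_future_features = ("price", "weekday_onehot")

    future_in_features = sum(feature_shapes[name] for name in known_future_features)
    assert future_in_features == 8  # decoder input features per forecasting step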
self.input_shape = [self.window_size, output_shape[-1]] + + has_hidden_states = self.encoder_properties().has_hidden_states + self.encoder_output_shape = get_output_shape(self.encoder, self.input_shape, has_hidden_states) + return self def n_encoder_output_feature(self): # THIS function should never be called!!! diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py index 01c0ddd82..22f5e70a1 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py @@ -4,13 +4,14 @@ import os from collections import OrderedDict -from typing import Dict, Union, Optional +from typing import Dict, Union, Optional, List, Type from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, autoPyTorchComponent, find_components, ) +from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ base_forecasting_encoder import BaseForecastingEncoder @@ -27,6 +28,80 @@ def add_encoder(encoder: BaseForecastingEncoder) -> None: class FlatForecastingEncoderChoice(AbstractForecastingEncoderChoice): + def get_available_components( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + include: List[str] = None, + exclude: List[str] = None, + components: Optional[Dict[str, autoPyTorchComponent]] = None + ) -> Dict[str, Type[autoPyTorchComponent]]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics + of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of learning + rate backbones + + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + if components is None: + available_comp = self.get_components() + else: + available_comp = components + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == NetworkBackboneChoice or hasattr(entry, 'get_components'): + continue + + task_type = str(dataset_properties['task_type']) + properties = entry.get_properties() + if 'tabular' in task_type and not bool(properties['handles_tabular']): + continue + elif 'image' in task_type and not bool(properties['handles_image']): + 
continue + elif 'time_series' in task_type and not bool(properties['handles_time_series']): + continue + + # target_type = dataset_properties['target_type'] + # Apply some automatic filtering here for + # backbones based on the dataset! + # TODO: Think if there is any case where a backbone + # is not recommended for a certain dataset + + components_dict[name] = entry + + return components_dict + + def get_components(self) -> Dict[str, autoPyTorchComponent]: """Returns the available backbone components diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index 1a7ecd296..bbcfbff08 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -20,7 +20,6 @@ PositionalEncoding, build_transformer_layers - class _TransformerEncoder(EncoderNetwork): def __init__(self, in_features: int, diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 40233ce64..321531afd 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -10,7 +10,6 @@ from torch.optim.lr_scheduler import _LRScheduler from torch.utils.tensorboard.writer import SummaryWriter - from autoPyTorch.constants import REGRESSION_TASKS, FORECASTING_TASKS from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetNoScaler import TargetNoScaler @@ -19,7 +18,6 @@ NBEATSNet, ForecastingSeq2SeqNet from autoPyTorch.pipeline.components.training.losses import MASELoss - from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker @@ -87,8 +85,6 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, float: training loss Dict[str, float]: scores for each desired metric """ - import time - time_start = time.time() loss_sum = 0.0 N = 0 self.model.train() @@ -119,8 +115,6 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, self._scheduler_step(step_interval=StepIntervalUnit.epoch, loss=loss_sum / N) - print(f'Time Used for training: {time.time() - time_start}') - if self.metrics_during_training: return loss_sum / N, self.compute_metrics(outputs_data, targets_data) else: @@ -182,10 +176,10 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: Dict[str, to loss_func_backcast = self.criterion_preparation(**criterion_kwargs_past) loss_func_forecast = self.criterion_preparation(**criterion_kwargs_future) - loss_backcast = loss_func_backcast(self.criterion, backcast) - loss_forecast = loss_func_forecast(self.criterion, forecast) + loss_backcast = loss_func_backcast(self.criterion, backcast) * past_observed_targets + loss_forecast = loss_func_forecast(self.criterion, forecast) * future_observed_targets - loss 
= loss_forecast + loss_backcast * self.backcast_loss_ratio + loss = loss_forecast.mean() + loss_backcast.mean() * self.backcast_loss_ratio outputs = forecast else: @@ -198,6 +192,8 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: Dict[str, to else: all_targets = torch.cat([past_target[:, 1 - self.window_size:, ], future_targets_values], dim=1) + future_observed_targets = torch.cat([past_observed_targets[:, 1 - self.window_size:, ], + future_observed_targets], dim=1) past_target, criterion_kwargs = self.data_preparation(past_target, all_targets.to(self.device)) else: past_target, criterion_kwargs = self.data_preparation(past_target, From 79ef7a7391fb7021ee45d177e2792762b8538a44 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 29 Apr 2022 19:35:40 +0200 Subject: [PATCH 233/347] flake8 --- autoPyTorch/constants.py | 16 +- autoPyTorch/constants_forecasting.py | 1 - .../data/time_series_feature_validator.py | 4 +- .../data/time_series_forecasting_validator.py | 2 - .../data/time_series_target_validator.py | 5 +- autoPyTorch/datasets/time_series_dataset.py | 14 +- autoPyTorch/evaluation/abstract_evaluator.py | 7 +- ...time_series_forecasting_train_evaluator.py | 17 +- autoPyTorch/evaluation/train_evaluator.py | 50 ++- autoPyTorch/optimizer/smbo.py | 3 +- .../base_time_series_preprocessing.py | 2 +- .../encoding/OneHotEncoder.py | 2 +- .../encoding/__init__.py | 7 +- .../encoding/time_series_base_encoder.py | 1 - .../imputation/TimeSeriesImputer.py | 34 +- .../imputation/base_time_series_imputer.py | 2 +- .../scaling/base_scaler.py | 1 + .../scaling/utils.py | 6 +- .../time_series_preprocessing/utils.py | 3 +- .../pipeline/components/training/losses.py | 16 +- .../components/training/trainer/__init__.py | 3 +- .../training/trainer/base_trainer.py | 42 ++- .../ForecastingMixUpTrainer.py | 2 +- .../ForecastingStandardTrainer.py | 2 +- .../trainer/forecasting_trainer/__init__.py | 5 +- .../pipeline/time_series_classification.py | 279 ---------------- .../pipeline/time_series_forecasting.py | 3 +- .../pipeline/time_series_regression.py | 224 ------------- .../utils/forecasting_time_features.py | 8 - autoPyTorch/utils/pipeline.py | 16 +- .../test_time_series_data_loader.py | 43 --- .../test_time_series_scaler_choice.py | 47 --- .../components/test_time_series_scalers.py | 152 --------- .../test_time_series_transformer.py | 30 -- .../test_pipeline/components/training/base.py | 29 -- .../test_time_series_classification.py | 316 ------------------ .../test_time_series_regression.py | 295 ---------------- 37 files changed, 106 insertions(+), 1583 deletions(-) delete mode 100644 autoPyTorch/pipeline/time_series_classification.py delete mode 100644 autoPyTorch/pipeline/time_series_regression.py delete mode 100644 test/test_pipeline/components/test_time_series_data_loader.py delete mode 100644 test/test_pipeline/components/test_time_series_scaler_choice.py delete mode 100644 test/test_pipeline/components/test_time_series_scalers.py delete mode 100644 test/test_pipeline/components/test_time_series_transformer.py delete mode 100644 test/test_pipeline/test_time_series_classification.py delete mode 100644 test/test_pipeline/test_time_series_regression.py diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py index 318203421..f074ffaee 100644 --- a/autoPyTorch/constants.py +++ b/autoPyTorch/constants.py @@ -2,17 +2,15 @@ IMAGE_CLASSIFICATION = 2 TABULAR_REGRESSION = 3 IMAGE_REGRESSION = 4 -TIMESERIES_CLASSIFICATION = 5 -TIMESERIES_REGRESSION = 6 -TIMESERIES_FORECASTING = 7 
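The N-BEATS style training step combines a forecast loss and a backcast loss, masking out unobserved (imputed) time steps before averaging and down-weighting the backcast term by backcast_loss_ratio. A simplified sketch of the combination (the element-wise losses themselves come from the criterion preparation helpers):

    import torch

    def combine_nbeats_losses(loss_forecast: torch.Tensor,
                              loss_backcast: torch.Tensor,
                              future_observed: torch.Tensor,
                              past_observed: torch.Tensor,
                              backcast_loss_ratio: float) -> torch.Tensor:
        loss_forecast = loss_forecast * future_observed  # zero out unobserved targets
        loss_backcast = loss_backcast * past_observed
        return loss_forecast.mean() + backcast_loss_ratio * loss_backcast.mean()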
+TIMESERIES_FORECASTING = 5 -REGRESSION_TASKS = [TABULAR_REGRESSION, IMAGE_REGRESSION, TIMESERIES_REGRESSION] -CLASSIFICATION_TASKS = [TABULAR_CLASSIFICATION, IMAGE_CLASSIFICATION, TIMESERIES_CLASSIFICATION] +REGRESSION_TASKS = [TABULAR_REGRESSION, IMAGE_REGRESSION] +CLASSIFICATION_TASKS = [TABULAR_CLASSIFICATION, IMAGE_CLASSIFICATION] FORECASTING_TASKS = [TIMESERIES_FORECASTING] # TODO extend FORECASTING TASKS to Classification and regression tasks TABULAR_TASKS = [TABULAR_CLASSIFICATION, TABULAR_REGRESSION] IMAGE_TASKS = [IMAGE_CLASSIFICATION, IMAGE_REGRESSION] -TIMESERIES_TASKS = [TIMESERIES_CLASSIFICATION, TIMESERIES_REGRESSION, TIMESERIES_FORECASTING] +TIMESERIES_TASKS = [TIMESERIES_FORECASTING] TASK_TYPES = REGRESSION_TASKS + CLASSIFICATION_TASKS + TIMESERIES_TASKS @@ -21,8 +19,6 @@ IMAGE_CLASSIFICATION: 'image_classification', TABULAR_REGRESSION: 'tabular_regression', IMAGE_REGRESSION: 'image_regression', - TIMESERIES_CLASSIFICATION: 'time_series_classification', - TIMESERIES_REGRESSION: 'time_series_regression', TIMESERIES_FORECASTING: 'time_series_forecasting'} STRING_TO_TASK_TYPES = \ @@ -30,8 +26,6 @@ 'image_classification': IMAGE_CLASSIFICATION, 'tabular_regression': TABULAR_REGRESSION, 'image_regression': IMAGE_REGRESSION, - 'time_series_classification': TIMESERIES_CLASSIFICATION, - 'time_series_regression': TIMESERIES_REGRESSION, 'time_series_forecasting': TIMESERIES_FORECASTING} # Output types have been defined as in scikit-learn type_of_target @@ -49,7 +43,7 @@ CONTINUOUSMULTIOUTPUT: 'continuous-multioutput', MULTICLASS: 'multiclass', CONTINUOUS: 'continuous', - MULTICLASSMULTIOUTPUT: 'multiclass-multioutput',} + MULTICLASSMULTIOUTPUT: 'multiclass-multioutput'} STRING_TO_OUTPUT_TYPES = \ {'binary': BINARY, diff --git a/autoPyTorch/constants_forecasting.py b/autoPyTorch/constants_forecasting.py index b5df96d3c..351286a63 100644 --- a/autoPyTorch/constants_forecasting.py +++ b/autoPyTorch/constants_forecasting.py @@ -17,4 +17,3 @@ } MAX_WINDOW_SIZE_BASE = 500 - diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index 40f70097b..07f79e2f2 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -1,11 +1,9 @@ import logging -from typing import Optional, Union, Tuple, Sequence +from typing import Optional, Union, Tuple import pandas as pd import numpy as np -import sklearn.utils from sklearn.base import BaseEstimator -from sklearn.exceptions import NotFittedError from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator, SupportedFeatTypes from autoPyTorch.utils.logging_ import PicklableClientLogger diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 130728c94..d66e9c4c3 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -1,6 +1,4 @@ # -*- encoding: utf-8 -*- -import logging -import warnings from typing import Optional, Tuple, List, Union, Dict import numpy as np import pandas as pd diff --git a/autoPyTorch/data/time_series_target_validator.py b/autoPyTorch/data/time_series_target_validator.py index 34c879f0f..ad61ae19d 100644 --- a/autoPyTorch/data/time_series_target_validator.py +++ b/autoPyTorch/data/time_series_target_validator.py @@ -3,17 +3,14 @@ import numpy as np import pandas as pd -from pandas.api.types import is_numeric_dtype from scipy.sparse import issparse, 
spmatrix import sklearn.utils -from sklearn import preprocessing -from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError from sklearn.utils.multiclass import type_of_target -from autoPyTorch.data.base_target_validator import BaseTargetValidator, SupportedTargetTypes +from autoPyTorch.data.base_target_validator import SupportedTargetTypes from autoPyTorch.utils.common import ispandas from autoPyTorch.data.tabular_target_validator import TabularTargetValidator, ArrayType diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index e46e88e29..00487adfa 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -13,7 +13,7 @@ from scipy.sparse import issparse import torch -from torch.utils.data.dataset import Dataset, Subset, ConcatDataset +from torch.utils.data.dataset import Dataset, ConcatDataset import torchvision.transforms @@ -38,11 +38,8 @@ TimeFeature, time_features_from_frequency_str, ) -from autoPyTorch.utils.forecasting_time_features import FREQUENCY_MAP from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ - TimeSeriesTransformer from autoPyTorch.utils.common import FitRequirement from autoPyTorch.constants_forecasting import SEASONALITY_MAP, MAX_WINDOW_SIZE_BASE from autoPyTorch.pipeline.components.training.metrics.metrics import compute_mase_coefficient @@ -166,8 +163,9 @@ def __getitem__(self, index: int, train: bool = True) \ past_features = self.X[:index + 1] if self.known_future_features_index: - future_features = self.X[index + 1: index + self.n_prediction_steps + 1, - self.known_future_features_index] + future_features = self.X[ + index + 1: index + self.n_prediction_steps + 1, self.known_future_features_index + ] else: future_features = None else: @@ -432,7 +430,7 @@ def __init__(self, if not self.validator._is_fitted: self.validator.fit(X_train=X, y_train=Y, X_test=X_test, y_test=Y_test, - start_times=start_times, start_times_test=start_times_test, + start_times=start_times, n_prediction_steps=n_prediction_steps) self.is_uni_variant = self.validator._is_uni_variant @@ -612,7 +610,7 @@ def compute_time_features(start_times: List[pd.DatetimeIndex], if not isinstance(transform, ConstantTransform) else transform(date_info) for transform in time_feature_transform] ).T - except OutOfBoundsDatetime as e: + except OutOfBoundsDatetime: series_time_features[start_t] = np.zeros([max_l, len(time_feature_transform)]) return series_time_features diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 667e2ec2c..af484c6d1 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -3,7 +3,6 @@ import warnings from multiprocessing.queues import Queue from typing import Any, Dict, List, Optional, Tuple, Union, no_type_check -from functools import partial from ConfigSpace import Configuration @@ -20,7 +19,6 @@ import autoPyTorch.pipeline.image_classification import autoPyTorch.pipeline.tabular_classification import autoPyTorch.pipeline.tabular_regression -import autoPyTorch.pipeline.time_series_classification import autoPyTorch.pipeline.traditional_tabular_classification import autoPyTorch.pipeline.time_series_forecasting import autoPyTorch.pipeline.traditional_tabular_regression @@ -327,7 +325,7 @@ def 
fit(self, X: Dict[str, Any], y: Any, sample_weight: Optional[np.ndarray] = None) -> object: self.n_prediction_steps = X['dataset_properties']['n_prediction_steps'] y_train = subsampler(X['y_train'], X['train_indices']) - return DummyClassifier.fit(self, np.ones((y_train.shape[0], 1)), y_train,sample_weight) + return DummyClassifier.fit(self, np.ones((y_train.shape[0], 1)), y_train, sample_weight) def _genreate_dummy_forecasting(self, X): if isinstance(X[0], TimeSeriesSequence): @@ -528,9 +526,6 @@ def __init__(self, backend: Backend, self.pipeline_class = autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline elif self.task_type in IMAGE_TASKS: self.pipeline_class = autoPyTorch.pipeline.image_classification.ImageClassificationPipeline - elif self.task_type in TIMESERIES_TASKS: - self.pipeline_class = \ - autoPyTorch.pipeline.time_series_classification.TimeSeriesClassificationPipeline else: raise ValueError('task {} not available'.format(self.task_type)) self.predict_function = self._predict_proba diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 7ee18b830..d238881a0 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -1,28 +1,20 @@ -from autoPyTorch.evaluation.train_evaluator import TrainEvaluator -from autoPyTorch.evaluation.abstract_evaluator import DummyClassificationPipeline - from multiprocessing.queues import Queue -from typing import Any, Dict, List, Optional, Tuple, Union, no_type_check, ClassVar, Sequence -from functools import partial -import warnings +from typing import Any, Dict, List, Optional, Tuple, Union, Sequence from ConfigSpace.configuration_space import Configuration import numpy as np -import pandas as pd from sklearn.base import BaseEstimator from smac.tae import StatusType +from autoPyTorch.evaluation.train_evaluator import TrainEvaluator from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.metrics import MASE_LOSSES from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.utils.common import subsampler from autoPyTorch.evaluation.abstract_evaluator import DummyTimeSeriesForecastingPipeline from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates -from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset -from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline from autoPyTorch.constants_forecasting import SEASONALITY_MAP @@ -141,7 +133,6 @@ def fit_predict_and_loss(self) -> None: ) else: - Y_train_pred: List[Optional[np.ndarray]] = [None] * self.num_folds Y_optimization_pred: List[Optional[np.ndarray]] = [None] * self.num_folds Y_valid_pred: List[Optional[np.ndarray]] = [None] * self.num_folds Y_test_pred: List[Optional[np.ndarray]] = [None] * self.num_folds @@ -149,8 +140,8 @@ def fit_predict_and_loss(self) -> None: self.pipelines = [self._get_pipeline() for _ in range(self.num_folds)] - # stores train loss of each fold. - train_losses = [np.NaN] * self.num_folds + # Train losses is not applied here as it might become too expensive + # used as weights when averaging train losses. train_fold_weights = [np.NaN] * self.num_folds # stores opt (validation) loss of each fold. 
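The MASE-based metrics used by this evaluator scale forecast errors by the in-sample error of a seasonal-naive forecast. The standard scaling coefficient looks roughly like the following (a textbook sketch, not the exact compute_mase_coefficient helper):

    import numpy as np

    def mase_coefficient(y_train: np.ndarray, sp: int) -> float:
        # inverse of the mean absolute error of the seasonal-naive forecast
        naive_errors = np.abs(y_train[sp:] - y_train[:-sp])
        denom = naive_errors.mean()
        return 1.0 / denom if denom > 0 else 1.0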
diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index 907cf4fbf..cff9a2776 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -1,7 +1,5 @@ from multiprocessing.queues import Queue -from typing import Any, Dict, List, Optional, Tuple, Union, no_type_check, ClassVar - -import warnings +from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration @@ -26,7 +24,6 @@ from autoPyTorch.utils.common import dict_repr, subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates - __all__ = ['TrainEvaluator', 'eval_train_function'] @@ -118,6 +115,7 @@ class TrainEvaluator(AbstractEvaluator): search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): An object used to fine tune the hyperparameter search space of the pipeline """ + def __init__(self, backend: Backend, queue: Queue, metric: autoPyTorchMetric, budget: float, @@ -234,7 +232,6 @@ def fit_predict_and_loss(self) -> None: additional_run_info = {} for i, (train_split, test_split) in enumerate(self.splits): - pipeline = self.pipelines[i] train_pred, opt_pred, valid_pred, test_pred = self._fit_and_predict(pipeline, i, train_indices=train_split, @@ -393,8 +390,7 @@ def _predict(self, pipeline: BaseEstimator, self.y_train[train_indices]) opt_pred = self.predict_function(subsampler(self.X_train, test_indices), pipeline, - self.y_train[train_indices]) - + self.y_train[train_indices]) if self.X_valid is not None: valid_pred = self.predict_function(self.X_valid, pipeline, @@ -413,26 +409,26 @@ def _predict(self, pipeline: BaseEstimator, # create closure for evaluating an algorithm def eval_train_function( - backend: Backend, - queue: Queue, - metric: autoPyTorchMetric, - budget: float, - config: Optional[Configuration], - seed: int, - output_y_hat_optimization: bool, - num_run: int, - include: Optional[Dict[str, Any]], - exclude: Optional[Dict[str, Any]], - disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, - pipeline_config: Optional[Dict[str, Any]] = None, - budget_type: str = None, - init_params: Optional[Dict[str, Any]] = None, - logger_port: Optional[int] = None, - all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - instance: str = None, - evaluator_class: Optional[AbstractEvaluator] = None, - **evaluator_kwargs, + backend: Backend, + queue: Queue, + metric: autoPyTorchMetric, + budget: float, + config: Optional[Configuration], + seed: int, + output_y_hat_optimization: bool, + num_run: int, + include: Optional[Dict[str, Any]], + exclude: Optional[Dict[str, Any]], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + pipeline_config: Optional[Dict[str, Any]] = None, + budget_type: str = None, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + instance: str = None, + evaluator_class: Optional[AbstractEvaluator] = None, + **evaluator_kwargs, ) -> None: """ This closure allows the communication between the ExecuteTaFuncWithQueue and the diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 666af4374..73e988584 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -267,7 +267,7 @@ def __init__(self, 
initial_configurations = [] if portfolio_selection is not None: initial_configurations = read_return_initial_configurations(config_space=config_space, - portfolio_selection=portfolio_selection) + portfolio_selection=portfolio_selection) suggested_init_models: Optional[List[str]] = kwargs.get('suggested_init_models', None) custom_init_setting_path: Optional[str] = kwargs.get('custom_init_setting_path', None) @@ -396,7 +396,6 @@ def run_smbo(self, func: Optional[Callable] = None ta_kwargs['max_budget'] = self.max_budget ta_kwargs['min_num_test_instances'] = self.min_num_test_instances - if self.get_smac_object_callback is not None: smac = self.get_smac_object_callback(scenario_dict=scenario_dict, seed=seed, diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py index cb688891c..94f37c27b 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py @@ -73,4 +73,4 @@ def get_preprocessor_dict(self) -> Dict[str, BaseEstimator]: def __str__(self) -> str: """ Allow a nice understanding of what components where used """ string = self.__class__.__name__ - return string \ No newline at end of file + return string diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py index fd4e109cf..807a6ca19 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, Optional, Union import numpy as np diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/__init__.py index 3f71ddc81..884ea0df8 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/__init__.py @@ -1,12 +1,7 @@ import os from collections import OrderedDict -from typing import Dict, List, Optional +from typing import Dict -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, autoPyTorchComponent, diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py index 3b2d29b84..8299ab62a 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py @@ -32,4 +32,3 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ 
X['dataset_properties'].update({'feature_shapes': self.feature_shapes}) return BaseEncoder.transform(self, X) - diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py index 2d0db6d7e..b81978db1 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py @@ -6,7 +6,7 @@ from ConfigSpace import ConfigurationSpace from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.\ +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation. \ base_time_series_imputer import BaseTimeSeriesImputer from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( @@ -19,8 +19,8 @@ class TimeSeriesFeatureImputer(BaseTimeSeriesImputer, autoPyTorchTimeSeriesPreprocessingComponent): def __init__(self, - random_state: Optional[np.random.RandomState] = None, - imputation_strategy: str = 'mean'): + random_state: Optional[np.random.RandomState] = None, + imputation_strategy: str = 'mean'): super().__init__() self.random_state = random_state self.imputation_strategy = imputation_strategy @@ -71,12 +71,12 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @staticmethod def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - imputation_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter='imputation_strategy', - value_range=("drift", "linear", "nearest", "constant_zero", "mean", "median", "bfill", "ffill"), - default_value="drift", - ), + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + imputation_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='imputation_strategy', + value_range=("drift", "linear", "nearest", "constant_zero", "mean", "median", "bfill", "ffill"), + default_value="drift", + ), ) -> ConfigurationSpace: if dataset_properties.get('features_have_missing_values', False): cs = super().get_hyperparameter_search_space(dataset_properties, imputation_strategy) @@ -87,8 +87,8 @@ def get_hyperparameter_search_space( class TimeSeriesTargetImputer(BaseTimeSeriesImputer, autoPyTorchTimeSeriesTargetPreprocessingComponent): def __init__(self, - random_state: Optional[np.random.RandomState] = None, - imputation_strategy: str = 'mean',): + random_state: Optional[np.random.RandomState] = None, + imputation_strategy: str = 'mean', ): super().__init__() self.random_state = random_state self.imputation_strategy = imputation_strategy @@ -136,12 +136,12 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @staticmethod def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - imputation_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter='imputation_strategy', - value_range=("linear", "nearest", "constant_zero", "bfill", "ffill"), - default_value="linear", - ), + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + imputation_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='imputation_strategy', + value_range=("linear", 
"nearest", "constant_zero", "bfill", "ffill"), + default_value="linear", + ), ) -> ConfigurationSpace: """ Time series imputor, for the sake of speed, we only allow local imputation here (i.e., the filled value only diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/base_time_series_imputer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/base_time_series_imputer.py index 9550b534e..883e6af56 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/base_time_series_imputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/base_time_series_imputer.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py index 2b3bb7905..aad545a3b 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py @@ -13,6 +13,7 @@ from autoPyTorch.utils.common import FitRequirement from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import TimeSeriesScaler + class BaseScaler(autoPyTorchTimeSeriesPreprocessingComponent): """ Provides abstract class interface for time series scalers in AutoPytorch diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index 62389bd0c..9b383f563 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -1,10 +1,8 @@ -from typing import Any, List, Callable, Optional, Union, Tuple +from typing import Any, Union, Tuple import numpy as np import pandas as pd -from pandas.core.groupby.generic import DataFrameGroupBy -import sklearn from sklearn.base import BaseEstimator @@ -112,5 +110,3 @@ def transform(self, X: pd.DataFrame) -> Tuple[np.ndarray, ...]: return X else: raise ValueError(f"Unknown mode {self.mode} for time series scaler") - - diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py index e2f64fc75..e7e15ad8e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py @@ -2,6 +2,7 @@ from sklearn.base import BaseEstimator + def get_time_series_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstimator]]: """ Expects fit_dictionary(X) to have numerical/categorical preprocessors @@ -49,4 +50,4 @@ def get_time_series_target_preprocessers(X: Dict[str, Any]) -> Dict[str, List[Ba preprocessor['target_numerical'].append(value['target_numerical']) if 'target_categorical' in value and isinstance(value['target_categorical'], BaseEstimator): preprocessor['target_categorical'].append(value['target_categorical']) - return preprocessor \ No newline at end of 
file + return preprocessor diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index a241eaa7a..c6cf50736 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -47,10 +47,10 @@ class MAPELoss(Loss): def __init__(self, reduction: str = 'mean') -> None: super(MAPELoss, self).__init__(reduction=reduction) - def forward(self, input: torch.distributions.Distribution, target_tensor: torch.Tensor) -> torch.Tensor: + def forward(self, predictions: torch.distributions.Distribution, target_tensor: torch.Tensor) -> torch.Tensor: # https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/model/n_beats/_network.py denominator = torch.abs(target_tensor) - diff = torch.abs(input - target_tensor) + diff = torch.abs(predictions - target_tensor) flag = (denominator == 0).float() @@ -78,9 +78,9 @@ def set_mase_coefficient(self, mase_coefficient: torch.Tensor) -> 'MASELoss': return self def forward(self, - input: torch.distributions.Distribution, + predictions: torch.distributions.Distribution, target_tensor: torch.Tensor) -> torch.Tensor: - loss = torch.abs(input - target_tensor) * self._mase_coefficient + loss = torch.abs(predictions - target_tensor) * self._mase_coefficient if self.reduction == 'mean': return loss.mean() elif self.reduction == 'sum': @@ -96,15 +96,15 @@ def __init__(self, reduction: str = 'mean', quantiles: List[float] = [0.5], loss super(QuantileLoss, self).__init__(reduction=reduction) self.quantiles = quantiles - def set_quantiles(self, quantiles = List[float]): + def set_quantiles(self, quantiles=List[float]): self.quantiles = quantiles def forward(self, - input: List[torch.Tensor], + predictions: List[torch.Tensor], target_tensor: torch.Tensor) -> torch.Tensor: - assert len(self.quantiles) == len(input) + assert len(self.quantiles) == len(predictions) losses_all = [] - for q, y_pred in zip(self.quantiles, input): + for q, y_pred in zip(self.quantiles, predictions): diff = target_tensor - y_pred loss_q = torch.max(q * diff, (q - 1) * diff) diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index 2370064fb..fdf73a357 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -205,8 +205,7 @@ def fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> autoPyTorchCom self.logger = get_named_client_logger( name=f"{X['num_run']}_{time.time()}", # Log to a user provided port else to the default logging port - port=X['logger_port' - ] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, + port=X['logger_port'] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, ) # Call the actual fit function. 
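For reference, the QuantileLoss.forward shown above expects `predictions` to be a list with one tensor per configured quantile. A minimal standalone sketch of the same pinball-loss computation, assuming hypothetical tensor shapes and quantile values that are not part of this patch:

import torch

quantiles = [0.1, 0.5, 0.9]
target = torch.randn(8, 12, 1)                               # hypothetical [batch, horizon, 1] targets
predictions = [torch.randn(8, 12, 1) for _ in quantiles]     # one forecast tensor per quantile

losses_all = []
for q, y_pred in zip(quantiles, predictions):
    diff = target - y_pred
    # pinball loss: weight q on under-prediction, (1 - q) on over-prediction
    losses_all.append(torch.max(q * diff, (q - 1) * diff).unsqueeze(-1))
loss = torch.cat(losses_all, dim=-1).mean()                  # 'mean' reduction, matching the default above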
diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py index 46300255a..a337a7f24 100644 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py @@ -12,7 +12,6 @@ from torch.optim.lr_scheduler import _LRScheduler from torch.utils.tensorboard.writer import SummaryWriter - from autoPyTorch.constants import REGRESSION_TASKS, FORECASTING_TASKS from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent @@ -61,10 +60,10 @@ def is_max_time_reached(self) -> bool: class RunSummary(object): def __init__( - self, - total_parameter_count: float, - trainable_parameter_count: float, - optimize_metric: Optional[str] = None, + self, + total_parameter_count: float, + trainable_parameter_count: float, + optimize_metric: Optional[str] = None, ): """ A useful object to track performance per epoch. @@ -125,7 +124,6 @@ def get_best_epoch(self, split_type: str = 'val') -> int: # metric to the loss if self.optimize_metric is not None: - metrics_type = f"{split_type}_metrics" if self.optimize_metric in CLASSIFICATION_METRICS: scorer = CLASSIFICATION_METRICS[self.optimize_metric] @@ -211,19 +209,19 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None self.weighted_loss: bool = False def prepare( - self, - metrics: List[Any], - model: torch.nn.Module, - criterion: Type[torch.nn.Module], - budget_tracker: BudgetTracker, - optimizer: Optimizer, - device: torch.device, - metrics_during_training: bool, - scheduler: _LRScheduler, - task_type: int, - labels: Union[np.ndarray, torch.Tensor, pd.DataFrame], - step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch, - **kwargs: Dict + self, + metrics: List[Any], + model: torch.nn.Module, + criterion: Type[torch.nn.Module], + budget_tracker: BudgetTracker, + optimizer: Optimizer, + device: torch.device, + metrics_during_training: bool, + scheduler: _LRScheduler, + task_type: int, + labels: Union[np.ndarray, torch.Tensor, pd.DataFrame], + step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch, + **kwargs: Dict ) -> None: # Save the device to be used @@ -276,9 +274,9 @@ def on_epoch_end(self, X: Dict[str, Any], epoch: int) -> bool: return False def _scheduler_step( - self, - step_interval: StepIntervalUnit, - loss: Optional[float] = None + self, + step_interval: StepIntervalUnit, + loss: Optional[float] = None ) -> None: if self.step_interval != step_interval: diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingMixUpTrainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingMixUpTrainer.py index 958f125f1..aaf3fe3d7 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingMixUpTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingMixUpTrainer.py @@ -14,4 +14,4 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT return { 'shortname': 'ForecastingMixUpTrainer', 'name': 'MixUp Regularized Trainer', - } \ No newline at end of file + } diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingStandardTrainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingStandardTrainer.py 
index 7f9baa0fc..81e3bc1b7 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingStandardTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingStandardTrainer.py @@ -14,4 +14,4 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT return { 'shortname': 'ForecastingStandardTrainer', 'name': 'Forecasting Standard Trainer', - } \ No newline at end of file + } diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index 2c29c0908..f5f1aef99 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -42,8 +42,8 @@ class ForecastingTrainerChoice(TrainerChoice): def _fit_requirements(self) -> Optional[List[FitRequirement]]: fit_requirements = super()._fit_requirements fit_requirements.extend([FitRequirement("target_scaler", (BaseTargetScaler,), - user_defined=False, dataset_property=False), - FitRequirement("window_size", (int,), user_defined=False, dataset_property=False)] + user_defined=False, dataset_property=False), + FitRequirement("window_size", (int,), user_defined=False, dataset_property=False)] ) return fit_requirements @@ -90,7 +90,6 @@ def prepare_trainer(self, X): backcast_loss_ratio=X.get('backcast_loss_ratio', 0.0) ) - def get_components(self) -> Dict[str, autoPyTorchComponent]: """Returns the available trainer components diff --git a/autoPyTorch/pipeline/time_series_classification.py b/autoPyTorch/pipeline/time_series_classification.py deleted file mode 100644 index 749aaabe1..000000000 --- a/autoPyTorch/pipeline/time_series_classification.py +++ /dev/null @@ -1,279 +0,0 @@ -import warnings -from typing import Any, Dict, List, Optional, Tuple, Union - -from ConfigSpace.configuration_space import Configuration, ConfigurationSpace - -import numpy as np - -import sklearn.preprocessing -from sklearn.base import ClassifierMixin - - -from autoPyTorch.pipeline.base_pipeline import BasePipeline -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( - TimeSeriesTransformer -) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import ( - EncoderChoice -) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing import ( - FeatureProprocessorChoice -) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice -from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing -from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice -from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent -from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_embedding import NetworkEmbeddingChoice -from autoPyTorch.pipeline.components.setup.network_head import 
NetworkHeadChoice -from autoPyTorch.pipeline.components.setup.network_initializer import NetworkInitializerChoice -from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice -from autoPyTorch.pipeline.components.training.data_loader.time_series_data_loader import TimeSeriesDataLoader -from autoPyTorch.pipeline.components.training.trainer import TrainerChoice -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates - - - -class TimeSeriesClassificationPipeline(ClassifierMixin, BasePipeline): - """This class is a proof of concept to integrate AutoSklearn Components - - It implements a pipeline, which includes as steps: - - ->One preprocessing step - ->One neural network - - Contrary to the sklearn API it is not possible to enumerate the - possible parameters in the __init__ function because we only know the - available classifiers at runtime. For this reason the user must - specifiy the parameters by passing an instance of - ConfigSpace.configuration_space.Configuration. - - - Args: - config (Configuration) - The configuration to evaluate. - random_state (Optional[RandomState): random_state is the random number generator - - Attributes: - Examples - """ - - def __init__( - self, - config: Optional[Configuration] = None, - steps: Optional[List[Tuple[str, Union[autoPyTorchComponent, autoPyTorchChoice]]]] = None, - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - include: Optional[Dict[str, Any]] = None, - exclude: Optional[Dict[str, Any]] = None, - random_state: Optional[np.random.RandomState] = None, - init_params: Optional[Dict[str, Any]] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None - ): - super().__init__( - config, steps, dataset_properties, include, exclude, - random_state, init_params, search_space_updates) - - def _predict_proba(self, X: np.ndarray) -> np.ndarray: - # Pre-process X - loader = self.named_steps['data_loader'].get_loader(X=X) - pred = self.named_steps['network'].predict(loader) - if isinstance(self.dataset_properties['output_shape'], int): - proba = pred[:, :self.dataset_properties['output_shape']] - normalizer = proba.sum(axis=1)[:, np.newaxis] - normalizer[normalizer == 0.0] = 1.0 - proba /= normalizer - - return proba - - else: - all_proba = [] - - for k in range(self.dataset_properties['output_shape']): - proba_k = pred[:, k, :self.dataset_properties['output_shape'][k]] - normalizer = proba_k.sum(axis=1)[:, np.newaxis] - normalizer[normalizer == 0.0] = 1.0 - proba_k /= normalizer - all_proba.append(proba_k) - - return all_proba - - def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: - """predict_proba. - - Args: - X (np.ndarray): input to the pipeline, from which to guess targets - batch_size (Optional[int]): batch_size controls whether the pipeline - will be called on small chunks of the data. Useful when calling the - predict method on the whole array X results in a MemoryError. 
- Returns: - np.ndarray: Probabilities of the target being certain class - """ - if batch_size is None: - y = self._predict_proba(X) - - else: - if not isinstance(batch_size, int): - raise ValueError("Argument 'batch_size' must be of type int, " - "but is '%s'" % type(batch_size)) - if batch_size <= 0: - raise ValueError("Argument 'batch_size' must be positive, " - "but is %d" % batch_size) - - else: - # Probe for the target array dimensions - target = self.predict_proba(X[0:2].copy()) - - y = np.zeros((X.shape[0], target.shape[1]), - dtype=np.float32) - - for k in range(max(1, int(np.ceil(float(X.shape[0]) / batch_size)))): - batch_from = k * batch_size - batch_to = min([(k + 1) * batch_size, X.shape[0]]) - pred_prob = self.predict_proba(X[batch_from:batch_to], batch_size=None) - y[batch_from:batch_to] = pred_prob.astype(np.float32) - - # Neural networks might not be fit to produce a [0-1] output - # For instance, after small number of epochs. - y = np.clip(y, 0, 1) - y = sklearn.preprocessing.normalize(y, axis=1, norm='l1') - - return y - - def _get_hyperparameter_search_space(self, - dataset_properties: Dict[str, Any], - include: Optional[Dict[str, Any]] = None, - exclude: Optional[Dict[str, Any]] = None, - ) -> ConfigurationSpace: - """Create the hyperparameter configuration space. - - For the given steps, and the Choices within that steps, - this procedure returns a configuration space object to - explore. - - Args: - include (Optional[Dict[str, Any]]): what hyper-parameter configurations - to honor when creating the configuration space - exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations - to remove from the configuration space - dataset_properties (Optional[Dict[str, Union[str, int]]]): Characteristics - of the dataset to guide the pipeline choices of components - - Returns: - cs (Configuration): The configuration space describing - the SimpleRegressionClassifier. - """ - cs = ConfigurationSpace() - - if dataset_properties is None or not isinstance(dataset_properties, dict): - if not isinstance(dataset_properties, dict): - warnings.warn('The given dataset_properties argument contains an illegal value.' - 'Proceeding with the default value') - dataset_properties = dict() - - if 'target_type' not in dataset_properties: - dataset_properties['target_type'] = 'time_series_classification' - if dataset_properties['target_type'] != 'time_series_classification': - warnings.warn('Tabular classification is being used, however the target_type' - 'is not given as "time_series_classification". Overriding it.') - dataset_properties['target_type'] = 'time_series_classification' - # get the base search space given this - # dataset properties. Then overwrite with custom - # classification requirements - cs = self._get_base_search_space( - cs=cs, dataset_properties=dataset_properties, - exclude=exclude, include=include, pipeline=self.steps) - - # Here we add custom code, like this with this - # is not a valid configuration - - self.configuration_space = cs - self.dataset_properties = dataset_properties - return cs - - def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], - ) -> List[Tuple[str, autoPyTorchChoice]]: - """ - Defines what steps a pipeline should follow. - The step itself has choices given via autoPyTorchChoice. - - Returns: - List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised - by the pipeline. 
- """ - steps = [] # type: List[Tuple[str, autoPyTorchChoice]] - - default_dataset_properties = {'target_type': 'time_series_classification'} - if dataset_properties is not None: - default_dataset_properties.update(dataset_properties) - - steps.extend([ - ("imputer", SimpleImputer(random_state=self.random_state)), - ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)), - ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), - ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties, - random_state=self.random_state)), - ("tabular_transformer", TimeSeriesTransformer(random_state=self.random_state)), - ("preprocessing", EarlyPreprocessing(random_state=self.random_state)), - ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties, - random_state=self.random_state)), - ("network_backbone", NetworkBackboneChoice(default_dataset_properties, - random_state=self.random_state)), - ("network_head", NetworkHeadChoice(default_dataset_properties, - random_state=self.random_state)), - ("network", NetworkComponent(random_state=self.random_state)), - ("network_init", NetworkInitializerChoice(default_dataset_properties, - random_state=self.random_state)), - ("optimizer", OptimizerChoice(default_dataset_properties, - random_state=self.random_state)), - ("lr_scheduler", SchedulerChoice(default_dataset_properties, - random_state=self.random_state)), - ("data_loader", TimeSeriesDataLoader(random_state=self.random_state)), - ("trainer", TrainerChoice(default_dataset_properties, random_state=self.random_state)), - ]) - return steps - - def get_pipeline_representation(self) -> Dict[str, str]: - """ - Returns a representation of the pipeline, so that it can be - consumed and formatted by the API. - - It should be a representation that follows: - [{'PreProcessing': <>, 'Estimator': <>}] - - Returns: - Dict: contains the pipeline representation in a short format - """ - preprocessing = [] - estimator = [] - skip_steps = ['data_loader', 'trainer', 'lr_scheduler', 'optimizer', 'network_init', - 'preprocessing', 'time_series_transformer'] - for step_name, step_component in self.steps: - if step_name in skip_steps: - continue - properties = {} - if isinstance(step_component, autoPyTorchChoice) and step_component.choice is not None: - properties = step_component.choice.get_properties() - elif isinstance(step_component, autoPyTorchComponent): - properties = step_component.get_properties() - if 'shortname' in properties: - if 'network' in step_name: - estimator.append(properties['shortname']) - else: - preprocessing.append(properties['shortname']) - return { - 'Preprocessing': ','.join(preprocessing), - 'Estimator': ','.join(estimator), - } - - def _get_estimator_hyperparameter_name(self) -> str: - """ - Returns the name of the current estimator. 
- - Returns: - str: name of the pipeline type - """ - return "time_series_classifier" diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index a44ab548d..c4a9b37c4 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -236,7 +236,8 @@ def _get_hyperparameter_search_space(self, forecast_strategy = cs.get_hyperparameter("loss:DistributionLoss:forecast_strategy") use_tf = cs.get_hyperparameter("network_backbone:seq_encoder:use_temporal_fusion") - if True in decoder_auto_regressive.choices and 'sample' in forecast_strategy.choices and True in use_tf.choices: + if True in decoder_auto_regressive.choices and\ + 'sample' in forecast_strategy.choices and True in use_tf.choices: cs.add_forbidden_clause( ForbiddenAndConjunction( ForbiddenEqualsClause(decoder_auto_regressive, True), diff --git a/autoPyTorch/pipeline/time_series_regression.py b/autoPyTorch/pipeline/time_series_regression.py deleted file mode 100644 index 9c43e5966..000000000 --- a/autoPyTorch/pipeline/time_series_regression.py +++ /dev/null @@ -1,224 +0,0 @@ -import warnings -from typing import Any, Dict, List, Optional, Tuple, Union - -from ConfigSpace.configuration_space import Configuration, ConfigurationSpace - -import numpy as np - -from sklearn.base import RegressorMixin - -from autoPyTorch.pipeline.base_pipeline import BasePipeline -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( - TimeSeriesTransformer -) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import ( - EncoderChoice -) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing import ( - FeatureProprocessorChoice -) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice -from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing -from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice -from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent -from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_embedding import NetworkEmbeddingChoice -from autoPyTorch.pipeline.components.setup.network_head import NetworkHeadChoice -from autoPyTorch.pipeline.components.setup.network_initializer import NetworkInitializerChoice -from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice -from autoPyTorch.pipeline.components.training.data_loader.time_series_data_loader import TimeSeriesDataLoader -from autoPyTorch.pipeline.components.training.trainer import TrainerChoice -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates - - -class TimeSeriesRegressionPipeline(RegressorMixin, BasePipeline): - """This class is a proof of concept to integrate AutoPyTorch Components - - It implements a pipeline, which includes as steps: - - ->One preprocessing step - ->One neural network - - Contrary to the sklearn 
API it is not possible to enumerate the - possible parameters in the __init__ function because we only know the - available regressors at runtime. For this reason the user must - specifiy the parameters by passing an instance of - ConfigSpace.configuration_space.Configuration. - - - Args: - config (Configuration) - The configuration to evaluate. - random_state (Optional[RandomState): random_state is the random number generator - - Attributes: - Examples - """ - - def __init__(self, - config: Optional[Configuration] = None, - steps: Optional[List[Tuple[str, Union[autoPyTorchComponent, autoPyTorchChoice]]]] = None, - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - include: Optional[Dict[str, Any]] = None, - exclude: Optional[Dict[str, Any]] = None, - random_state: Optional[np.random.RandomState] = None, - init_params: Optional[Dict[str, Any]] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None - ): - super().__init__( - config, steps, dataset_properties, include, exclude, - random_state, init_params, search_space_updates) - - def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: - """Scores the fitted estimator on (X, y) - - Args: - X (np.ndarray): input to the pipeline, from which to guess targets - batch_size (Optional[int]): batch_size controls whether the pipeline - will be called on small chunks of the data. Useful when calling the - predict method on the whole array X results in a MemoryError. - Returns: - np.ndarray: coefficient of determination R^2 of the prediction - """ - from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics, calculate_score - metrics = get_metrics(self.dataset_properties, ['r2']) - y_pred = self.predict(X, batch_size=batch_size) - r2 = calculate_score(y, y_pred, task_type=STRING_TO_TASK_TYPES[self.dataset_properties['task_type']], - metrics=metrics)['r2'] - return r2 - - def _get_hyperparameter_search_space(self, - dataset_properties: Dict[str, Any], - include: Optional[Dict[str, Any]] = None, - exclude: Optional[Dict[str, Any]] = None, - ) -> ConfigurationSpace: - """Create the hyperparameter configuration space. - - For the given steps, and the Choices within that steps, - this procedure returns a configuration space object to - explore. - - Args: - include (Optional[Dict[str, Any]]): what hyper-parameter configurations - to honor when creating the configuration space - exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations - to remove from the configuration space - dataset_properties (Optional[Dict[str, Union[str, int]]]): Characteristics - of the dataset to guide the pipeline choices of components - - Returns: - cs (Configuration): The configuration space describing the TimeSeriesRegressionPipeline. - """ - cs = ConfigurationSpace() - - if dataset_properties is None or not isinstance(dataset_properties, dict): - if not isinstance(dataset_properties, dict): - warnings.warn('The given dataset_properties argument contains an illegal value.' - 'Proceeding with the default value') - dataset_properties = dict() - - if 'target_type' not in dataset_properties: - dataset_properties['target_type'] = 'time_series_regression' - if dataset_properties['target_type'] != 'time_series_regression': - warnings.warn('Time series regression is being used, however the target_type' - 'is not given as "time_series_regression". 
Overriding it.') - dataset_properties['target_type'] = 'time_series_regression' - # get the base search space given this - # dataset properties. Then overwrite with custom - # regression requirements - cs = self._get_base_search_space( - cs=cs, dataset_properties=dataset_properties, - exclude=exclude, include=include, pipeline=self.steps) - - # Here we add custom code, like this with this - # is not a valid configuration - - self.configuration_space = cs - self.dataset_properties = dataset_properties - return cs - - def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> List[Tuple[str, autoPyTorchChoice]]: - """ - Defines what steps a pipeline should follow. - The step itself has choices given via autoPyTorchChoice. - - Returns: - List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised - by the pipeline. - """ - steps = [] # type: List[Tuple[str, autoPyTorchChoice]] - - default_dataset_properties = {'target_type': 'time_series_regression'} - if dataset_properties is not None: - default_dataset_properties.update(dataset_properties) - - steps.extend([ - ("imputer", SimpleImputer(random_state=self.random_state)), - ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), - ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties, - random_state=self.random_state)), - ("tabular_transformer", TimeSeriesTransformer(random_state=self.random_state)), - ("preprocessing", EarlyPreprocessing(random_state=self.random_state)), - ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties, - random_state=self.random_state)), - ("network_backbone", NetworkBackboneChoice(default_dataset_properties, - random_state=self.random_state)), - ("network_head", NetworkHeadChoice(default_dataset_properties, - random_state=self.random_state)), - ("network", NetworkComponent(random_state=self.random_state)), - ("network_init", NetworkInitializerChoice(default_dataset_properties, - random_state=self.random_state)), - ("optimizer", OptimizerChoice(default_dataset_properties, - random_state=self.random_state)), - ("lr_scheduler", SchedulerChoice(default_dataset_properties, - random_state=self.random_state)), - ("data_loader", TimeSeriesDataLoader(random_state=self.random_state)), - ("trainer", TrainerChoice(default_dataset_properties, random_state=self.random_state)), - ]) - return steps - - def get_pipeline_representation(self) -> Dict[str, str]: - """ - Returns a representation of the pipeline, so that it can be - consumed and formatted by the API. 
- - It should be a representation that follows: - [{'PreProcessing': <>, 'Estimator': <>}] - - Returns: - Dict: contains the pipeline representation in a short format - """ - preprocessing = [] - estimator = [] - skip_steps = ['data_loader', 'trainer', 'lr_scheduler', 'optimizer', 'network_init', - 'preprocessing', 'time_series_transformer'] - for step_name, step_component in self.steps: - if step_name in skip_steps: - continue - properties = {} - if isinstance(step_component, autoPyTorchChoice) and step_component.choice is not None: - properties = step_component.choice.get_properties() - elif isinstance(step_component, autoPyTorchComponent): - properties = step_component.get_properties() - if 'shortname' in properties: - if 'network' in step_name: - estimator.append(properties['shortname']) - else: - preprocessing.append(properties['shortname']) - return { - 'Preprocessing': ','.join(preprocessing), - 'Estimator': ','.join(estimator), - } - - def _get_estimator_hyperparameter_name(self) -> str: - """ - Returns the name of the current estimator. - - Returns: - str: name of the pipeline type - """ - return "time_series_regressor" diff --git a/autoPyTorch/utils/forecasting_time_features.py b/autoPyTorch/utils/forecasting_time_features.py index f576f9221..40ccf8cf1 100644 --- a/autoPyTorch/utils/forecasting_time_features.py +++ b/autoPyTorch/utils/forecasting_time_features.py @@ -11,13 +11,6 @@ # express or implied. See the License for the specific language governing # permissions and limitations under the License. - - -from typing import List, Optional - -import numpy as np -from pandas.tseries.frequencies import to_offset - # Frequencies used by GluonTS framework FREQUENCY_MAP = { "minutely": "1min", @@ -30,4 +23,3 @@ "quarterly": "1Q", "yearly": "1Y" } - diff --git a/autoPyTorch/utils/pipeline.py b/autoPyTorch/utils/pipeline.py index 9a38af209..24b01b2d5 100644 --- a/autoPyTorch/utils/pipeline.py +++ b/autoPyTorch/utils/pipeline.py @@ -15,8 +15,6 @@ from autoPyTorch.pipeline.image_classification import ImageClassificationPipeline from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline -from autoPyTorch.pipeline.time_series_classification import TimeSeriesClassificationPipeline -from autoPyTorch.pipeline.time_series_regression import TimeSeriesRegressionPipeline from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline from autoPyTorch.utils.common import FitRequirement from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -91,18 +89,12 @@ def _get_regression_dataset_requirements(info: Dict[str, Any], task_type = STRING_TO_TASK_TYPES[info['task_type']] if task_type in TABULAR_TASKS: return TabularRegressionPipeline( - dataset_properties=info, - include=include, - exclude=exclude - ).get_dataset_requirements() - - elif task_type in TIMESERIES_TASKS: - return TimeSeriesRegressionPipeline( dataset_properties=info, include=include, exclude=exclude, search_space_updates=search_space_updates ).get_dataset_requirements() + else: raise ValueError("Task_type not supported") @@ -131,9 +123,9 @@ def _get_classification_dataset_requirements(info: Dict[str, Any], def _get_forecasting_dataset_requirements(info: Dict[str, Any], - include: Optional[Dict] = None, - exclude: Optional[Dict] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + include: Optional[Dict] = None, + exclude: 
Optional[Dict] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ) -> List[FitRequirement]: task_type = STRING_TO_TASK_TYPES[info['task_type']] diff --git a/test/test_pipeline/components/test_time_series_data_loader.py b/test/test_pipeline/components/test_time_series_data_loader.py deleted file mode 100644 index 1b15db916..000000000 --- a/test/test_pipeline/components/test_time_series_data_loader.py +++ /dev/null @@ -1,43 +0,0 @@ -import unittest -import unittest.mock - -import torchvision - -from autoPyTorch.pipeline.components.training.data_loader.time_series_data_loader import ( - TimeSeriesDataLoader -) - - -class TestTimeSeriesDataLoader(unittest.TestCase): - def test_build_transform_small_preprocess_true(self): - """ - Makes sure a proper composition is created - """ - loader = TimeSeriesDataLoader() - - fit_dictionary = {'dataset_properties': {'is_small_preprocess': True}} - for thing in ['scaler']: - fit_dictionary[thing] = [unittest.mock.Mock()] - - compose = loader.build_transform(fit_dictionary, mode='train') - - self.assertIsInstance(compose, torchvision.transforms.Compose) - - # No preprocessing needed here as it was done before, only from_numpy - self.assertEqual(len(compose.transforms), 1) - - def test_build_transform_small_preprocess_false(self): - """ - Makes sure a proper composition is created - """ - loader = TimeSeriesDataLoader() - - fit_dictionary = {'dataset_properties': {'is_small_preprocess': False}, - 'preprocess_transforms': [unittest.mock.Mock()]} - - compose = loader.build_transform(fit_dictionary, mode='train') - - self.assertIsInstance(compose, torchvision.transforms.Compose) - - # We expect the preprocess_transforms and from_numpy - self.assertEqual(len(compose.transforms), 2) diff --git a/test/test_pipeline/components/test_time_series_scaler_choice.py b/test/test_pipeline/components/test_time_series_scaler_choice.py deleted file mode 100644 index d59154ed3..000000000 --- a/test/test_pipeline/components/test_time_series_scaler_choice.py +++ /dev/null @@ -1,47 +0,0 @@ -import copy -import unittest - -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ - ScalerChoice - - -class TestTimeSeriesScalerChoice(unittest.TestCase): - - def test_get_set_config_space(self): - """Make sure that we can setup a valid choice for the time series scaler""" - dataset_properties = {'categorical_features': [], - 'numerical_features': list(range(4)), - 'issparse': False} - scaler_choice = ScalerChoice(dataset_properties) - cs = scaler_choice.get_hyperparameter_search_space() - - # Make sure that all hyperparameters are part of the search space - self.assertListEqual( - sorted(cs.get_hyperparameter('__choice__').choices), - sorted(list(scaler_choice.get_components().keys())) - ) - - # Make sure we can properly set some random configs - # Whereas just one iteration will make sure the algorithm works, - # doing five iterations increase the confidence. 
We will be able to - # catch component specific crashes - for i in range(5): - config = cs.sample_configuration() - config_dict = copy.deepcopy(config.get_dictionary()) - scaler_choice.set_hyperparameters(config) - - self.assertEqual(scaler_choice.choice.__class__, - scaler_choice.get_components()[config_dict['__choice__']]) - - # Then check the choice configuration - selected_choice = config_dict.pop('__choice__', None) - for key, value in config_dict.items(): - # Remove the selected_choice string from the parameter - # so we can query in the object for it - key = key.replace(selected_choice + ':', '') - self.assertIn(key, vars(scaler_choice.choice)) - self.assertEqual(value, scaler_choice.choice.__dict__[key]) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/test_pipeline/components/test_time_series_scalers.py b/test/test_pipeline/components/test_time_series_scalers.py deleted file mode 100644 index f051be329..000000000 --- a/test/test_pipeline/components/test_time_series_scalers.py +++ /dev/null @@ -1,152 +0,0 @@ -import unittest - -import numpy as np -from numpy.testing import assert_allclose - -from sklearn.base import BaseEstimator - -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.MaxAbsScaler import MaxAbsScaler -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.MinMaxScaler import MinMaxScaler -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.NoScaler import NoScaler -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.StandardScaler import \ - StandardScaler - - -class TestMinMaxScaler(unittest.TestCase): - - def test_minmax_scaler(self): - data = np.array([ - [[1], [2], [3]], - [[7], [8], [9]], - [[10], [11], [12]] - ]) - - dataset_properties = {'categorical_features': [], - 'numerical_features': [0]} - - X = { - 'X_train': data, - 'dataset_properties': dataset_properties - } - scaler_component = MinMaxScaler() - - scaler_component = scaler_component.fit(X) - X = scaler_component.transform(X) - scaler = X['scaler']['numerical'] - - # check if the fit dictionary X is modified as expected - self.assertIsInstance(X['scaler'], dict) - self.assertIsInstance(scaler, BaseEstimator) - self.assertIsNone(X['scaler']['categorical']) - - # make column transformer with returned encoder to fit on data - scaler = scaler.fit(X["X_train"]) - transformed = scaler.transform(X["X_train"]) - assert_allclose(transformed, - np.array([ - [[0], [0.5], [1]], - [[0], [0.5], [1]], - [[0], [0.5], [1]], - ])) - - -class TestMaxAbsScaler(unittest.TestCase): - - def test_maxabs_scaler(self): - data = np.array([ - [[-10], [2], [3]], - [[-7], [8], [9]], - [[-8], [11], [12]] - ]) - - dataset_properties = {'categorical_features': [], - 'numerical_features': [0]} - - X = { - 'X_train': data, - 'dataset_properties': dataset_properties - } - scaler_component = MaxAbsScaler() - - scaler_component = scaler_component.fit(X) - X = scaler_component.transform(X) - scaler = X['scaler']['numerical'] - - # check if the fit dictionary X is modified as expected - self.assertIsInstance(X['scaler'], dict) - self.assertIsInstance(scaler, BaseEstimator) - self.assertIsNone(X['scaler']['categorical']) - - # make column transformer with returned encoder to fit on data - scaler = scaler.fit(X["X_train"]) - transformed = scaler.transform(X["X_train"]) - assert_allclose(transformed, - np.array([ - [[-1], [0.2], [0.3]], - [[-7 / 9], [8 / 9], [1]], - [[-8 / 12], [11 / 12], [1]], - 
])) - - -class TestStandardScaler(unittest.TestCase): - - def test_standard_scaler(self): - data = np.array([ - [[1], [2], [3], [4], [5]], - [[7], [8], [9], [10], [11]], - [[10], [11], [12], [13], [14]] - ]) - - dataset_properties = {'categorical_features': [], - 'numerical_features': [0]} - - X = { - 'X_train': data, - 'dataset_properties': dataset_properties - } - scaler_component = StandardScaler() - - scaler_component = scaler_component.fit(X) - X = scaler_component.transform(X) - scaler = X['scaler']['numerical'] - - # check if the fit dictionary X is modified as expected - self.assertIsInstance(X['scaler'], dict) - self.assertIsInstance(scaler, BaseEstimator) - self.assertIsNone(X['scaler']['categorical']) - - # make column transformer with returned encoder to fit on data - scaler = scaler.fit(X["X_train"]) - transformed = scaler.transform(X["X_train"]) - assert_allclose(transformed, - np.array([ - [[-1.41421356], [-0.70710678], [0.], [0.70710678], [1.41421356]], - [[-1.41421356], [-0.70710678], [0.], [0.70710678], [1.41421356]], - [[-1.41421356], [-0.70710678], [0.], [0.70710678], [1.41421356]], - ])) - - -class TestNoneScaler(unittest.TestCase): - - def test_none_scaler(self): - data = np.array([ - [[1], [2], [3]], - [[7], [8], [9]], - [[10], [11], [12]] - ]) - - dataset_properties = {'categorical_features': [], - 'numerical_features': [0]} - - X = { - 'X_train': data, - 'dataset_properties': dataset_properties - } - scaler_component = NoScaler() - - scaler_component = scaler_component.fit(X) - X = scaler_component.transform(X) - - # check if the fit dictionary X is modified as expected - self.assertIsInstance(X['scaler'], dict) - self.assertIsNone(X['scaler']['categorical']) diff --git a/test/test_pipeline/components/test_time_series_transformer.py b/test/test_pipeline/components/test_time_series_transformer.py deleted file mode 100644 index 456267e3e..000000000 --- a/test/test_pipeline/components/test_time_series_transformer.py +++ /dev/null @@ -1,30 +0,0 @@ -from test.test_pipeline.components.base import TimeSeriesPipeline - -import numpy as np - -import pytest - -from sklearn.pipeline import Pipeline - -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ - TimeSeriesTransformer - - -@pytest.mark.parametrize("fit_dictionary_time_series", ['classification_numerical_only'], indirect=True) -class TestTimeSeriesTransformer: - def test_time_series_preprocess(self, fit_dictionary_time_series): - pipeline = TimeSeriesPipeline(dataset_properties=fit_dictionary_time_series['dataset_properties']) - pipeline = pipeline.fit(fit_dictionary_time_series) - X = pipeline.transform(fit_dictionary_time_series) - transformer = X['time_series_transformer'] - - # check if transformer was added to fit dictionary - assert 'time_series_transformer' in X.keys() - # check if transformer is of expected type - # In this case we expect the time series transformer not the actual implementation behind it - # as the later is not callable and runs into error in the compose transform - assert isinstance(transformer, TimeSeriesTransformer) - assert isinstance(transformer.preprocessor, Pipeline) - - data = transformer.preprocessor.fit_transform(X['X_train']) - assert isinstance(data, np.ndarray) diff --git a/test/test_pipeline/components/training/base.py b/test/test_pipeline/components/training/base.py index 1e85df5e6..4f86356e4 100644 --- a/test/test_pipeline/components/training/base.py +++ b/test/test_pipeline/components/training/base.py @@ -21,14 +21,9 @@ 
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice as\ TabularScalerChoice -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import \ - TimeSeriesTransformer -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler_choice import \ - ScalerChoice as TimeSeriesScalerChoice from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline -from autoPyTorch.pipeline.time_series_classification import TimeSeriesClassificationPipeline @@ -167,27 +162,3 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], ]) return steps - -class TimeSeriesPipeline(TimeSeriesClassificationPipeline): - def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], - ) -> List[Tuple[str, autoPyTorchChoice]]: - """ - Defines what steps a pipeline should follow. - The step itself has choices given via autoPyTorchChoice. - - Returns: - List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised - by the pipeline. - """ - steps = [] # type: List[Tuple[str, autoPyTorchChoice]] - - default_dataset_properties = {'target_type': 'time_series_classification'} - if dataset_properties is not None: - default_dataset_properties.update(dataset_properties) - - steps.extend([ - ("scaler", TimeSeriesScalerChoice(default_dataset_properties)), - ("time_series_transformer", TimeSeriesTransformer()), - ]) - return steps - diff --git a/test/test_pipeline/test_time_series_classification.py b/test/test_pipeline/test_time_series_classification.py deleted file mode 100644 index 6098f2dc7..000000000 --- a/test/test_pipeline/test_time_series_classification.py +++ /dev/null @@ -1,316 +0,0 @@ -import os -import re - -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter, - UniformFloatHyperparameter, - UniformIntegerHyperparameter, -) - -import numpy as np - -import pytest - -import torch - -from autoPyTorch import metrics -from autoPyTorch.pipeline.components.setup.early_preprocessor.utils import get_preprocess_transforms -from autoPyTorch.pipeline.time_series_classification import TimeSeriesClassificationPipeline -from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates, \ - parse_hyperparameter_search_space_updates - - -@pytest.mark.parametrize("fit_dictionary_time_series", ['classification_numerical_only'], indirect=True) -class TestTimeSeriesClassification: - def _assert_pipeline_search_space(self, pipeline, search_space_updates): - config_space = pipeline.get_hyperparameter_search_space() - for update in search_space_updates.updates: - try: - assert update.node_name + ':' + update.hyperparameter in config_space - hyperparameter = config_space.get_hyperparameter(update.node_name + ':' + update.hyperparameter) - except AssertionError: - assert any(update.node_name + ':' + update.hyperparameter in name - for name in config_space.get_hyperparameter_names()), \ - "Can't find hyperparameter: {}".format(update.hyperparameter) - hyperparameter = config_space.get_hyperparameter(update.node_name + ':' + update.hyperparameter + 
'_1') - assert update.default_value == hyperparameter.default_value - if isinstance(hyperparameter, (UniformIntegerHyperparameter, UniformFloatHyperparameter)): - assert update.value_range[0] == hyperparameter.lower - assert update.value_range[1] == hyperparameter.upper - if hasattr(update, 'log'): - assert update.log == hyperparameter.log - elif isinstance(hyperparameter, CategoricalHyperparameter): - assert update.value_range == hyperparameter.choices - - def test_pipeline_fit(self, fit_dictionary_time_series): - """This test makes sure that the pipeline is able to fit - given random combinations of hyperparameters across the pipeline""" - - pipeline = TimeSeriesClassificationPipeline( - dataset_properties=fit_dictionary_time_series['dataset_properties']) - cs = pipeline.get_hyperparameter_search_space() - config = cs.sample_configuration() - pipeline.set_hyperparameters(config) - pipeline.fit(fit_dictionary_time_series) - - # To make sure we fitted the model, there should be a - # run summary object with accuracy - run_summary = pipeline.named_steps['trainer'].run_summary - assert run_summary is not None - - # Make sure that performance was properly captured - assert run_summary.performance_tracker['train_loss'][1] > 0 - assert run_summary.total_parameter_count > 0 - assert 'accuracy' in run_summary.performance_tracker['train_metrics'][1] - - # Make sure a network was fit - assert isinstance(pipeline.named_steps['network'].get_network(), torch.nn.Module) - - @pytest.mark.parametrize("fit_dictionary_time_series_dummy", ["classification"], indirect=True) - def test_pipeline_score(self, fit_dictionary_time_series_dummy, fit_dictionary_time_series): - """This test makes sure that the pipeline is able to achieve a decent score on dummy data - given the default configuration""" - X = fit_dictionary_time_series_dummy['X_train'].copy() - y = fit_dictionary_time_series_dummy['y_train'].copy() - pipeline = TimeSeriesClassificationPipeline( - dataset_properties=fit_dictionary_time_series_dummy['dataset_properties']) - - cs = pipeline.get_hyperparameter_search_space() - config = cs.get_default_configuration() - pipeline.set_hyperparameters(config) - - pipeline.fit(fit_dictionary_time_series_dummy) - - # we expect the output to have the same batch size as the test input, - # and number of outputs per batch sample equal to the number of classes ("num_classes" in dataset_properties) - expected_output_shape = (X.shape[0], - fit_dictionary_time_series_dummy["dataset_properties"]["output_shape"]) - - prediction = pipeline.predict(X) - assert isinstance(prediction, np.ndarray) - assert prediction.shape == expected_output_shape - - # we should be able to get a decent score on this dummy data - accuracy = metrics.accuracy(y, prediction.squeeze()) - assert accuracy >= 0.8 - - def test_pipeline_predict(self, fit_dictionary_time_series): - """This test makes sure that the pipeline is able to predict - given a random configuration""" - X = fit_dictionary_time_series['X_train'].copy() - pipeline = TimeSeriesClassificationPipeline( - dataset_properties=fit_dictionary_time_series['dataset_properties']) - - cs = pipeline.get_hyperparameter_search_space() - config = cs.sample_configuration() - pipeline.set_hyperparameters(config) - - pipeline.fit(fit_dictionary_time_series) - - # we expect the output to have the same batch size as the test input, - # and number of outputs per batch sample equal to the number of outputs - expected_output_shape = (X.shape[0], 
fit_dictionary_time_series["dataset_properties"]["output_shape"]) - - prediction = pipeline.predict(X) - assert isinstance(prediction, np.ndarray) - assert prediction.shape == expected_output_shape - - def test_pipeline_predict_proba(self, fit_dictionary_time_series): - """This test makes sure that the pipeline is able to fit - given random combinations of hyperparameters across the pipeline - And then predict using predict probability - """ - X = fit_dictionary_time_series['X_train'].copy() - pipeline = TimeSeriesClassificationPipeline( - dataset_properties=fit_dictionary_time_series['dataset_properties']) - - cs = pipeline.get_hyperparameter_search_space() - config = cs.sample_configuration() - pipeline.set_hyperparameters(config) - - pipeline.fit(fit_dictionary_time_series) - - # we expect the output to have the same batch size as the test input, - # and number of outputs per batch sample equal to the number of classes ("num_classes" in dataset_properties) - expected_output_shape = (X.shape[0], fit_dictionary_time_series["dataset_properties"]["output_shape"]) - - prediction = pipeline.predict_proba(X) - assert isinstance(prediction, np.ndarray) - assert prediction.shape == expected_output_shape - - def test_pipeline_transform(self, fit_dictionary_time_series): - """ - In the context of autopytorch, transform expands a fit dictionary with - components that where previously fit. We can use this as a nice way to make sure - that fit properly work. - This code is added in light of components not properly added to the fit dicitonary - """ - - pipeline = TimeSeriesClassificationPipeline( - dataset_properties=fit_dictionary_time_series['dataset_properties']) - cs = pipeline.get_hyperparameter_search_space() - config = cs.sample_configuration() - pipeline.set_hyperparameters(config) - - # We do not want to make the same early preprocessing operation to the fit dictionary - pipeline.fit(fit_dictionary_time_series.copy()) - - transformed_fit_dictionary_time_series = pipeline.transform(fit_dictionary_time_series) - - # First, we do not lose anyone! (We use a fancy subset containment check) - assert fit_dictionary_time_series.items() <= transformed_fit_dictionary_time_series.items() - - # Then the pipeline should have added the following keys - expected_keys = {'scaler', 'time_series_transformer', - 'preprocess_transforms', 'network', 'optimizer', 'lr_scheduler', - 'train_data_loader', 'val_data_loader', 'run_summary'} - assert expected_keys.issubset(set(transformed_fit_dictionary_time_series.keys())) - - # Then we need to have transformations being created. 
- assert len(get_preprocess_transforms(transformed_fit_dictionary_time_series)) > 0 - - # We expect the transformations to be in the pipeline at anytime for inference - assert 'preprocess_transforms' in transformed_fit_dictionary_time_series.keys() - - @pytest.mark.parametrize("is_small_preprocess", [True, False]) - def test_default_configuration(self, fit_dictionary_time_series, is_small_preprocess): - """Makes sure that when no config is set, we can trust the - default configuration from the space""" - - fit_dictionary_time_series['is_small_preprocess'] = is_small_preprocess - - pipeline = TimeSeriesClassificationPipeline( - dataset_properties=fit_dictionary_time_series['dataset_properties']) - - pipeline.fit(fit_dictionary_time_series) - - def test_remove_key_check_requirements(self, fit_dictionary_time_series): - """Makes sure that when a key is removed from X, correct error is outputted""" - pipeline = TimeSeriesClassificationPipeline( - dataset_properties=fit_dictionary_time_series['dataset_properties']) - for key in ['num_run', 'device', 'split_id', 'use_pynisher', 'torch_num_threads', 'dataset_properties']: - fit_dictionary_time_series_copy = fit_dictionary_time_series.copy() - fit_dictionary_time_series_copy.pop(key) - with pytest.raises(ValueError, match=r"To fit .+?, expected fit dictionary to have"): - pipeline.fit(fit_dictionary_time_series_copy) - - def test_network_optimizer_lr_handshake(self, fit_dictionary_time_series): - """Fitting a network should put the network in the X""" - # Create the pipeline to check. A random config should be sufficient - pipeline = TimeSeriesClassificationPipeline( - dataset_properties=fit_dictionary_time_series['dataset_properties']) - cs = pipeline.get_hyperparameter_search_space() - config = cs.sample_configuration() - pipeline.set_hyperparameters(config) - - # Make sure that fitting a network adds a "network" to X - assert 'network' in pipeline.named_steps.keys() - fit_dictionary_time_series['network_backbone'] = torch.nn.Linear(3, 4) - fit_dictionary_time_series['network_head'] = torch.nn.Linear(4, 1) - X = pipeline.named_steps['network'].fit( - fit_dictionary_time_series, - None - ).transform(fit_dictionary_time_series) - assert 'network' in X - - # Then fitting a optimizer should fail if no network: - assert 'optimizer' in pipeline.named_steps.keys() - with pytest.raises( - ValueError, - match=r"To fit .+?, expected fit dictionary to have 'network' but got .*" - ): - pipeline.named_steps['optimizer'].fit({'dataset_properties': {}}, None) - - # No error when network is passed - X = pipeline.named_steps['optimizer'].fit(X, None).transform(X) - assert 'optimizer' in X - - # Then fitting a optimizer should fail if no network: - assert 'lr_scheduler' in pipeline.named_steps.keys() - with pytest.raises( - ValueError, - match=r"To fit .+?, expected fit dictionary to have 'optimizer' but got .*" - ): - pipeline.named_steps['lr_scheduler'].fit({'dataset_properties': {}}, None) - - # No error when network is passed - X = pipeline.named_steps['lr_scheduler'].fit(X, None).transform(X) - assert 'optimizer' in X - - def test_get_fit_requirements(self, fit_dictionary_time_series): - dataset_properties = {'numerical_features': [0], 'categorical_features': [], - 'task_type': 'time_series_classification'} - pipeline = TimeSeriesClassificationPipeline(dataset_properties=dataset_properties) - fit_requirements = pipeline.get_fit_requirements() - - # check if fit requirements is a list of FitRequirement named tuples - assert isinstance(fit_requirements, list) 
- for requirement in fit_requirements: - assert isinstance(requirement, FitRequirement) - - def test_apply_search_space_updates(self, fit_dictionary_time_series, search_space_updates): - dataset_properties = {'numerical_features': [0], 'categorical_features': [], - 'task_type': 'time_series_classification'} - pipeline = TimeSeriesClassificationPipeline(dataset_properties=dataset_properties, - search_space_updates=search_space_updates) - self._assert_pipeline_search_space(pipeline, search_space_updates) - - def test_read_and_update_search_space(self, fit_dictionary_time_series, search_space_updates): - import tempfile - path = tempfile.gettempdir() - path = os.path.join(path, 'updates.txt') - # Write to disk - search_space_updates.save_as_file(path=path) - assert os.path.exists(path=path) - - # Read from disk - file_search_space_updates = parse_hyperparameter_search_space_updates(updates_file=path) - assert isinstance(file_search_space_updates, HyperparameterSearchSpaceUpdates) - dataset_properties = {'numerical_features': [1], 'categorical_features': [2], - 'task_type': 'time_series_classification'} - pipeline = TimeSeriesClassificationPipeline(dataset_properties=dataset_properties, - search_space_updates=file_search_space_updates) - assert file_search_space_updates == pipeline.search_space_updates - - def test_error_search_space_updates(self, fit_dictionary_time_series, error_search_space_updates): - dataset_properties = {'numerical_features': [1], 'categorical_features': [2], - 'task_type': 'time_series_classification'} - try: - _ = TimeSeriesClassificationPipeline(dataset_properties=dataset_properties, - search_space_updates=error_search_space_updates) - except Exception as e: - assert isinstance(e, ValueError) - assert re.match(r'Unknown hyperparameter for component .*?\. Expected update ' - r'hyperparameter to be in \[.*?\] got .+', e.args[0]) - - def test_set_range_search_space_updates(self, fit_dictionary_time_series): - dataset_properties = {'numerical_features': [1], 'categorical_features': [2], - 'task_type': 'time_series_classification'} - config_dict = TimeSeriesClassificationPipeline(dataset_properties=dataset_properties). 
\ - get_hyperparameter_search_space()._hyperparameters - updates = HyperparameterSearchSpaceUpdates() - for i, (name, hyperparameter) in enumerate(config_dict.items()): - if '__choice__' in name: - continue - name = name.split(':') - hyperparameter_name = ':'.join(name[1:]) - if "network" in name[0]: - continue - if isinstance(hyperparameter, CategoricalHyperparameter): - value_range = (hyperparameter.choices[0],) - default_value = hyperparameter.choices[0] - else: - value_range = (0, 1) - default_value = 1 - updates.append(node_name=name[0], hyperparameter=hyperparameter_name, - value_range=value_range, default_value=default_value) - pipeline = TimeSeriesClassificationPipeline(dataset_properties=dataset_properties, - search_space_updates=updates) - - try: - self._assert_pipeline_search_space(pipeline, updates) - except AssertionError as e: - # As we are setting num_layers to 1 for fully connected - # head, units_layer does not exist in the configspace - assert 'fully_connected:units_layer' in e.args[0] diff --git a/test/test_pipeline/test_time_series_regression.py b/test/test_pipeline/test_time_series_regression.py deleted file mode 100644 index e867d8932..000000000 --- a/test/test_pipeline/test_time_series_regression.py +++ /dev/null @@ -1,295 +0,0 @@ -import os -import re - -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter, - UniformFloatHyperparameter, - UniformIntegerHyperparameter, -) - -import numpy as np - -import pytest - -import torch - -from autoPyTorch import metrics -from autoPyTorch.pipeline.components.setup.early_preprocessor.utils import get_preprocess_transforms -from autoPyTorch.pipeline.time_series_regression import TimeSeriesRegressionPipeline -from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates, \ - parse_hyperparameter_search_space_updates - - -@pytest.mark.parametrize("fit_dictionary_time_series", ['regression_numerical_only'], indirect=True) -class TestTimeSeriesRegression: - def _assert_pipeline_search_space(self, pipeline, search_space_updates): - config_space = pipeline.get_hyperparameter_search_space() - for update in search_space_updates.updates: - try: - assert update.node_name + ':' + update.hyperparameter in config_space - hyperparameter = config_space.get_hyperparameter(update.node_name + ':' + update.hyperparameter) - except AssertionError: - assert any(update.node_name + ':' + update.hyperparameter in name - for name in config_space.get_hyperparameter_names()), \ - "Can't find hyperparameter: {}".format(update.hyperparameter) - hyperparameter = config_space.get_hyperparameter(update.node_name + ':' + update.hyperparameter + '_1') - assert update.default_value == hyperparameter.default_value - if isinstance(hyperparameter, (UniformIntegerHyperparameter, UniformFloatHyperparameter)): - assert update.value_range[0] == hyperparameter.lower - assert update.value_range[1] == hyperparameter.upper - if hasattr(update, 'log'): - assert update.log == hyperparameter.log - elif isinstance(hyperparameter, CategoricalHyperparameter): - assert update.value_range == hyperparameter.choices - - def test_pipeline_fit(self, fit_dictionary_time_series): - """This test makes sure that the pipeline is able to fit - given random combinations of hyperparameters across the pipeline""" - - pipeline = TimeSeriesRegressionPipeline( - dataset_properties=fit_dictionary_time_series['dataset_properties']) - cs = pipeline.get_hyperparameter_search_space() - config = 
cs.sample_configuration() - pipeline.set_hyperparameters(config) - pipeline.fit(fit_dictionary_time_series) - - # To make sure we fitted the model, there should be a - # run summary object with accuracy - run_summary = pipeline.named_steps['trainer'].run_summary - assert run_summary is not None - - # Make sure that performance was properly captured - assert run_summary.performance_tracker['train_loss'][1] > 0 - assert run_summary.total_parameter_count > 0 - assert 'r2' in run_summary.performance_tracker['train_metrics'][1] - - # Make sure a network was fit - assert isinstance(pipeline.named_steps['network'].get_network(), torch.nn.Module) - - @pytest.mark.parametrize("fit_dictionary_time_series_dummy", ["regression"], indirect=True) - def test_pipeline_score(self, fit_dictionary_time_series_dummy, fit_dictionary_time_series): - """This test makes sure that the pipeline is able to achieve a decent score on dummy data - given the default configuration""" - X = fit_dictionary_time_series_dummy['X_train'].copy() - y = fit_dictionary_time_series_dummy['y_train'].copy() - pipeline = TimeSeriesRegressionPipeline( - dataset_properties=fit_dictionary_time_series_dummy['dataset_properties']) - - cs = pipeline.get_hyperparameter_search_space() - config = cs.get_default_configuration() - pipeline.set_hyperparameters(config) - - # regression needs more iterations to converge - fit_dictionary_time_series_dummy["epochs"] = 1000 - pipeline.fit(fit_dictionary_time_series_dummy) - - # we expect the output to have the same batch size as the test input, - # and number of outputs per batch sample equal to the number of targets ("output_shape" in dataset_properties) - expected_output_shape = (X.shape[0], - fit_dictionary_time_series_dummy["dataset_properties"]["output_shape"]) - - prediction = pipeline.predict(X) - assert isinstance(prediction, np.ndarray) - assert prediction.shape == expected_output_shape - - # we should be able to get a decent score on this dummy data - r2_score = metrics.r2(y, prediction.squeeze()) - assert r2_score >= 0.5 - - def test_pipeline_predict(self, fit_dictionary_time_series): - """This test makes sure that the pipeline is able to predict - given a random configuration""" - X = fit_dictionary_time_series['X_train'].copy() - pipeline = TimeSeriesRegressionPipeline( - dataset_properties=fit_dictionary_time_series['dataset_properties']) - - cs = pipeline.get_hyperparameter_search_space() - config = cs.sample_configuration() - pipeline.set_hyperparameters(config) - - pipeline.fit(fit_dictionary_time_series) - - # we expect the output to have the same batch size as the test input, - # and number of outputs per batch sample equal to the number of outputs - expected_output_shape = (X.shape[0], fit_dictionary_time_series["dataset_properties"]["output_shape"]) - - prediction = pipeline.predict(X) - assert isinstance(prediction, np.ndarray) - assert prediction.shape == expected_output_shape - - def test_pipeline_transform(self, fit_dictionary_time_series): - """ - In the context of autopytorch, transform expands a fit dictionary with - components that were previously fit. We can use this as a nice way to make sure - that fit works properly.
- This code is added in light of components not properly added to the fit dictionary - """ - - pipeline = TimeSeriesRegressionPipeline( - dataset_properties=fit_dictionary_time_series['dataset_properties']) - cs = pipeline.get_hyperparameter_search_space() - config = cs.sample_configuration() - pipeline.set_hyperparameters(config) - - # We fit on a copy so that the early preprocessing operations do not modify the original fit dictionary - pipeline.fit(fit_dictionary_time_series.copy()) - - transformed_fit_dictionary_time_series = pipeline.transform(fit_dictionary_time_series) - - # First, we do not lose anyone! (We use a fancy subset containment check) - assert fit_dictionary_time_series.items() <= transformed_fit_dictionary_time_series.items() - - # Then the pipeline should have added the following keys - expected_keys = {'scaler', 'time_series_transformer', - 'preprocess_transforms', 'network', 'optimizer', 'lr_scheduler', - 'train_data_loader', 'val_data_loader', 'run_summary'} - assert expected_keys.issubset(set(transformed_fit_dictionary_time_series.keys())) - - # Then we need to have transformations being created. - assert len(get_preprocess_transforms(transformed_fit_dictionary_time_series)) > 0 - - # We expect the transformations to be in the pipeline at any time for inference - assert 'preprocess_transforms' in transformed_fit_dictionary_time_series.keys() - - @pytest.mark.parametrize("is_small_preprocess", [True, False]) - def test_default_configuration(self, fit_dictionary_time_series, is_small_preprocess): - """Makes sure that when no config is set, we can trust the - default configuration from the space""" - - fit_dictionary_time_series['is_small_preprocess'] = is_small_preprocess - - pipeline = TimeSeriesRegressionPipeline( - dataset_properties=fit_dictionary_time_series['dataset_properties']) - - pipeline.fit(fit_dictionary_time_series) - - def test_remove_key_check_requirements(self, fit_dictionary_time_series): - """Makes sure that when a key is removed from X, the correct error is raised""" - pipeline = TimeSeriesRegressionPipeline( - dataset_properties=fit_dictionary_time_series['dataset_properties']) - for key in ['num_run', 'device', 'split_id', 'use_pynisher', 'torch_num_threads', 'dataset_properties']: - fit_dictionary_time_series_copy = fit_dictionary_time_series.copy() - fit_dictionary_time_series_copy.pop(key) - with pytest.raises(ValueError, match=r"To fit .+?, expected fit dictionary to have"): - pipeline.fit(fit_dictionary_time_series_copy) - - def test_network_optimizer_lr_handshake(self, fit_dictionary_time_series): - """Fitting a network should put the network in the X""" - # Create the pipeline to check.
A random config should be sufficient - pipeline = TimeSeriesRegressionPipeline( - dataset_properties=fit_dictionary_time_series['dataset_properties']) - cs = pipeline.get_hyperparameter_search_space() - config = cs.sample_configuration() - pipeline.set_hyperparameters(config) - - # Make sure that fitting a network adds a "network" to X - assert 'network' in pipeline.named_steps.keys() - fit_dictionary_time_series['network_backbone'] = torch.nn.Linear(3, 4) - fit_dictionary_time_series['network_head'] = torch.nn.Linear(4, 1) - X = pipeline.named_steps['network'].fit( - fit_dictionary_time_series, - None - ).transform(fit_dictionary_time_series) - assert 'network' in X - - # Then fitting a optimizer should fail if no network: - assert 'optimizer' in pipeline.named_steps.keys() - with pytest.raises( - ValueError, - match=r"To fit .+?, expected fit dictionary to have 'network' but got .*" - ): - pipeline.named_steps['optimizer'].fit({'dataset_properties': {}}, None) - - # No error when network is passed - X = pipeline.named_steps['optimizer'].fit(X, None).transform(X) - assert 'optimizer' in X - - # Then fitting a optimizer should fail if no network: - assert 'lr_scheduler' in pipeline.named_steps.keys() - with pytest.raises( - ValueError, - match=r"To fit .+?, expected fit dictionary to have 'optimizer' but got .*" - ): - pipeline.named_steps['lr_scheduler'].fit({'dataset_properties': {}}, None) - - # No error when network is passed - X = pipeline.named_steps['lr_scheduler'].fit(X, None).transform(X) - assert 'optimizer' in X - - def test_get_fit_requirements(self, fit_dictionary_time_series): - dataset_properties = {'numerical_features': [0], 'categorical_features': [], - 'task_type': 'time_series_classification'} - pipeline = TimeSeriesRegressionPipeline(dataset_properties=dataset_properties) - fit_requirements = pipeline.get_fit_requirements() - - # check if fit requirements is a list of FitRequirement named tuples - assert isinstance(fit_requirements, list) - for requirement in fit_requirements: - assert isinstance(requirement, FitRequirement) - - def test_apply_search_space_updates(self, fit_dictionary_time_series, search_space_updates): - dataset_properties = {'numerical_features': [0], 'categorical_features': [], - 'task_type': 'time_series_classification'} - pipeline = TimeSeriesRegressionPipeline(dataset_properties=dataset_properties, - search_space_updates=search_space_updates) - self._assert_pipeline_search_space(pipeline, search_space_updates) - - def test_read_and_update_search_space(self, fit_dictionary_time_series, search_space_updates): - import tempfile - path = tempfile.gettempdir() - path = os.path.join(path, 'updates.txt') - # Write to disk - search_space_updates.save_as_file(path=path) - assert os.path.exists(path=path) - - # Read from disk - file_search_space_updates = parse_hyperparameter_search_space_updates(updates_file=path) - assert isinstance(file_search_space_updates, HyperparameterSearchSpaceUpdates) - dataset_properties = {'numerical_features': [1], 'categorical_features': [2], - 'task_type': 'time_series_classification'} - pipeline = TimeSeriesRegressionPipeline(dataset_properties=dataset_properties, - search_space_updates=file_search_space_updates) - assert file_search_space_updates == pipeline.search_space_updates - - def test_error_search_space_updates(self, fit_dictionary_time_series, error_search_space_updates): - dataset_properties = {'numerical_features': [1], 'categorical_features': [2], - 'task_type': 'time_series_classification'} - try: - _ = 
TimeSeriesRegressionPipeline(dataset_properties=dataset_properties, - search_space_updates=error_search_space_updates) - except Exception as e: - assert isinstance(e, ValueError) - assert re.match(r'Unknown hyperparameter for component .*?\. Expected update ' - r'hyperparameter to be in \[.*?\] got .+', e.args[0]) - - def test_set_range_search_space_updates(self, fit_dictionary_time_series): - dataset_properties = {'numerical_features': [1], 'categorical_features': [2], - 'task_type': 'time_series_classification'} - config_dict = TimeSeriesRegressionPipeline(dataset_properties=dataset_properties). \ - get_hyperparameter_search_space()._hyperparameters - updates = HyperparameterSearchSpaceUpdates() - for i, (name, hyperparameter) in enumerate(config_dict.items()): - if '__choice__' in name: - continue - name = name.split(':') - hyperparameter_name = ':'.join(name[1:]) - if "network" in name[0]: - continue - if isinstance(hyperparameter, CategoricalHyperparameter): - value_range = (hyperparameter.choices[0],) - default_value = hyperparameter.choices[0] - else: - value_range = (0, 1) - default_value = 1 - updates.append(node_name=name[0], hyperparameter=hyperparameter_name, - value_range=value_range, default_value=default_value) - pipeline = TimeSeriesRegressionPipeline(dataset_properties=dataset_properties, - search_space_updates=updates) - - try: - self._assert_pipeline_search_space(pipeline, updates) - except AssertionError as e: - # As we are setting num_layers to 1 for fully connected - # head, units_layer does not exist in the configspace - assert 'fully_connected:units_layer' in e.args[0] From 88977e0a9348a59226f73e096436f83aa57ee095 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 2 May 2022 12:43:36 +0200 Subject: [PATCH 234/347] synchronise with development --- README.md | 21 ------ autoPyTorch/constants.py | 3 +- autoPyTorch/constants_forecasting.py | 2 +- autoPyTorch/evaluation/abstract_evaluator.py | 5 +- autoPyTorch/evaluation/tae.py | 2 +- autoPyTorch/optimizer/smbo.py | 34 +++++----- .../setup/early_preprocessor/utils.py | 2 +- .../setup/network_backbone/__init__.py | 7 +- .../network_backbone/base_network_backbone.py | 5 +- .../LearnedEntityEmbedding.py | 8 +-- .../base_network_embedding.py | 2 +- .../setup/network_head/fully_connected.py | 2 +- .../setup/network_initializer/SparseInit.py | 5 +- .../data_loader/time_series_data_loader.py | 60 ----------------- .../pipeline/components/training/losses.py | 9 ++- .../components/training/metrics/base.py | 4 +- .../components/training/metrics/utils.py | 8 ++- autoPyTorch/utils/pipeline.py | 22 ++----- setup.py | 1 - test/conftest.py | 64 ++++++++++--------- .../components/setup/test_setup.py | 4 +- .../test_pipeline/components/training/base.py | 36 ----------- 22 files changed, 91 insertions(+), 215 deletions(-) delete mode 100644 autoPyTorch/pipeline/components/training/data_loader/time_series_data_loader.py diff --git a/README.md b/README.md index b537e80fb..f82910806 100755 --- a/README.md +++ b/README.md @@ -49,8 +49,6 @@ Then API starts the following procedures: pip install autoPyTorch -```sh -$ git checkout development ``` ### Manual Installation @@ -147,31 +145,12 @@ Please refer to the branch `TPAMI.2021.3067763` to reproduce the paper *Auto-PyT title = {Auto-PyTorch Tabular: Multi-Fidelity MetaLearning for Efficient and Robust AutoDL}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, year = {2021}, -<<<<<<< HEAD - note = {IEEE early access; also available under 
https://arxiv.org/abs/2006.13799}, - pages = {1-12} -======= note = {also available under https://arxiv.org/abs/2006.13799}, pages = {3079 - 3090} ->>>>>>> upstream/master -} -``` - -```bibtex -<<<<<<< HEAD - @article{zimmer-tpami21a, - author = {Lucas Zimmer and Marius Lindauer and Frank Hutter}, - title = {Auto-PyTorch Tabular: Multi-Fidelity MetaLearning for Efficient and Robust AutoDL}, - journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, - year = {2021}, - note = {IEEE early access; also available under https://arxiv.org/abs/2006.13799}, - pages = {1-12} } ``` ```bibtex -======= ->>>>>>> upstream/master @incollection{mendoza-automlbook18a, author = {Hector Mendoza and Aaron Klein and Matthias Feurer and Jost Tobias Springenberg and Matthias Urban and Michael Burkart and Max Dippel and Marius Lindauer and Frank Hutter}, title = {Towards Automatically-Tuned Deep Neural Networks}, diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py index f074ffaee..4e563eae4 100644 --- a/autoPyTorch/constants.py +++ b/autoPyTorch/constants.py @@ -10,9 +10,8 @@ TABULAR_TASKS = [TABULAR_CLASSIFICATION, TABULAR_REGRESSION] IMAGE_TASKS = [IMAGE_CLASSIFICATION, IMAGE_REGRESSION] -TIMESERIES_TASKS = [TIMESERIES_FORECASTING] -TASK_TYPES = REGRESSION_TASKS + CLASSIFICATION_TASKS + TIMESERIES_TASKS +TASK_TYPES = REGRESSION_TASKS + CLASSIFICATION_TASKS + FORECASTING_TASKS TASK_TYPES_TO_STRING = \ {TABULAR_CLASSIFICATION: 'tabular_classification', diff --git a/autoPyTorch/constants_forecasting.py b/autoPyTorch/constants_forecasting.py index 351286a63..edfc40e11 100644 --- a/autoPyTorch/constants_forecasting.py +++ b/autoPyTorch/constants_forecasting.py @@ -2,7 +2,7 @@ # https://github.com/rakshitha123/TSForecasting/blob/master/experiments/deep_learning_experiments.py # seasonality map, maps a frequency value to a number -FORECASTING_BUDGET_TYPE = ['resolution', 'num_seq', 'num_sample_per_seq'] +FORECASTING_BUDGET_TYPE = ('resolution', 'num_seq', 'num_sample_per_seq') SEASONALITY_MAP = { "1min": [1440, 10080, 525960], diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index af484c6d1..d35613d66 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -506,10 +506,7 @@ def __init__(self, backend: Backend, elif isinstance(self.configuration, str): self.pipeline_class = MyTraditionalTabularRegressionPipeline elif isinstance(self.configuration, Configuration): - if self.task_type in TABULAR_TASKS: - self.pipeline_class = autoPyTorch.pipeline.tabular_regression.TabularRegressionPipeline - elif self.task_type in FORECASTING_TASKS: - self.pipeline_class = autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline + self.pipeline_class = autoPyTorch.pipeline.tabular_regression.TabularRegressionPipeline else: raise ValueError('task {} not available'.format(self.task_type)) self.predict_function = self._predict_regression diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 6568296d4..31692f596 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -213,7 +213,7 @@ def _check_and_get_default_budget(self) -> float: budget_choices_forecasting = {budget_type: 1.0 for budget_type in FORECASTING_BUDGET_TYPE} budget_choices.update(budget_choices_forecasting) - budget_type_choices = (*budget_type_choices, *FORECASTING_BUDGET_TYPE) + budget_type_choices = budget_type_choices + FORECASTING_BUDGET_TYPE # budget is 
defined by epochs by default budget_type = str(self.pipeline_config.get('budget_type', 'epochs')) diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 73e988584..5341c9d6b 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -265,26 +265,26 @@ def __init__(self, self.logger.info("initialised {}".format(self.__class__.__name__)) initial_configurations = [] - if portfolio_selection is not None: - initial_configurations = read_return_initial_configurations(config_space=config_space, - portfolio_selection=portfolio_selection) - - suggested_init_models: Optional[List[str]] = kwargs.get('suggested_init_models', None) - custom_init_setting_path: Optional[str] = kwargs.get('custom_init_setting_path', None) - - # if suggested_init_models is an empty list, and custom_init_setting_path is not provided, we - # do not provide any initial configurations - if suggested_init_models is None or suggested_init_models or custom_init_setting_path is not None: - initial_configurations = read_forecasting_init_configurations( - config_space=config_space, - suggested_init_models=suggested_init_models, - custom_init_setting_path=custom_init_setting_path) - - self.initial_configurations = initial_configurations \ - if len(initial_configurations) > 0 else None if self.time_series_forecasting: + suggested_init_models: Optional[List[str]] = kwargs.get('suggested_init_models', None) + custom_init_setting_path: Optional[str] = kwargs.get('custom_init_setting_path', None) + # if suggested_init_models is an empty list, and custom_init_setting_path is not provided, we + # do not provide any initial configurations + if suggested_init_models is None or suggested_init_models or custom_init_setting_path is not None: + initial_configurations = read_forecasting_init_configurations( + config_space=config_space, + suggested_init_models=suggested_init_models, + custom_init_setting_path=custom_init_setting_path) + # proxy-validation sets self.min_num_test_instances = kwargs.get('min_num_test_instances', None) + else: + if portfolio_selection is not None: + initial_configurations = read_return_initial_configurations(config_space=config_space, + portfolio_selection=portfolio_selection) + + self.initial_configurations = initial_configurations if len(initial_configurations) > 0 else None + def reset_data_manager(self) -> None: if self.datamanager is not None: diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py index 0b2094ad6..b9037a138 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py @@ -16,7 +16,7 @@ def get_preprocess_transforms(X: Dict[str, Any], preprocess_type: Union[Type[aPTPre], Type[aPTTPre]] = aPTPre) \ - -> torchvision.transforms.Compose: + -> List[Union[Type[aPTPre], Type[aPTTPre]]]: candidate_transforms: List[preprocess_type] = list() for key, value in X.items(): if isinstance(value, preprocess_type): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py index 87c1ccf70..4a9e360fe 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py @@ -189,8 +189,11 @@ def get_hyperparameter_search_space( @property def _defaults_network(self): - return ['ShapedMLPBackbone', - 'MLPBackbone'] + return 
[ + 'ShapedMLPBackbone', + 'MLPBackbone', + 'ConvNetImageBackbone', + ] def transform(self, X: np.ndarray) -> np.ndarray: assert self.choice is not None, "Cannot call transform before the object is initialized" diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index 8433f77a6..c415217da 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -1,5 +1,5 @@ from abc import abstractmethod -from typing import Any, Dict, Iterable, Optional, Tuple, List +from typing import Any, Dict, Iterable, Optional, Tuple import numpy as np @@ -10,15 +10,12 @@ import torch from torch import nn -import torchvision - from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.base_component import ( autoPyTorchComponent, ) from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.constants import TIMESERIES_FORECASTING, TASK_TYPES_TO_STRING class NetworkBackboneComponent(autoPyTorchComponent): diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index 7ae0dd894..66867847e 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -19,19 +19,13 @@ class _LearnedEntityEmbedding(nn.Module): """ Learned entity embedding module for categorical features""" - def __init__(self, - config: Dict[str, Any], - num_input_features: np.ndarray, - num_numerical_features: int): + def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_numerical_features: int): """ Args: config (Dict[str, Any]): The configuration sampled by the hyperparameter optimizer num_input_features (np.ndarray): column wise information of number of output columns after transformation for each categorical column and 0 for numerical columns num_numerical_features (int): number of numerical features in X - num_output_dimensions Optional[List[int]]: number of output dimensions, this is applied to quickly - construct a new Embedding network - ee_layers (Optional[nn.Module]) """ super().__init__() self.config = config diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 895eb2fa3..22d453805 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -71,5 +71,5 @@ def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: categories = X['dataset_properties']['categories'] for i, category in enumerate(categories): - num_input_features[num_numerical_columns + i,] = len(category) + num_input_features[num_numerical_columns + i, ] = len(category) return num_numerical_columns, num_input_features diff --git a/autoPyTorch/pipeline/components/setup/network_head/fully_connected.py b/autoPyTorch/pipeline/components/setup/network_head/fully_connected.py index 3a853648f..99762bbcf 100644 --- 
a/autoPyTorch/pipeline/components/setup/network_head/fully_connected.py +++ b/autoPyTorch/pipeline/components/setup/network_head/fully_connected.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional, Tuple, Union, List +from typing import Dict, Optional, Tuple, Union import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace diff --git a/autoPyTorch/pipeline/components/setup/network_initializer/SparseInit.py b/autoPyTorch/pipeline/components/setup/network_initializer/SparseInit.py index 4820c55db..4cd3dd72c 100644 --- a/autoPyTorch/pipeline/components/setup/network_initializer/SparseInit.py +++ b/autoPyTorch/pipeline/components/setup/network_initializer/SparseInit.py @@ -21,7 +21,10 @@ def weights_init(self) -> Callable: """ def initialization(m: torch.nn.Module) -> None: - if isinstance(m, torch.nn.Linear): + if isinstance(m, (torch.nn.Conv1d, + torch.nn.Conv2d, + torch.nn.Conv3d, + torch.nn.Linear)): torch.nn.init.sparse_(m.weight.data, 0.9) if m.bias is not None and self.bias_strategy == 'Zero': torch.nn.init.constant_(m.bias.data, 0.0) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_data_loader.py deleted file mode 100644 index 5ea83b8dd..000000000 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_data_loader.py +++ /dev/null @@ -1,60 +0,0 @@ -from typing import Any, Callable, Dict, List - -import torch - -import torchvision - -from autoPyTorch.pipeline.components.training.data_loader.base_data_loader import BaseDataLoaderComponent - - -class TimeSeriesDataLoader(BaseDataLoaderComponent): - """This class is an interface to the PyTorch Dataloader. - - Particularly, this data loader builds transformations for - tabular data. - - """ - - def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transforms.Compose: - """ - Method to build a transformation that can pre-process input data - - Args: - X (X: Dict[str, Any]): Dependencies needed by current component to perform fit - mode (str): train/val/test - - Returns: - A composition of transformations - """ - - if mode not in ['train', 'val', 'test']: - raise ValueError("Unsupported mode provided {}. ".format(mode)) - - # In the case of time series data, the options currently available - # for transformations are: - # + scaler - # This transformations apply for both train/val/test, so no - # distinction is performed - candidate_transformations = [] # type: List[Callable] - - if 'test' in mode or not X['dataset_properties']['is_small_preprocess']: - candidate_transformations.extend(X['preprocess_transforms']) - - # Transform to tensor - candidate_transformations.append(torch.from_numpy) - - return torchvision.transforms.Compose(candidate_transformations) - - def _check_transform_requirements(self, X: Dict[str, Any], y: Any = None) -> None: - """ - - Makes sure that the fit dictionary contains the required transformations - that the dataset should go through - - Args: - X (Dict[str, Any]): Dictionary with fitted parameters. 
It is a message passing - mechanism, in which during a transform, a components adds relevant information - so that further stages can be properly fitted - """ - if not X['dataset_properties']['is_small_preprocess'] and 'preprocess_transforms' not in X: - raise ValueError("Cannot find the preprocess_transforms in the fit dictionary") diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index c6cf50736..f896fa3cb 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -9,6 +9,12 @@ MSELoss: supports continuous output types L1Loss: supports continuous output types Default: MSELoss +Forecasting: + LogProbLoss: supports distribution output types + QuantileLoss: supports quantile output type + MAPELoss: supports continuous output types + MASELoss: supports continuous output types + L1Loss: supports continuous output types """ from typing import Any, Dict, Optional, Type, List @@ -119,7 +125,6 @@ def forward(self, else: return losses_all - losses = dict( classification=dict( CrossEntropyLoss=dict( @@ -147,7 +152,7 @@ def forward(self, default_losses: Dict[str, Type[Loss]] = dict(classification=CrossEntropyLoss, regression=MSELoss, - forecasting=LogProbLoss) + forecasting=MASELoss) LOSS_TYPES = ['regression', 'distribution'] diff --git a/autoPyTorch/pipeline/components/training/metrics/base.py b/autoPyTorch/pipeline/components/training/metrics/base.py index 157e9d4c5..3b102416e 100644 --- a/autoPyTorch/pipeline/components/training/metrics/base.py +++ b/autoPyTorch/pipeline/components/training/metrics/base.py @@ -37,9 +37,9 @@ def __repr__(self) -> str: return self.name -# This is a mixin for computing time series forecasting losses, the parameters are defined at: +# This is a mixin for computing time series forecasting losses, the parameters are defined by: # https://www.sktime.org/en/stable/api_reference/performance_metrics.html -# TODO considering adding more arguments to this function to allow more advanced loss function, e.g. asymmetric_error +# TODO considering adding more arguments to this function to allow advanced loss functions, e.g. 
asymmetric_error class ForecastingMetricMixin: def __call__(self, y_true: np.ndarray, diff --git a/autoPyTorch/pipeline/components/training/metrics/utils.py b/autoPyTorch/pipeline/components/training/metrics/utils.py index 0b1d07a66..62be56098 100644 --- a/autoPyTorch/pipeline/components/training/metrics/utils.py +++ b/autoPyTorch/pipeline/components/training/metrics/utils.py @@ -11,8 +11,12 @@ TASK_TYPES, ) from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric -from autoPyTorch.pipeline.components.training.metrics.metrics import CLASSIFICATION_METRICS, \ - REGRESSION_METRICS, FORECASTING_METRICS, MASE_LOSSES +from autoPyTorch.pipeline.components.training.metrics.metrics import ( + CLASSIFICATION_METRICS, + REGRESSION_METRICS, + FORECASTING_METRICS, + MASE_LOSSES +) def sanitize_array(array: np.ndarray) -> np.ndarray: diff --git a/autoPyTorch/utils/pipeline.py b/autoPyTorch/utils/pipeline.py index 24b01b2d5..59d747363 100644 --- a/autoPyTorch/utils/pipeline.py +++ b/autoPyTorch/utils/pipeline.py @@ -10,7 +10,6 @@ FORECASTING_TASKS, STRING_TO_TASK_TYPES, TABULAR_TASKS, - TIMESERIES_TASKS, ) from autoPyTorch.pipeline.image_classification import ImageClassificationPipeline from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline @@ -199,16 +198,11 @@ def _get_regression_configuration_space(info: Dict[str, Any], include: Dict[str, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ) -> ConfigurationSpace: if STRING_TO_TASK_TYPES[info['task_type']] in TABULAR_TASKS: - pipeline = TabularRegressionPipeline(dataset_properties=info, - include=include, - exclude=exclude, - search_space_updates=search_space_updates) - return pipeline.get_hyperparameter_search_space() - - elif STRING_TO_TASK_TYPES[info['task_type']] in TIMESERIES_TASKS: - pipeline = TimeSeriesRegressionPipeline(dataset_properties=info, - include=include, exclude=exclude, - search_space_updates=search_space_updates) + pipeline = TabularRegressionPipeline( + dataset_properties=info, + include=include, + exclude=exclude, + search_space_updates=search_space_updates) return pipeline.get_hyperparameter_search_space() else: @@ -225,12 +219,6 @@ def _get_classification_configuration_space(info: Dict[str, Any], include: Dict[ search_space_updates=search_space_updates) return pipeline.get_hyperparameter_search_space() - elif STRING_TO_TASK_TYPES[info['task_type']] in TIMESERIES_TASKS: - pipeline = TimeSeriesClassificationPipeline(dataset_properties=info, - include=include, exclude=exclude, - search_space_updates=search_space_updates) - return pipeline.get_hyperparameter_search_space() - elif STRING_TO_TASK_TYPES[info['task_type']] in IMAGE_TASKS: return ImageClassificationPipeline( dataset_properties=info, diff --git a/setup.py b/setup.py index be3441a37..e1e3d47e2 100755 --- a/setup.py +++ b/setup.py @@ -71,7 +71,6 @@ "jupyter", "notebook", "seaborn", - "sktime" ], "docs": ["sphinx", "sphinx-gallery", "sphinx_bootstrap_theme", "numpydoc"], }, diff --git a/test/conftest.py b/test/conftest.py index 7e4729fb8..fa066e202 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -27,7 +27,6 @@ from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.pipeline import get_dataset_requirements - N_SAMPLES = 300 @@ -41,19 +40,19 @@ def callattr_ahead_of_alltests(request): """ tasks_used = [ 146818, # Australian - 2295, # cholesterol - 2075, # abalone - 2071, # adult - 3, # kr-vs-kp - 9981, # cnae-9 + 2295, # 
cholesterol + 2075, # abalone + 2071, # adult + 3, # kr-vs-kp + 9981, # cnae-9 146821, # car 146822, # Segment - 2, # anneal - 53, # vehicle - 5136, # tecator - 4871, # sensory - 4857, # boston - 3916, # kc1 + 2, # anneal + 53, # vehicle + 5136, # tecator + 4871, # sensory + 4857, # boston + 3916, # kc1 ] # Populate the cache @@ -274,7 +273,14 @@ def get_tabular_data(task): return X, y, validator -def get_fit_dictionary(datamanager, backend): + +def get_fit_dictionary(X, y, validator, backend): + datamanager = TabularDataset( + X=X, Y=y, + validator=validator, + X_test=X, Y_test=y, + ) + info = datamanager.get_required_dataset_info() dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info)) @@ -301,15 +307,6 @@ def get_fit_dictionary(datamanager, backend): return fit_dictionary -def get_tabular_fit_dictionary(X, y, validator, backend): - datamanager = TabularDataset( - X=X, Y=y, - validator=validator, - X_test=X, Y_test=y, - ) - return get_fit_dictionary(datamanager, backend) - - @pytest.fixture def fit_dictionary_tabular_dummy(request, backend): if request.param == "classification": @@ -317,15 +314,14 @@ def fit_dictionary_tabular_dummy(request, backend): elif request.param == "regression": X, y, validator = get_tabular_data("regression_numerical_only") else: - raise ValueError(f"Unsupported indirect fixture {request.param}") - return get_tabular_fit_dictionary(X, y, validator, backend) - + raise ValueError("Unsupported indirect fixture {}".format(request.param)) + return get_fit_dictionary(X, y, validator, backend) @pytest.fixture def fit_dictionary_tabular(request, backend): X, y, validator = get_tabular_data(request.param) - return get_tabular_fit_dictionary(X, y, validator, backend) + return get_fit_dictionary(X, y, validator, backend) @pytest.fixture @@ -369,6 +365,10 @@ def dataset_traditional_classifier_num_categorical(): @pytest.fixture def search_space_updates(): updates = HyperparameterSearchSpaceUpdates() + updates.append(node_name="imputer", + hyperparameter="numerical_strategy", + value_range=("mean", "most_frequent"), + default_value="mean") updates.append(node_name="data_loader", hyperparameter="batch_size", value_range=[16, 512], @@ -377,16 +377,20 @@ def search_space_updates(): hyperparameter="CosineAnnealingLR:T_max", value_range=[50, 60], default_value=55) - updates.append(node_name="optimizer", - hyperparameter="AdamOptimizer:lr", - value_range=[0.0001, 0.001], - default_value=0.001) + updates.append(node_name='network_backbone', + hyperparameter='ResNetBackbone:dropout', + value_range=[0, 0.5], + default_value=0.2) return updates @pytest.fixture def error_search_space_updates(): updates = HyperparameterSearchSpaceUpdates() + updates.append(node_name="imputer", + hyperparameter="num_str", + value_range=("mean", "most_frequent"), + default_value="mean") updates.append(node_name="data_loader", hyperparameter="batch_size", value_range=[16, 512], diff --git a/test/test_pipeline/components/setup/test_setup.py b/test/test_pipeline/components/setup/test_setup.py index 3cbc0263b..9d66953b2 100644 --- a/test/test_pipeline/components/setup/test_setup.py +++ b/test/test_pipeline/components/setup/test_setup.py @@ -317,7 +317,7 @@ class TestNetworkBackbone: def test_all_backbones_available(self): backbone_choice = NetworkBackboneChoice(dataset_properties={}) - assert len(backbone_choice.get_components().keys()) == 9 + assert len(backbone_choice.get_components().keys()) == 8 @pytest.mark.parametrize('task_type_input_shape', 
[(constants.IMAGE_CLASSIFICATION, (3, 64, 64)), (constants.IMAGE_REGRESSION, (3, 64, 64)), @@ -370,7 +370,7 @@ def test_dummy_forward_backward_pass(self, task_type_input_shape): def test_every_backbone_is_valid(self): backbone_choice = NetworkBackboneChoice(dataset_properties={}) - assert len(backbone_choice.get_components().keys()) == 9 + assert len(backbone_choice.get_components().keys()) == 8 for name, backbone in backbone_choice.get_components().items(): config = backbone.get_hyperparameter_search_space().sample_configuration() diff --git a/test/test_pipeline/components/training/base.py b/test/test_pipeline/components/training/base.py index 4f86356e4..63ca438dc 100644 --- a/test/test_pipeline/components/training/base.py +++ b/test/test_pipeline/components/training/base.py @@ -14,17 +14,8 @@ REGRESSION_TASKS, TASK_TYPES_TO_STRING) -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import \ - TabularColumnTransformer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import EncoderChoice -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice as\ - TabularScalerChoice from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker -from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline - class BaseTraining: @@ -135,30 +126,3 @@ def train_model(self, # Backward pass loss.backward() optimizer.step() - - -class TabularPipeline(TabularClassificationPipeline): - def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], - ) -> List[Tuple[str, autoPyTorchChoice]]: - """ - Defines what steps a pipeline should follow. - The step itself has choices given via autoPyTorchChoice. - - Returns: - List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised - by the pipeline. 
- """ - steps = [] # type: List[Tuple[str, autoPyTorchChoice]] - - default_dataset_properties = {'target_type': 'tabular_classification'} - if dataset_properties is not None: - default_dataset_properties.update(dataset_properties) - - steps.extend([ - ("imputer", SimpleImputer()), - ("encoder", EncoderChoice(default_dataset_properties)), - ("scaler", TabularScalerChoice(default_dataset_properties)), - ("tabular_transformer", TabularColumnTransformer()), - ]) - return steps - From b269ff814a388c4dfbc25648fb375b3839150118 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 2 May 2022 14:39:34 +0200 Subject: [PATCH 235/347] recover timeseries --- autoPyTorch/constants.py | 1 + autoPyTorch/evaluation/abstract_evaluator.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py index 4e563eae4..4f66b3188 100644 --- a/autoPyTorch/constants.py +++ b/autoPyTorch/constants.py @@ -10,6 +10,7 @@ TABULAR_TASKS = [TABULAR_CLASSIFICATION, TABULAR_REGRESSION] IMAGE_TASKS = [IMAGE_CLASSIFICATION, IMAGE_REGRESSION] +TIMESERIES_TASKS =[FORECASTING_TASKS] TASK_TYPES = REGRESSION_TASKS + CLASSIFICATION_TASKS + FORECASTING_TASKS diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index d35613d66..617e6b0e0 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -671,7 +671,7 @@ def _init_fit_dictionary( elif self.budget_type == 'runtime': self.fit_dictionary['runtime'] = self.budget self.fit_dictionary.pop('epochs', None) - elif self.budget_type == 'resolution' and self.task_type in TIMESERIES_TASKS: + elif self.budget_type == 'resolution' and self.task_type in FORECASTING_TASKS: self.fit_dictionary['sample_interval'] = int(np.ceil(1.0 / self.budget)) self.fit_dictionary.pop('epochs', None) self.fit_dictionary.pop('runtime', None) From 31f9e432235d3e1fbc56780a6ac41eee413c4cd2 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 2 May 2022 15:30:18 +0200 Subject: [PATCH 236/347] maint --- autoPyTorch/constants.py | 2 +- .../training/data_loader/base_data_loader.py | 1 + .../components/training/trainer/base_trainer.py | 2 +- .../forecasting_trainer/forecasting_base_trainer.py | 10 ---------- test/test_pipeline/components/setup/test_setup.py | 4 ---- 5 files changed, 3 insertions(+), 16 deletions(-) diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py index 4f66b3188..5788ca4ce 100644 --- a/autoPyTorch/constants.py +++ b/autoPyTorch/constants.py @@ -10,7 +10,7 @@ TABULAR_TASKS = [TABULAR_CLASSIFICATION, TABULAR_REGRESSION] IMAGE_TASKS = [IMAGE_CLASSIFICATION, IMAGE_REGRESSION] -TIMESERIES_TASKS =[FORECASTING_TASKS] +TIMESERIES_TASKS = [FORECASTING_TASKS] TASK_TYPES = REGRESSION_TASKS + CLASSIFICATION_TASKS + FORECASTING_TASKS diff --git a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py index b1ee41bb2..dfa9f42a2 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py @@ -109,6 +109,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: train_dataset = datamanager.get_dataset(split_id=X['split_id'], train=True) + self.train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=min(self.batch_size, len(train_dataset)), diff --git 
a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py index a337a7f24..2cd86a26d 100644 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py @@ -345,7 +345,7 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, return loss_sum / N, {} def cast_targets(self, targets: torch.Tensor) -> torch.Tensor: - if self.task_type in REGRESSION_TASKS or FORECASTING_TASKS: + if self.task_type in (REGRESSION_TASKS + FORECASTING_TASKS): targets = targets.float().to(self.device) # make sure that targets will have same shape as outputs (really important for mse loss for example) if targets.ndim == 1: diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 321531afd..17549d7cb 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -120,16 +120,6 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, else: return loss_sum / N, {} - def cast_targets(self, targets: torch.Tensor) -> torch.Tensor: - if self.task_type in REGRESSION_TASKS or FORECASTING_TASKS: - targets = targets.float() - # make sure that targets will have same shape as outputs (really important for mse loss for example) - if targets.ndim == 1: - targets = targets.unsqueeze(1) - else: - targets = targets.long() - return targets - def train_step(self, data: Dict[str, torch.Tensor], future_targets: Dict[str, torch.Tensor]) \ -> Tuple[float, torch.Tensor]: """ diff --git a/test/test_pipeline/components/setup/test_setup.py b/test/test_pipeline/components/setup/test_setup.py index 9d66953b2..fcf88dc62 100644 --- a/test/test_pipeline/components/setup/test_setup.py +++ b/test/test_pipeline/components/setup/test_setup.py @@ -321,8 +321,6 @@ def test_all_backbones_available(self): @pytest.mark.parametrize('task_type_input_shape', [(constants.IMAGE_CLASSIFICATION, (3, 64, 64)), (constants.IMAGE_REGRESSION, (3, 64, 64)), - (constants.TIMESERIES_CLASSIFICATION, (32, 6)), - (constants.TIMESERIES_REGRESSION, (32, 6)), (constants.TABULAR_CLASSIFICATION, (100,)), (constants.TABULAR_REGRESSION, (100,))]) def test_dummy_forward_backward_pass(self, task_type_input_shape): @@ -506,8 +504,6 @@ def test_all_heads_available(self): @pytest.mark.parametrize('task_type_input_output_shape', [(constants.IMAGE_CLASSIFICATION, (3, 64, 64), (5,)), (constants.IMAGE_REGRESSION, (3, 64, 64), (1,)), - (constants.TIMESERIES_CLASSIFICATION, (32, 6), (5,)), - (constants.TIMESERIES_REGRESSION, (32, 6), (1,)), (constants.TABULAR_CLASSIFICATION, (100,), (5,)), (constants.TABULAR_REGRESSION, (100,), (1,))]) def test_dummy_forward_backward_pass(self, task_type_input_output_shape): From 67ea836255ef11150e7c72328443bc4d1b224a4f Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 2 May 2022 23:41:22 +0200 Subject: [PATCH 237/347] maint --- autoPyTorch/constants.py | 2 +- autoPyTorch/ensemble/ensemble_builder.py | 2 +- test/test_api/test_api.py | 2 +- .../test_datasets/test_time_series_dataset.py | 38 ------------------- .../components/setup/test_setup.py | 8 ++-- 5 files changed, 8 insertions(+), 44 deletions(-) delete mode 100644 
test/test_datasets/test_time_series_dataset.py diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py index 5788ca4ce..4196149b0 100644 --- a/autoPyTorch/constants.py +++ b/autoPyTorch/constants.py @@ -10,7 +10,7 @@ TABULAR_TASKS = [TABULAR_CLASSIFICATION, TABULAR_REGRESSION] IMAGE_TASKS = [IMAGE_CLASSIFICATION, IMAGE_REGRESSION] -TIMESERIES_TASKS = [FORECASTING_TASKS] +TIMESERIES_TASKS = [TIMESERIES_FORECASTING] TASK_TYPES = REGRESSION_TASKS + CLASSIFICATION_TASKS + FORECASTING_TASKS diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py index cc097d42f..83a488e0d 100644 --- a/autoPyTorch/ensemble/ensemble_builder.py +++ b/autoPyTorch/ensemble/ensemble_builder.py @@ -66,7 +66,7 @@ def __init__( random_state: int, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT, pynisher_context: str = 'fork', - metrics_kwargs: Optional[Dict] = None, + metrics_kwargs: Dict = {}, ): """ SMAC callback to handle ensemble building Args: diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 4346ff2b6..22c0618e5 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -649,7 +649,7 @@ def test_build_pipeline(api_type, fit_dictionary_tabular): @pytest.mark.parametrize("disable_file_output", [['all'], None]) @pytest.mark.parametrize('openml_id', (40984,)) @pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', - ((HoldoutValTypes.holdout_validation, {'val_share': 0.8}), + (#(HoldoutValTypes.holdout_validation, {'val_share': 0.8}), (CrossValTypes.k_fold_cross_validation, {'num_splits': 2}), (NoResamplingStrategyTypes.no_resampling, {}) ) diff --git a/test/test_datasets/test_time_series_dataset.py b/test/test_datasets/test_time_series_dataset.py deleted file mode 100644 index c61cc2c76..000000000 --- a/test/test_datasets/test_time_series_dataset.py +++ /dev/null @@ -1,38 +0,0 @@ -import pytest - -from autoPyTorch.utils.pipeline import get_dataset_requirements - - -@pytest.mark.parametrize("fit_dictionary_time_series", ['classification_numerical_only'], indirect=True) -def test_get_dataset_properties(backend, fit_dictionary_time_series): - # The fixture creates a datamanager by itself - datamanager = backend.load_datamanager() - - info = {'task_type': datamanager.task_type, - 'output_type': datamanager.output_type, - 'issparse': datamanager.issparse, - 'numerical_features': datamanager.numerical_features, - 'categorical_features': datamanager.categorical_features} - dataset_requirements = get_dataset_requirements(info) - - dataset_properties = datamanager.get_dataset_properties(dataset_requirements) - for expected in [ - 'categorical_features', - 'numerical_features', - 'issparse', - 'is_small_preprocess', - 'task_type', - 'output_type', - 'input_shape', - 'output_shape' - ]: - assert expected in dataset_properties - - assert isinstance(dataset_properties, dict) - for dataset_requirement in dataset_requirements: - assert dataset_requirement.name in dataset_properties.keys() - assert isinstance(dataset_properties[dataset_requirement.name], dataset_requirement.supported_types) - - assert datamanager.train_tensors[0].shape == fit_dictionary_time_series['X_train'].shape - assert datamanager.train_tensors[1].shape == fit_dictionary_time_series['y_train'].shape - assert datamanager.task_type == 'time_series_classification' diff --git a/test/test_pipeline/components/setup/test_setup.py b/test/test_pipeline/components/setup/test_setup.py index fcf88dc62..e4b8deeb4 100644 --- 
a/test/test_pipeline/components/setup/test_setup.py +++ b/test/test_pipeline/components/setup/test_setup.py @@ -317,7 +317,7 @@ class TestNetworkBackbone: def test_all_backbones_available(self): backbone_choice = NetworkBackboneChoice(dataset_properties={}) - assert len(backbone_choice.get_components().keys()) == 8 + assert len(backbone_choice.get_components().keys()) == 6 @pytest.mark.parametrize('task_type_input_shape', [(constants.IMAGE_CLASSIFICATION, (3, 64, 64)), (constants.IMAGE_REGRESSION, (3, 64, 64)), @@ -367,8 +367,7 @@ def test_dummy_forward_backward_pass(self, task_type_input_shape): def test_every_backbone_is_valid(self): backbone_choice = NetworkBackboneChoice(dataset_properties={}) - - assert len(backbone_choice.get_components().keys()) == 8 + assert len(backbone_choice.get_components().keys()) == 6 for name, backbone in backbone_choice.get_components().items(): config = backbone.get_hyperparameter_search_space().sample_configuration() @@ -399,6 +398,9 @@ def test_get_set_config_space(self): """ network_backbone_choice = NetworkBackboneChoice(dataset_properties={}) for task_type in constants.TASK_TYPES: + if task_type in constants.FORECASTING_TASKS: + # Forecasting task has individual backbones + continue dataset_properties = {"task_type": constants.TASK_TYPES_TO_STRING[task_type]} cs = network_backbone_choice.get_hyperparameter_search_space(dataset_properties) From 80b8ac2ebcd34c48472eff98b7502c2c50c2816d Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 2 May 2022 23:41:59 +0200 Subject: [PATCH 238/347] limit memory usage tae --- autoPyTorch/evaluation/tae.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 31692f596..d7366cac1 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -317,8 +317,7 @@ def run( logger=logger, # Pynisher expects seconds as a time indicator wall_time_in_s=int(cutoff) if cutoff is not None else None, - # TODO Figure out how pynisher influences GPU memory usage here - # mem_in_mb=self.memory_limit, + mem_in_mb=self.memory_limit, capture_output=True, context=context, ) From d01e2a72e6e1d00887f2e77df3fdd408f74830fe Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 2 May 2022 23:44:21 +0200 Subject: [PATCH 239/347] revert test api --- test/test_api/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 22c0618e5..4346ff2b6 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -649,7 +649,7 @@ def test_build_pipeline(api_type, fit_dictionary_tabular): @pytest.mark.parametrize("disable_file_output", [['all'], None]) @pytest.mark.parametrize('openml_id', (40984,)) @pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', - (#(HoldoutValTypes.holdout_validation, {'val_share': 0.8}), + ((HoldoutValTypes.holdout_validation, {'val_share': 0.8}), (CrossValTypes.k_fold_cross_validation, {'num_splits': 2}), (NoResamplingStrategyTypes.no_resampling, {}) ) From 3be7be9b484885e70d08b2833c5511d584776499 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 3 May 2022 22:04:24 +0200 Subject: [PATCH 240/347] test for targets --- autoPyTorch/data/base_target_validator.py | 5 + autoPyTorch/data/tabular_target_validator.py | 67 ++++--- .../data/time_series_feature_validator.py | 11 +- .../data/time_series_forecasting_validator.py | 16 +- .../data/time_series_target_validator.py | 176 +++--------------- test/conftest.py | 144 
++++++++++++++ .../test_forecasting_target_validator.py | 59 ++++++ 7 files changed, 296 insertions(+), 182 deletions(-) create mode 100644 test/test_data/test_forecasting_target_validator.py diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py index 530675fbd..7de67316b 100644 --- a/autoPyTorch/data/base_target_validator.py +++ b/autoPyTorch/data/base_target_validator.py @@ -183,3 +183,8 @@ def is_single_column_target(self) -> bool: Output is encoded with a single column encoding """ return self.out_dimensionality == 1 + + @property + def allow_missing_values(self) -> bool: + return False + diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py index 22cabb999..e19837707 100644 --- a/autoPyTorch/data/tabular_target_validator.py +++ b/autoPyTorch/data/tabular_target_validator.py @@ -15,19 +15,25 @@ from autoPyTorch.data.base_target_validator import BaseTargetValidator, SupportedTargetTypes from autoPyTorch.utils.common import ispandas - +import numpy.ma as ma ArrayType = Union[np.ndarray, spmatrix] -def _check_and_to_array(y: SupportedTargetTypes) -> ArrayType: +def _check_and_to_array(y: SupportedTargetTypes, allow_nan=False) -> ArrayType: """ sklearn check array will make sure we have the correct numerical features for the array """ - return sklearn.utils.check_array(y, force_all_finite=True, accept_sparse='csr', ensure_2d=False) + if allow_nan: + return sklearn.utils.check_array(y, force_all_finite=False, accept_sparse='csr', ensure_2d=False) + else: + return sklearn.utils.check_array(y, force_all_finite=True, accept_sparse='csr', ensure_2d=False) -def _modify_regression_target(y: ArrayType) -> ArrayType: +def _modify_regression_target(y: ArrayType, allow_nan=False) -> ArrayType: # Regression targets must have numbers after a decimal point. # Ref: https://github.com/scikit-learn/scikit-learn/issues/8952 + if allow_nan: + y = ma.masked_where(np.isnan(y), y, 1e12) + y_min = np.abs(y).min() offset = max(y_min, 1e-13) * 1e-13 # Sufficiently small number if y_min > 1e12: @@ -41,15 +47,16 @@ def _modify_regression_target(y: ArrayType) -> ArrayType: y = y.astype(dtype=np.float64) + offset else: y.data = y.data.astype(dtype=np.float64) + offset - + if allow_nan: + return y.data return y class TabularTargetValidator(BaseTargetValidator): def _fit( - self, - y_train: SupportedTargetTypes, - y_test: Optional[SupportedTargetTypes] = None, + self, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ If dealing with classification, this utility encodes the targets. @@ -89,9 +96,9 @@ def _fit( raise ValueError("Multi-dimensional classification is not yet supported. " "Encoding multidimensional data converts multiple columns " "to a 1 dimensional encoding. 
Data involved = {}/{}".format( - np.shape(y_train), - self.type_of_target - )) + np.shape(y_train), + self.type_of_target + )) # Mypy redefinition assert self.encoder is not None @@ -114,8 +121,8 @@ def _fit( if is_numeric_dtype(y_train.dtype): self.dtype = y_train.dtype elif ( - hasattr(y_train, 'dtypes') - and is_numeric_dtype(cast(pd.DataFrame, y_train).dtypes[0]) + hasattr(y_train, 'dtypes') + and is_numeric_dtype(cast(pd.DataFrame, y_train).dtypes[0]) ): # This case is for pandas array with a single column y_train = cast(pd.DataFrame, y_train) @@ -125,7 +132,7 @@ def _fit( def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray: if self.encoder is None: - return _check_and_to_array(y) + return _check_and_to_array(y, self.allow_missing_values) # remove ravel warning from pandas Series shape = np.shape(y) @@ -139,7 +146,7 @@ def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray: else: y = self.encoder.transform(np.array(y).reshape(-1, 1)).reshape(-1) - return _check_and_to_array(y) + return _check_and_to_array(y, self.allow_missing_values) def transform(self, y: SupportedTargetTypes) -> np.ndarray: """ @@ -166,8 +173,13 @@ def transform(self, y: SupportedTargetTypes) -> np.ndarray: if y.ndim == 2 and y.shape[1] == 1: y = np.ravel(y) - if not self.is_classification and "continuous" not in type_of_target(y): - y = _modify_regression_target(y) + if self.allow_missing_values: + func_fill_na = np.nan_to_num + else: + func_fill_na = lambda x: x + + if not self.is_classification and "continuous" not in type_of_target(func_fill_na(y)): + y = _modify_regression_target(y, self.allow_missing_values) return y @@ -213,15 +225,14 @@ def _check_data(self, y: SupportedTargetTypes) -> None: y (SupportedTargetTypes): A set of features whose dimensionality and data type is going to be checked """ - if not isinstance(y, (np.ndarray, pd.DataFrame, List, pd.Series)) \ and not issparse(y): # type: ignore[misc] raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames," " pd.Series, sparse data and Python Lists as targets, yet, " "the provided input is of type {}".format( - type(y) - )) + type(y) + )) # Sparse data muss be numerical # Type ignore on attribute because sparse targets have a dtype @@ -245,17 +256,25 @@ def _check_data(self, y: SupportedTargetTypes) -> None: # No Nan is supported has_nan_values = False + sparse_has_nan = False if ispandas(y): has_nan_values = cast(pd.DataFrame, y).isnull().values.any() + if has_nan_values and self.allow_missing_values: + # if missing value is allowed, we simply fill the missing values to pass 'type_of_target' + y = cast(pd.DataFrame, y).fillna(method='pad') if issparse(y): y = cast(spmatrix, y) has_nan_values = not np.array_equal(y.data, y.data) + if has_nan_values and self.allow_missing_values: + sparse_has_nan = True else: # List and array like values are considered here # np.isnan cannot work on strings, so we have to check for every element # but NaN, are not equal to themselves: has_nan_values = not np.array_equal(y, y) - if has_nan_values: + if has_nan_values and self.allow_missing_values: + y = np.nan_to_num(y) + if sparse_has_nan or has_nan_values and not self.allow_missing_values: raise ValueError("Target values cannot contain missing/NaN values. " "This is not supported by scikit-learn. " ) @@ -281,6 +300,6 @@ def _check_data(self, y: SupportedTargetTypes) -> None: if self.type_of_target not in supported_output_types: raise ValueError("Provided targets are not supported by AutoPyTorch. 
" "Provided type is {} whereas supported types are {}.".format( - self.type_of_target, - supported_output_types - )) + self.type_of_target, + supported_output_types + )) diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index 07f79e2f2..d77077469 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -16,6 +16,7 @@ def __init__( super().__init__(logger) self.only_contain_series_idx = False self.static_features = () + self.series_idx: Optional[Union[Tuple[Union[str, int]]]] = None def get_reordered_columns(self): return self.transformed_columns + list(set(self.column_order) - set(self.transformed_columns)) @@ -42,6 +43,7 @@ def fit(self, The fitted base estimator """ if series_idx is not None: + self.series_idx = series_idx # remove series idx as they are not part of features if isinstance(X_train, pd.DataFrame): for series_id in series_idx: @@ -71,7 +73,6 @@ def fit(self, X_train = pd.DataFrame(X_train, index=[0] * len(X_train)) static_features: pd.Series = (X_train.groupby(X_train.index).nunique() <= 1).all() self.static_features = tuple(idx for idx in static_features.index if static_features[idx]) - self.get_reordered_columns() return self def transform( @@ -79,6 +80,14 @@ def transform( X: SupportedFeatTypes, index: Optional[Union[pd.Index, np.ndarray]] = None, ) -> Union[pd.DataFrame]: + if self.series_idx is not None: + if isinstance(X, pd.DataFrame): + if self.only_contain_series_idx: + return None + X = X.drop(self.series_idx, axis=1) + else: + raise NotImplementedError(f"series idx only works with pandas.DataFrame but the type of " + f"X_train is {type(X)} ") X = super(TimeSeriesFeatureValidator, self).transform(X) if index is None: index = np.array([0.] * len(X)) diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index d66e9c4c3..99b8c3ebf 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -50,16 +50,22 @@ def fit( freq: str = '1Y', n_prediction_steps: int = 1, known_future_features: Optional[List[Union[int, str]]] = None, - use_time_features: bool = False ) -> BaseEstimator: """ fit the validator with the training data, (optionally) start times and other information Args: X_train (Optional[Union[List, pd.DataFrame]]): training features, could be None for "pure" forecasting tasks y_train (Union[List, pd.DataFrame]), training targets - series_idx (Optional[Union[List[Union[str, int]], str, int]]): which columns of the data are considered to - identify the - + series_idx (Optional[Union[List[Union[str, int]], str, int]]): which columns of features are applied to + identify the series + X_test (Optional[Union[List, pd.DataFrame]]): test features. 
For forecasting tasks, test features indicates + known future features after the forecasting timestep\ + y_test (Optional[Union[List, pd.DataFrame]]): target in the future + start_times (Optional[List[pd.DatetimeIndex]]): start times on which the first element of each series is + sampled + freq (str): the frequency that the data is sampled + n_prediction_steps (int): number of prediction steps (forecast horizon) + known_future_features (Optional[List[Union[int, str]]]): which features are known even in the future """ if isinstance(series_idx, (str, int)): @@ -111,8 +117,6 @@ def fit( self._is_fitted = True - # In this case we don't assign series index to the data, we manually assigne - self.check_input_shapes(X_train, y_train, is_training=True) if X_test is not None: diff --git a/autoPyTorch/data/time_series_target_validator.py b/autoPyTorch/data/time_series_target_validator.py index ad61ae19d..08d9d1359 100644 --- a/autoPyTorch/data/time_series_target_validator.py +++ b/autoPyTorch/data/time_series_target_validator.py @@ -1,50 +1,30 @@ -from typing import List, Optional, Union, cast - -import numpy as np +import logging +from typing import Optional, Union import pandas as pd +import numpy as np +from scipy.sparse import issparse -from scipy.sparse import issparse, spmatrix - -import sklearn.utils -from sklearn.exceptions import NotFittedError -from sklearn.utils.multiclass import type_of_target - +from autoPyTorch.utils.logging_ import PicklableClientLogger from autoPyTorch.data.base_target_validator import SupportedTargetTypes -from autoPyTorch.utils.common import ispandas from autoPyTorch.data.tabular_target_validator import TabularTargetValidator, ArrayType -def _check_and_to_array(y: SupportedTargetTypes) -> ArrayType: - """ sklearn check array will make sure we have the correct numerical features for the array """ - return sklearn.utils.check_array(y, force_all_finite=False, accept_sparse='csr', ensure_2d=False) - - -def _modify_regression_target(y: ArrayType) -> ArrayType: - # Regression targets must have numbers after a decimal point. - # Ref: https://github.com/scikit-learn/scikit-learn/issues/8952 - y_min = np.abs(np.nan_to_num(y, 1e12)).min() - offset = max(y_min, 1e-13) * 1e-13 # Sufficiently small number - if y_min > 1e12: - raise ValueError( - "The minimum value for the target labels of regression tasks must be smaller than " - f"1e12 to avoid errors caused by an overflow, but got {y_min}" - ) - - # Since it is all integer, we can just add a random small number - if isinstance(y, np.ndarray): - y = y.astype(dtype=np.float64) + offset - else: - y.data = y.data.astype(dtype=np.float64) + offset - - return y - - class TimeSeriesTargetValidator(TabularTargetValidator): + def __init__(self, + is_classification: bool = False, + logger: Optional[ + Union[PicklableClientLogger, logging.Logger] + ] = None, + ): + if is_classification: + raise NotImplementedError("Classification is currently not supported for forecasting tasks!") + super().__init__(is_classification, logger) + def transform(self, y: SupportedTargetTypes, index: Optional[Union[pd.Index, np.ndarray]] = None, - ) ->pd.DataFrame: + ) -> pd.DataFrame: """ Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. 
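Based on the signature documented above (and the tests added later in this series), a typical multi-series call passes one entry per series plus the matching start times; a usage sketch with made-up toy data:

import numpy as np
import pandas as pd
from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator

# two series of different lengths, each with one exogenous feature
X_train = [pd.DataFrame({'feat': np.arange(20.)}), pd.DataFrame({'feat': np.arange(30.)})]
y_train = [np.sin(np.arange(20.)), np.sin(np.arange(30.))]
start_times = [pd.Timestamp('2020-01-01'), pd.Timestamp('2021-06-01')]

validator = TimeSeriesForecastingInputValidator(is_classification=False)
validator.fit(X_train, y_train, start_times=start_times)

X_t, y_t, seq_lengths = validator.transform(X_train, y_train)
print(seq_lengths)           # [20, 30]
print(y_t.index.unique())    # one index value per series in the flattened frame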
@@ -60,125 +40,19 @@ def transform(self, pd.DataFrame: The transformed array """ - if not self._is_fitted: - raise NotFittedError("Cannot call transform on a validator that is not fitted") - - # Check the data here so we catch problems on new test data - self._check_data(y) - y = self._transform_by_encoder(y) - - # When translating a dataframe to numpy, make sure we honor the ravel requirement - if y.ndim == 2 and y.shape[1] == 1: - y = np.ravel(y) - - if not self.is_classification and "continuous" not in type_of_target(np.nan_to_num(y)): - y = _modify_regression_target(y) + y: ArrayType = super().transform(y) if index is None: - index = np.array([0.] * len(y)) + index = np.array([0] * y.shape[0]) if y.ndim == 1: y = np.expand_dims(y, -1) - - y: pd.DataFrame = pd.DataFrame(y) + if isinstance(y, np.ndarray): + y: pd.DataFrame = pd.DataFrame(y) + else: + y: pd.DataFrame = pd.DataFrame.sparse.from_spmatrix(y) y.index = index - return y - - def _check_data(self, y: SupportedTargetTypes) -> None: - """ - Perform dimensionality and data type checks on the targets, This is nearly the same as - TabularTargetValidator._check_data, however, we allow NAN values in target - - Args: - y (SupportedTargetTypes): - A set of features whose dimensionality and data type is going to be checked - """ - if not isinstance(y, (np.ndarray, pd.DataFrame, - List, pd.Series)) \ - and not issparse(y): # type: ignore[misc] - raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames," - " pd.Series, sparse data and Python Lists as targets, yet, " - "the provided input is of type {}".format( - type(y) - )) - - # Sparse data muss be numerical - # Type ignore on attribute because sparse targets have a dtype - if issparse(y) and not np.issubdtype(y.dtype.type, # type: ignore[union-attr] - np.number): - raise ValueError("When providing a sparse matrix as targets, the only supported " - "values are numerical. Please consider using a dense" - " instead." - ) - - if self.data_type is None: - self.data_type = type(y) - if self.data_type != type(y): - self.logger.warning("AutoPyTorch previously received targets of type %s " - "yet the current features have type %s. Changing the dtype " - "of inputs to an estimator might cause problems" % ( - str(self.data_type), - str(type(y)), - ), - ) - if ispandas(y): - has_nan_values = cast(pd.DataFrame, y).isnull().values.any() - if has_nan_values: - y = cast(pd.DataFrame, y).fillna(method='pad') - if issparse(y): - y = cast(spmatrix, y) - has_nan_values = not np.array_equal(y.data, y.data) - if has_nan_values: - type_y = type(y) - y = type_y(np.nan_to_num(y.todense())) - else: - # List and array like values are considered here - # np.isnan cannot work on strings, so we have to check for every element - # but NaN, are not equal to themselves: - has_nan_values = not np.array_equal(y, y) - if has_nan_values: - y = np.nan_to_num(y) - - # Pandas Series is not supported for multi-label indicator - # This format checks are done by type of target - try: - self.type_of_target = type_of_target(y) - except Exception as e: - raise ValueError("The provided data could not be interpreted by AutoPyTorch. 
" - "While determining the type of the targets via type_of_target " - "run into exception: {}.".format(e)) - - supported_output_types = ('binary', - 'continuous', - 'continuous-multioutput', - 'multiclass', - 'multilabel-indicator', - # Notice unknown/multiclass-multioutput are not supported - # This can only happen during testing only as estimators - # should filter out unsupported types. - ) - if self.type_of_target not in supported_output_types: - raise ValueError("Provided targets are not supported by AutoPyTorch. " - "Provided type is {} whereas supported types are {}.".format( - self.type_of_target, - supported_output_types - )) - - def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray: - if self.encoder is None: - return _check_and_to_array(y) - - # remove ravel warning from pandas Series - shape = np.shape(y) - if len(shape) > 1: - y = self.encoder.transform(y) - elif ispandas(y): - # The Ordinal encoder expects a 2 dimensional input. - # The targets are 1 dimensional, so reshape to match the expected shape - y = cast(pd.DataFrame, y) - y = self.encoder.transform(y.to_numpy().reshape(-1, 1)).reshape(-1) - else: - y = self.encoder.transform(np.array(y).reshape(-1, 1)).reshape(-1) - - return _check_and_to_array(y) \ No newline at end of file + @property + def allow_missing_values(self) -> bool: + return True diff --git a/test/conftest.py b/test/conftest.py index fa066e202..2f3a0d9e2 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -12,18 +12,22 @@ import openml import pandas as pd +import datetime import pytest from scipy import sparse from sklearn.datasets import fetch_openml, make_classification, make_regression +from sklearn.utils import check_random_state import torch from autoPyTorch.automl_common.common.utils.backend import create from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.datasets.tabular_dataset import TabularDataset +from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.pipeline import get_dataset_requirements @@ -614,3 +618,143 @@ def input_data_featuretest(request): return X else: ValueError("Unsupported indirect fixture {}".format(request.param)) + + +# Forecasting tasks +def get_forecasting_data(uni_variant, with_missing_value=False, type_X='pd', with_series_id=False): + generator = check_random_state(0) + n_seq = 10 + base_length = 50 + targets = [] + + features = [] + + start_times = [] + # the first character indicates the type of the feature: + # n: numerical, c: categorical, s: static + # for categorical features, the following character indicate how the feature is stored: + # s: stored as string; n: stored as + if type_X == 'pd': + feature_columns = ['n1', 'cs2_10', 'f3', 'cn4_5', 's5'] + else: + feature_columns = ['n1', 'cn2_5', 'f3', 'cn4_5', 's5'] + + def generate_forecasting_features(feature_type, length): + feature_type_content = list(feature_type) + if feature_type_content[0] == 'n': + # numerical features + return generator.rand(length) + elif feature_type_content[0] == 'c': + num_class = int(feature_type.split("_")) + if feature_type_content[1] == 's': + return generator.choice([f'value_{feature_id}' for feature_id in range(num_class)], + size=length, replace=True) + elif feature_type_content[1] == 'n': + return 
generator.choice(list(range(num_class)), size=length, replace=True) + else: + raise NotImplementedError + else: + raise NotImplementedError + + for i in range(n_seq): + new_seq = np.arange(i * 1000, base_length + i * 1010) + series_length = base_length + i * 10 + + targets.append(np.arange(i * 1000, series_length + i * 1000)) + if not uni_variant: + if type_X == 'np': + features = np.asarray([generate_forecasting_features(col, series_length) for col in feature_columns]) + elif type_X == 'pd': + features = {col: generate_forecasting_features(col, series_length) for col in feature_columns} + if with_series_id: + features["series_id"] = [i] * series_length + features = pd.DataFrame( + features + ) + else: + raise NotImplementedError + features.append(features) + + if with_missing_value: + new_seq[5] = np.NAN + new_seq[-5] = np.NAN + + start_time = datetime.strptime(f'1900{i // 5}-01-01 00-00-00', '%Y-%m-%d %H-%M-%S') + start_times.append(start_time) + input_validator = TimeSeriesForecastingInputValidator(is_classification=False) + features = features if features else None + return features, targets, input_validator.fit(features, targets, start_times=start_times), feature_columns + + +def get_forecasting_fit_dictionary(X, y, validator, backend, budget_type='epochs', forecast_horizon=5, freq='1D'): + datamanager = TimeSeriesForecastingDataset( + X=X, Y=y, + validator=validator, + freq=freq, + n_prediction_steps=forecast_horizon, + ) + + info = datamanager.get_required_dataset_info() + + dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info)) + + fit_dictionary = { + 'X_train': datamanager.train_tensors[0], + 'y_train': datamanager.train_tensors[1], + 'dataset_properties': dataset_properties, + # Training configuration + 'num_run': 1, + 'working_dir': './tmp/example_ensemble_1', # Hopefully generated by backend + 'device': 'cpu', + 'torch_num_threads': 1, + 'early_stopping': 10, + 'use_tensorboard_logger': False, + 'use_pynisher': False, + 'metrics_during_training': False, + 'seed': 1, + 'budget_type': 'epochs', + 'epochs': 5, + 'split_id': 0, + 'backend': backend, + 'logger_port': logging.handlers.DEFAULT_TCP_LOGGING_PORT, + } + if budget_type == 'epochs': + fit_dictionary.update({'budget_type': 'epochs', + 'epochs': 5}) + elif budget_type == 'resolution': + fit_dictionary.update({'budget_type': 'resolution', + 'sample_interval': 10}) + elif budget_type == 'num_sample_per_seq': + fit_dictionary.update({'budget_type': 'num_samples', + 'fraction_samples_per_seq': 0.1}) + elif budget_type == 'num_seq': + fit_dictionary.update({'budget_type': 'num_samples', + 'fraction_seq': 0.1}) + else: + raise NotImplementedError + backend.save_datamanager(datamanager) + return fit_dictionary + + +@pytest.fixture +def fit_dictionary_uni_variant_wo_missing(): + x, y, validator = get_forecasting_data(uni_variant=True, with_missing_value=False) + return get_forecasting_fit_dictionary(x, y, validator) + + +@pytest.fixture +def fit_dictionary_uni_variant_w_missing(): + x, y, validator = get_forecasting_data(uni_variant=True, with_missing_value=True) + return get_forecasting_fit_dictionary(x, y, validator) + + +@pytest.fixture +def fit_dictionary_uni_variant_wo_missing(): + x, y, validator = get_forecasting_data(uni_variant=False, with_missing_value=False) + return get_forecasting_fit_dictionary(x, y, validator) + + +# Fixtures for forecasting validators. 
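The fixture above encodes feature type and cardinality in the column name (e.g. 'cs2_10' is a categorical feature stored as strings with 10 classes). Note that `feature_type.split("_")` returns a list, so extracting the class count presumably needs an index such as `split("_")[-1]`; a self-contained sketch of that parsing (helper name is hypothetical):

import numpy as np
from sklearn.utils import check_random_state

def sample_feature(feature_type: str, length: int, seed: int = 0) -> np.ndarray:
    """Sample a toy feature column from a name such as 'n1', 'cs2_10' or 'cn4_5'."""
    generator = check_random_state(seed)
    if feature_type[0] == 'n':
        return generator.rand(length)                    # numerical feature
    if feature_type[0] == 'c':
        num_class = int(feature_type.split('_')[-1])     # '..._10' -> 10 classes
        if feature_type[1] == 's':                       # categorical, stored as strings
            values = [f'value_{i}' for i in range(num_class)]
        else:                                            # categorical, stored as integers
            values = list(range(num_class))
        return generator.choice(values, size=length, replace=True)
    raise NotImplementedError(feature_type)

print(sample_feature('cs2_10', 5))
print(sample_feature('n1', 3))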
+@pytest.fixture +def input_data_forecasting_featuretest(request): + return [input_data_featuretest(request) for _ in range(3)] diff --git a/test/test_data/test_forecasting_target_validator.py b/test/test_data/test_forecasting_target_validator.py new file mode 100644 index 000000000..d07eeabf9 --- /dev/null +++ b/test/test_data/test_forecasting_target_validator.py @@ -0,0 +1,59 @@ +import numpy as np + +import pandas as pd +from pandas.api.types import is_numeric_dtype + +import pytest + +from scipy import sparse + +import sklearn.datasets +import sklearn.model_selection +from sklearn.utils.multiclass import type_of_target + +from autoPyTorch.data.tabular_target_validator import TabularTargetValidator +from autoPyTorch.data.time_series_target_validator import TimeSeriesTargetValidator + + +def test_forecasting_target_transform(): + validator = TimeSeriesTargetValidator(is_classification=False) + series_length = 10 + y = np.ones(series_length) + validator.fit(y) + y_transformed_0 = validator.transform(y) + assert isinstance(y_transformed_0, pd.DataFrame) + assert np.all(y_transformed_0.index.values == np.zeros(series_length, dtype=np.int64)) + + index_1 = np.full(series_length, 1) + y_transformed_1 = validator.transform(y, index_1) + assert np.all(y_transformed_1.index.values == index_1) + + index_2 = pd.Index([f"a{i}" for i in range(series_length)]) + y_transformed_2 = validator.transform(y, index_2) + assert np.all(y_transformed_2.index.values == index_2) + + index_3 = [('a', 'a')] * (series_length // 3) + \ + [('a', 'b')] * (series_length // 3) + \ + [('b', 'a')] * (series_length - series_length // 3 * 2) + index_3 = pd.MultiIndex.from_tuples(index_3) + y_transformed_3 = validator.transform(y, index_3) + assert isinstance(y_transformed_3.index, pd.MultiIndex) + assert np.all(y_transformed_3.index == index_3) + + +def test_forecasting_target_missing_values(): + """ + Makes sure we raise a proper message to the user, + when providing not supported data input + """ + validator1 = TimeSeriesTargetValidator(is_classification=False) + target_1 = np.array([np.nan, 1, 2]) + validator1.fit(target_1) + assert validator1.transform(target_1).isnull().values.sum() == 1 + + validator2 = TimeSeriesTargetValidator(is_classification=False) + target_2 = sparse.csr_matrix(np.array([1, np.nan, np.nan])) + with pytest.raises(ValueError, match=r"arget values cannot contain missing/NaN values"): + # sparse matrix is unsupported for nan filling + validator2.fit(target_2) + From 77dcb7c0eae55655057604584199b90028104365 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 3 May 2022 22:27:52 +0200 Subject: [PATCH 241/347] not allow sparse forecasting target --- .../data/time_series_target_validator.py | 18 ++++++++++++---- .../test_forecasting_target_validator.py | 11 +++++----- .../test_time_series_feature_validator.py | 21 +++++++++++++++++++ 3 files changed, 41 insertions(+), 9 deletions(-) create mode 100644 test/test_data/test_time_series_feature_validator.py diff --git a/autoPyTorch/data/time_series_target_validator.py b/autoPyTorch/data/time_series_target_validator.py index 08d9d1359..1bef1e025 100644 --- a/autoPyTorch/data/time_series_target_validator.py +++ b/autoPyTorch/data/time_series_target_validator.py @@ -4,6 +4,7 @@ import pandas as pd import numpy as np from scipy.sparse import issparse +from sklearn.base import BaseEstimator from autoPyTorch.utils.logging_ import PicklableClientLogger from autoPyTorch.data.base_target_validator import SupportedTargetTypes @@ -21,6 +22,17 @@ def __init__(self, 
raise NotImplementedError("Classification is currently not supported for forecasting tasks!") super().__init__(is_classification, logger) + def fit( + self, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, + ) -> BaseEstimator: + if issparse(y_train): + # TODO fix this + raise NotImplementedError("Sparse Target is unsupported for forecasting task!") + return super().fit(y_train, y_test) + + def transform(self, y: SupportedTargetTypes, index: Optional[Union[pd.Index, np.ndarray]] = None, @@ -46,13 +58,11 @@ def transform(self, index = np.array([0] * y.shape[0]) if y.ndim == 1: y = np.expand_dims(y, -1) - if isinstance(y, np.ndarray): - y: pd.DataFrame = pd.DataFrame(y) - else: - y: pd.DataFrame = pd.DataFrame.sparse.from_spmatrix(y) + y: pd.DataFrame = pd.DataFrame(y) y.index = index return y @property def allow_missing_values(self) -> bool: return True + diff --git a/test/test_data/test_forecasting_target_validator.py b/test/test_data/test_forecasting_target_validator.py index d07eeabf9..976f7902c 100644 --- a/test/test_data/test_forecasting_target_validator.py +++ b/test/test_data/test_forecasting_target_validator.py @@ -40,6 +40,12 @@ def test_forecasting_target_transform(): assert isinstance(y_transformed_3.index, pd.MultiIndex) assert np.all(y_transformed_3.index == index_3) + validator2 = TimeSeriesTargetValidator(is_classification=False) + target_2 = sparse.csr_matrix(np.array([1, 1, 1])) + with pytest.raises(NotImplementedError, match=r"Sparse Target is unsupported for forecasting task!"): + # sparse matrix is unsupported for nan filling + validator2.fit(target_2) + def test_forecasting_target_missing_values(): """ @@ -51,9 +57,4 @@ def test_forecasting_target_missing_values(): validator1.fit(target_1) assert validator1.transform(target_1).isnull().values.sum() == 1 - validator2 = TimeSeriesTargetValidator(is_classification=False) - target_2 = sparse.csr_matrix(np.array([1, np.nan, np.nan])) - with pytest.raises(ValueError, match=r"arget values cannot contain missing/NaN values"): - # sparse matrix is unsupported for nan filling - validator2.fit(target_2) diff --git a/test/test_data/test_time_series_feature_validator.py b/test/test_data/test_time_series_feature_validator.py new file mode 100644 index 000000000..27c419a99 --- /dev/null +++ b/test/test_data/test_time_series_feature_validator.py @@ -0,0 +1,21 @@ +import copy +import functools + +import numpy as np + +import pandas as pd + +import pytest + +from scipy import sparse + +import sklearn.datasets +import sklearn.model_selection + +from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator + +def test_forecasting_validator_ + validator = TimeSeriesFeatureValidator() + + + From 6932199a3402e769fc6cfc98c0a5c2355f3df0ea Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 4 May 2022 20:03:38 +0200 Subject: [PATCH 242/347] test for data validator --- .../data/time_series_feature_validator.py | 49 ++++++++--- .../data/time_series_forecasting_validator.py | 63 +++----------- .../data/time_series_target_validator.py | 4 +- test/conftest.py | 52 +++++++++++ .../test_forecasting_input_validator.py | 46 ++++++++++ .../test_forecasting_target_validator.py | 23 +++-- .../test_time_series_feature_validator.py | 87 ++++++++++++++++++- 7 files changed, 244 insertions(+), 80 deletions(-) create mode 100644 test/test_data/test_forecasting_input_validator.py diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index 
d77077469..8929fa29a 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -1,7 +1,8 @@ import logging -from typing import Optional, Union, Tuple +from typing import Optional, Union, Tuple, List import pandas as pd import numpy as np +from scipy.sparse import issparse from sklearn.base import BaseEstimator from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator, SupportedFeatTypes @@ -16,15 +17,16 @@ def __init__( super().__init__(logger) self.only_contain_series_idx = False self.static_features = () - self.series_idx: Optional[Union[Tuple[Union[str, int]]]] = None + self.series_idx: Optional[Union[List[Union[str, int]]]] = None def get_reordered_columns(self): - return self.transformed_columns + list(set(self.column_order) - set(self.transformed_columns)) + return self.transformed_columns + [col for col in self.column_order if col not in set(self.transformed_columns)] def fit(self, X_train: Union[pd.DataFrame, np.ndarray], X_test: Union[pd.DataFrame, np.ndarray] = None, - series_idx: Optional[Union[Tuple[Union[str, int]]]] = None) -> BaseEstimator: + series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, + sequence_lengths: Optional[List[int]] = None) -> BaseEstimator: """ Arguments: @@ -38,19 +40,29 @@ def fit(self, series_idx (Optional[Union[str, int]]): Series Index, to identify each individual series + sequence_lengths (Optional[List[int]]): + Length of each sequence + Returns: self: The fitted base estimator """ + if issparse(X_train): + raise NotImplementedError('Sparse matrix is currently unsupported for Forecasting tasks') + index = None if series_idx is not None: self.series_idx = series_idx # remove series idx as they are not part of features + # TODO consider them as static features? 
if isinstance(X_train, pd.DataFrame): for series_id in series_idx: if series_id not in X_train.columns: raise ValueError(f"All Series ID must be contained in the training column, however, {series_id}" f"is not part of {X_train.columns.tolist()}") - self.only_contain_series_idx = len(X_train.columns) == series_idx + if X_train[list(series_idx)].isnull().values.any(): + raise ValueError('NaN should not exit in Series ID!') + index = pd.MultiIndex.from_frame(pd.DataFrame(X_train[series_idx])) + self.only_contain_series_idx = len(X_train.columns) == len(series_idx) if self.only_contain_series_idx: self._is_fitted = True @@ -69,30 +81,41 @@ def fit(self, f"X_train is {type(X_train)} ") else: super().fit(X_train, X_test) - if isinstance(X_train, np.ndarray): - X_train = pd.DataFrame(X_train, index=[0] * len(X_train)) + X_train = pd.DataFrame(X_train) + if index is None: + if sequence_lengths is None: + index = np.zeros(len(X_train)) + else: + if np.sum(sequence_lengths) != len(X_train): + raise ValueError("The Sum of Sequence length must equal to the length of hte dataset") + index = np.arange(len(sequence_lengths)).repeat(sequence_lengths) + X_train.index = index + static_features: pd.Series = (X_train.groupby(X_train.index).nunique() <= 1).all() self.static_features = tuple(idx for idx in static_features.index if static_features[idx]) return self def transform( self, - X: SupportedFeatTypes, + X: Union[pd.DataFrame, np.ndarray], index: Optional[Union[pd.Index, np.ndarray]] = None, - ) -> Union[pd.DataFrame]: + ) -> Optional[pd.DataFrame]: + if self.only_contain_series_idx: + return None if self.series_idx is not None: if isinstance(X, pd.DataFrame): - if self.only_contain_series_idx: - return None X = X.drop(self.series_idx, axis=1) else: raise NotImplementedError(f"series idx only works with pandas.DataFrame but the type of " f"X_train is {type(X)} ") X = super(TimeSeriesFeatureValidator, self).transform(X) - if index is None: - index = np.array([0.] 
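Two small building blocks used in the feature-validator fit above are the mapping from per-sequence lengths to a flat series index and the group-wise "is this column constant within every series" test for static features; a minimal sketch with made-up data:

import numpy as np
import pandas as pd

sequence_lengths = [3, 2]
# series id repeated once per time step: [0, 0, 0, 1, 1]
index = np.arange(len(sequence_lengths)).repeat(sequence_lengths)

X = pd.DataFrame({
    'price': [1.0, 1.1, 0.9, 2.0, 2.2],   # varies within each series -> dynamic
    'store': [5, 5, 5, 7, 7],             # constant within each series -> static
}, index=index)

# a column is static if it has at most one unique value inside every series
static = (X.groupby(X.index).nunique() <= 1).all()
print(tuple(col for col in static.index if static[col]))   # ('store',)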
* len(X)) if X.ndim == 1: X = np.expand_dims(X, -1) X: pd.DataFrame = pd.DataFrame(X, columns=self.get_reordered_columns()) + if index is None: + index = np.array([0] * len(X)) + else: + if len(index) != X.shape[0]: + raise ValueError('Given index must have length as the input features!') X.index = index return X diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 99b8c3ebf..c0b10d368 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -33,7 +33,6 @@ def __init__(self, logger=self.logger) self._is_uni_variant = False self.known_future_features = None - self.n_prediction_steps = 1 self.start_times = None self.feature_shapes: Dict[str, int] = {} self.feature_names: List[str] = [] @@ -47,8 +46,6 @@ def fit( X_test: Optional[Union[List, pd.DataFrame]] = None, y_test: Optional[Union[List, pd.DataFrame]] = None, start_times: Optional[List[pd.DatetimeIndex]] = None, - freq: str = '1Y', - n_prediction_steps: int = 1, known_future_features: Optional[List[Union[int, str]]] = None, ) -> BaseEstimator: """ @@ -63,18 +60,13 @@ def fit( y_test (Optional[Union[List, pd.DataFrame]]): target in the future start_times (Optional[List[pd.DatetimeIndex]]): start times on which the first element of each series is sampled - freq (str): the frequency that the data is sampled - n_prediction_steps (int): number of prediction steps (forecast horizon) known_future_features (Optional[List[Union[int, str]]]): which features are known even in the future """ - if isinstance(series_idx, (str, int)): - series_idx = [series_idx] self.series_idx = series_idx - self.n_prediction_steps = n_prediction_steps if start_times is None: - start_times = [pd.DatetimeIndex(pd.to_datetime(['2000-01-01']), freq=freq)] * len(y_train) + start_times = [pd.Timestamp('1900-01-01')] * len(y_train) else: assert len(start_times) == len(y_train), 'start_times_train must have the same length as y_train!' 
@@ -85,7 +77,10 @@ def fit( if isinstance(y_train, List): # X_train and y_train are stored as lists y_train_stacked = self.join_series(y_train) - y_test_stacked = self.join_series(y_test) if y_test is not None else None + if y_test is not None: + y_test_stacked, seq_y_test_length = self.join_series(y_test, return_seq_lengths=True) + else: + y_test_stacked = None if self._is_uni_variant: self.feature_validator.num_features = 0 @@ -101,15 +96,14 @@ def fit( if len(X_train) != len(y_train): raise ValueError("Inconsistent number of sequences for features and targets," " {} for features and {} for targets".format(len(X_train), len(y_train), )) - X_train_stacked = self.join_series(X_train) + X_train_stacked, sequence_lengths = self.join_series(X_train, return_seq_lengths=True) X_test_stacked = self.join_series(X_test) if X_test is not None else None - if X_test is not None: - if len(X_test) != len(y_test): + if X_test_stacked is not None and y_test_stacked is not None: + if X_test_stacked.shape != y_test_stacked.shape: raise ValueError("Inconsistent number of test datapoints for features and targets," " {} for features and {} for targets".format(len(X_test), len(y_test), )) - # TODO write a feature input validator to check X_test for known_future_features - super().fit(X_train[0], y_train[0], X_test[0], y_test[0]) - self.feature_validator.fit(X_train_stacked, X_test_stacked, series_idx=series_idx) + self.feature_validator.fit(X_train_stacked, X_test_stacked, + series_idx=series_idx, sequence_lengths=sequence_lengths) self.target_validator.fit(y_train_stacked, y_test_stacked) if self.feature_validator.only_contain_series_idx: @@ -117,10 +111,6 @@ def fit( self._is_fitted = True - self.check_input_shapes(X_train, y_train, is_training=True) - - if X_test is not None: - self.check_input_shapes(X_test, y_test, is_training=False) self.feature_names = self.feature_validator.get_reordered_columns() self.feature_shapes = {feature_name: 1 for feature_name in self.feature_names} else: @@ -129,28 +119,6 @@ def fit( return self - @staticmethod - def get_num_features(X): - X_shape = np.shape(X) - return 1 if len(X_shape) == 1 else X_shape[1] - - @staticmethod - def check_input_shapes(X, y, is_training: bool = True): - num_features = [0] * len(X) - out_dimensionality = [0] * len(y) - - for i in range(len(X)): - num_features[i] = TimeSeriesForecastingInputValidator.get_num_features(X[i]) - out_dimensionality[i] = TimeSeriesForecastingInputValidator.get_num_features(y[i]) - - if not np.all(np.asarray(num_features) == num_features[0]): - raise ValueError(f"All the sequences need to have the same number of features in " - f"{'train' if is_training else 'test'} set!") - - if not np.all(np.asarray(out_dimensionality) == out_dimensionality[0]): - raise ValueError(f"All the sequences need to have the same number of targets in " - f"{'train' if is_training else 'test'} set!") - def transform( self, X: Optional[Union[List, pd.DataFrame]], @@ -222,10 +190,6 @@ def _transform_X(self, else: # In this case X can only contain pd.DataFrame, see ```time_series_feature_validator.py``` x_stacked = pd.concat(X) - x_columns = x_stacked.columns - for ser_id in self.series_idx: - if ser_id not in x_columns: - raise ValueError(f'{ser_id} does not exist in input feature X') series_number = pd.MultiIndex.from_frame(x_stacked[self.series_idx]) @@ -238,7 +202,7 @@ def _transform_X(self, return x_transformed, series_number @staticmethod - def join_series(X: List[SupportedFeatTypes], + def join_series(X: List[Union[pd.DataFrame, 
np.ndarray]], return_seq_lengths: bool = False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, List[int]]]: """ @@ -253,11 +217,6 @@ def join_series(X: List[SupportedFeatTypes], raise ValueError(f'Input must be a list, but it is {type(X)}') if isinstance(X[0], pd.DataFrame): joint_input = pd.concat(X) - elif isinstance(X[0], sparse.spmatrix): - if len(X[0].shape) > 1: - joint_input = sparse.vstack(X) - else: - joint_input = sparse.hstack(X) elif isinstance(X[0], (List, np.ndarray)): joint_input = np.concatenate(X) else: diff --git a/autoPyTorch/data/time_series_target_validator.py b/autoPyTorch/data/time_series_target_validator.py index 1bef1e025..2e5ae137c 100644 --- a/autoPyTorch/data/time_series_target_validator.py +++ b/autoPyTorch/data/time_series_target_validator.py @@ -32,7 +32,6 @@ def fit( raise NotImplementedError("Sparse Target is unsupported for forecasting task!") return super().fit(y_train, y_test) - def transform(self, y: SupportedTargetTypes, index: Optional[Union[pd.Index, np.ndarray]] = None, @@ -56,6 +55,9 @@ def transform(self, if index is None: index = np.array([0] * y.shape[0]) + else: + if len(index) != y.shape[0]: + raise ValueError('Index must have length as the input targets!') if y.ndim == 1: y = np.expand_dims(y, -1) y: pd.DataFrame = pd.DataFrame(y) diff --git a/test/conftest.py b/test/conftest.py index 2f3a0d9e2..a8f5860d6 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -736,6 +736,58 @@ def get_forecasting_fit_dictionary(X, y, validator, backend, budget_type='epochs return fit_dictionary +# Fixtures for forecasting input validators +@pytest.fixture +def input_data_forecastingfeaturetest(request): + if request.param == 'numpy_nonan': + return np.random.uniform(10, size=(100, 10)), None, None + elif request.param == 'numpy_with_static': + return np.zeros([2, 3], dtype=np.int), None, None + elif request.param == 'numpy_with_seq_length': + return np.zeros([5, 3], dtype=np.int), None, [2, 3] + elif request.param == 'pandas_wo_seriesid': + return pd.DataFrame([ + {'A': 1, 'B': 2}, + {'A': 3, 'B': 4}, + ], dtype='category'), None, [2] + elif request.param == 'pandas_w_seriesid': + return pd.DataFrame([ + {'A': 1, 'B': 0}, + {'A': 0, 'B': 1}, + ], dtype='category'), 'A', [2] + elif request.param == 'pandas_only_seriesid': + return pd.DataFrame([ + {'A': 1, 'B': 0}, + {'A': 0, 'B': 1}, + ], dtype='category'), ['A', 'B'], [2] + elif request.param == 'pandas_without_seriesid': + return pd.DataFrame([ + {'A': 1, 'B': 2}, + {'A': 3, 'B': 4}, + ], dtype='category'), None, [2] + elif request.param == 'pandas_with_static_features': + return pd.DataFrame([ + {'A': 1, 'B': 2}, + {'A': 1, 'B': 4}, + ], dtype='category'), None, [2] + elif request.param == 'pandas_multi_seq': + return pd.DataFrame([ + {'A': 1, 'B': 2}, + {'A': 1, 'B': 4}, + {'A': 3, 'B': 2}, + {'A': 2, 'B': 4}, + ], dtype='category'), None, [2, 2] + elif request.param == 'pandas_with_static_features_multi_series': + return pd.DataFrame([ + {'A': 1, 'B': 2}, + {'A': 1, 'B': 2}, + {'A': 2, 'B': 3}, + {'A': 2, 'B': 3}, + ], dtype='category'), 'A', None + else: + ValueError("Unsupported indirect fixture {}".format(request.param)) + + @pytest.fixture def fit_dictionary_uni_variant_wo_missing(): x, y, validator = get_forecasting_data(uni_variant=True, with_missing_value=False) diff --git a/test/test_data/test_forecasting_input_validator.py b/test/test_data/test_forecasting_input_validator.py new file mode 100644 index 000000000..1cc6f5274 --- /dev/null +++ b/test/test_data/test_forecasting_input_validator.py @@ 
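`join_series` simply stacks the per-series containers and optionally reports the individual lengths so the flattened data can later be re-split; conceptually it reduces to the following sketch (not the method itself):

import numpy as np
import pandas as pd

def join_series(X, return_seq_lengths=False):
    lengths = [len(x) for x in X]
    if isinstance(X[0], pd.DataFrame):
        joint = pd.concat(X)
    else:
        joint = np.concatenate([np.asarray(x) for x in X])
    return (joint, lengths) if return_seq_lengths else joint

series = [np.arange(5.), np.arange(8.)]
flat, lengths = join_series(series, return_seq_lengths=True)
print(flat.shape, lengths)   # (13,) [5, 8]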
-0,0 +1,46 @@ +import numpy as np +import pytest +import pandas as pd +from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator + +""" +# Actual checks for the features +@pytest.mark.parametrize( + 'input_data_forecastingfeaturetest', + ( + 'numpy_nonan', + 'numpy_with_static', + 'numpy_with_seq_length', + 'pandas_wo_seriesid', + 'pandas_w_seriesid', + 'pandas_only_seriesid', + 'pandas_without_seriesid', + 'pandas_with_static_features', + 'pandas_multi_seq', + 'pandas_with_static_features_multi_series', + ), + indirect=True +) +""" + +def test_uni_variant_validator(): + validator = TimeSeriesForecastingInputValidator(is_classification=False) + y_train = [[0.] * 5, [1.] * 10] + y_test = [[0.] * 3, [1.] * 3] + validator.fit(X_train=None, y_train=y_train, X_test=None, y_test=y_test) + assert validator.start_times == [pd.Timestamp('1900-01-01')] * len(y_train) + + assert validator._is_fitted + assert validator._is_uni_variant + assert validator.feature_validator.num_features == 0 + assert len(validator.feature_validator.numerical_columns) == 0 + assert len(validator.feature_validator.categorical_columns) == 0 + assert validator.feature_validator._is_fitted is False + assert len(validator.feature_shapes) == 0 + assert len(validator.feature_names) == 0 + + x_transformed, y_transformed, sequence_lengths = validator.transform(None, y_train) + assert x_transformed is None + assert isinstance(y_transformed, pd.DataFrame) + assert np.all(sequence_lengths == [5, 10]) + assert y_transformed.index.tolist() == sum([[i] * l_seq for i, l_seq in enumerate(sequence_lengths)], []) diff --git a/test/test_data/test_forecasting_target_validator.py b/test/test_data/test_forecasting_target_validator.py index 976f7902c..901a15caa 100644 --- a/test/test_data/test_forecasting_target_validator.py +++ b/test/test_data/test_forecasting_target_validator.py @@ -1,17 +1,10 @@ import numpy as np import pandas as pd -from pandas.api.types import is_numeric_dtype - import pytest from scipy import sparse -import sklearn.datasets -import sklearn.model_selection -from sklearn.utils.multiclass import type_of_target - -from autoPyTorch.data.tabular_target_validator import TabularTargetValidator from autoPyTorch.data.time_series_target_validator import TimeSeriesTargetValidator @@ -40,11 +33,19 @@ def test_forecasting_target_transform(): assert isinstance(y_transformed_3.index, pd.MultiIndex) assert np.all(y_transformed_3.index == index_3) - validator2 = TimeSeriesTargetValidator(is_classification=False) - target_2 = sparse.csr_matrix(np.array([1, 1, 1])) + +def test_forecasting_target_handle_exception(): + validator = TimeSeriesTargetValidator(is_classification=False) + target_sparse = sparse.csr_matrix(np.array([1, 1, 1])) with pytest.raises(NotImplementedError, match=r"Sparse Target is unsupported for forecasting task!"): # sparse matrix is unsupported for nan filling - validator2.fit(target_2) + validator.fit(target_sparse) + + series_length = 10 + y = np.ones(series_length) + validator.fit(y) + with pytest.raises(ValueError, match=r"Index must have length as the input targets!"): + validator.transform(y, np.asarray([1, 2, 3])) def test_forecasting_target_missing_values(): @@ -56,5 +57,3 @@ def test_forecasting_target_missing_values(): target_1 = np.array([np.nan, 1, 2]) validator1.fit(target_1) assert validator1.transform(target_1).isnull().values.sum() == 1 - - diff --git a/test/test_data/test_time_series_feature_validator.py b/test/test_data/test_time_series_feature_validator.py 
index 27c419a99..6513ff41f 100644 --- a/test/test_data/test_time_series_feature_validator.py +++ b/test/test_data/test_time_series_feature_validator.py @@ -11,11 +11,94 @@ import sklearn.datasets import sklearn.model_selection - from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator -def test_forecasting_validator_ + +# Actual checks for the features +@pytest.mark.parametrize( + 'input_data_forecastingfeaturetest', + ( + 'numpy_nonan', + 'numpy_with_static', + 'numpy_with_seq_length', + 'pandas_wo_seriesid', + 'pandas_w_seriesid', + 'pandas_only_seriesid', + 'pandas_without_seriesid', + 'pandas_with_static_features', + 'pandas_multi_seq', + 'pandas_with_static_features_multi_series', + ), + indirect=True +) +def test_forecasting_validator_supported_types(input_data_forecastingfeaturetest): + data, series_idx, seq_lengths = input_data_forecastingfeaturetest + validator = TimeSeriesFeatureValidator() + validator.fit(data, data, series_idx, seq_lengths) + + if series_idx is not None: + index = pd.MultiIndex.from_frame(pd.DataFrame(data[series_idx])) + elif seq_lengths is not None: + index = np.arange(len(seq_lengths)).repeat(seq_lengths) + else: + index = None + if series_idx is not None and np.all(series_idx == data.columns): + assert validator.only_contain_series_idx is True + return + + transformed_X = validator.transform(data, index) + assert isinstance(transformed_X, pd.DataFrame) + if series_idx is None and seq_lengths is None: + assert np.all(transformed_X.index == 0) + else: + if series_idx is not None: + assert series_idx not in transformed_X + else: + if seq_lengths is not None: + for i, group in enumerate(transformed_X.groupby(transformed_X.index)): + assert len(group[1]) == seq_lengths[i] + # static features + all_columns = transformed_X.columns + all_columns_are_unique = {col: True for col in all_columns} + for group in transformed_X.groupby(transformed_X.index): + for col in group[1].columns: + unique = np.unique(group[1][col]) + all_columns_are_unique[col] = all_columns_are_unique[col] & len(unique) == 1 + for key, value in all_columns_are_unique.items(): + if key in validator.static_features: + assert value is True + else: + assert value is False + assert validator._is_fitted + + +def test_forecasting_validator_get_reordered_columns(): + df = pd.DataFrame([ + {'category': 'one', 'int': 1, 'float': 1.0, 'bool': True}, + {'category': 'two', 'int': 2, 'float': 2.0, 'bool': False}, + ]) + + for col in df.columns: + df[col] = df[col].astype(col) + validator = TimeSeriesFeatureValidator() + validator.fit(df) + reorder_cols = validator.get_reordered_columns() + assert reorder_cols == ['category', 'bool', 'int', 'float'] +def test_forecasting_validator_handle_exception(): + df = pd.DataFrame([ + {'A': 1, 'B': 2}, + {'A': np.NAN, 'B': 3}, + ]) + validator = TimeSeriesFeatureValidator() + with pytest.raises(ValueError, match=r"All Series ID must be contained in the training column"): + validator.fit(df, series_idx=['B', 'C']) + with pytest.raises(ValueError, match=r'NaN should not exit in Series ID!'): + validator.fit(df, series_idx=['A']) + valirator2 = TimeSeriesFeatureValidator() + valirator2.fit(df) + with pytest.raises(ValueError, match=r'Given index must have length as the input features!'): + valirator2.transform(df, index=[0] * 5) From ee97108e9bb9982f8e30ce6650be0ef7849ffa84 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 5 May 2022 15:46:11 +0200 Subject: [PATCH 243/347] test for validations --- .../data/time_series_feature_validator.py | 1 + 
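One subtlety worth illustrating for the static-feature check in the test above: in Python, `&` binds tighter than `==`, so an expression of the form `flag & len(unique) == 1` is evaluated as `(flag & len(unique)) == 1` rather than `flag and (len(unique) == 1)`. A two-line demonstration:

flag, unique = True, [1, 2, 3]
print(flag & len(unique) == 1)      # True: (1 & 3) == 1, probably not what was intended
print(flag and len(unique) == 1)    # False: the column is not constant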
.../data/time_series_forecasting_validator.py | 9 +- .../test_forecasting_input_validator.py | 112 ++++++++++++++---- 3 files changed, 99 insertions(+), 23 deletions(-) diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index 8929fa29a..920917b01 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -63,6 +63,7 @@ def fit(self, raise ValueError('NaN should not exit in Series ID!') index = pd.MultiIndex.from_frame(pd.DataFrame(X_train[series_idx])) self.only_contain_series_idx = len(X_train.columns) == len(series_idx) + if self.only_contain_series_idx: self._is_fitted = True diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index c0b10d368..0ff0d0883 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -99,9 +99,10 @@ def fit( X_train_stacked, sequence_lengths = self.join_series(X_train, return_seq_lengths=True) X_test_stacked = self.join_series(X_test) if X_test is not None else None if X_test_stacked is not None and y_test_stacked is not None: - if X_test_stacked.shape != y_test_stacked.shape: + if len(X_test_stacked) != len(y_test_stacked): raise ValueError("Inconsistent number of test datapoints for features and targets," " {} for features and {} for targets".format(len(X_test), len(y_test), )) + self.feature_validator.fit(X_train_stacked, X_test_stacked, series_idx=series_idx, sequence_lengths=sequence_lengths) self.target_validator.fit(y_train_stacked, y_test_stacked) @@ -159,6 +160,8 @@ def transform( if X is None: raise ValueError('Multi Variant dataset requires X as input!') assert len(X) == len(y), "Length of features must equal to length of targets!" 
+ if self.series_idx is not None and X is None: + raise ValueError('X must be given as series_idx!') for seq_idx in range(num_sequences): sequence_lengths[seq_idx] = len(y[seq_idx]) @@ -191,10 +194,10 @@ def _transform_X(self, # In this case X can only contain pd.DataFrame, see ```time_series_feature_validator.py``` x_stacked = pd.concat(X) - series_number = pd.MultiIndex.from_frame(x_stacked[self.series_idx]) + series_number = pd.MultiIndex.from_frame(pd.DataFrame(x_stacked[self.series_idx])) if not self._is_uni_variant: - x_transformed = self.feature_validator.transform(x_stacked.drop(self.series_idx, axis=1), + x_transformed = self.feature_validator.transform(x_stacked, index=series_number) else: x_transformed = None diff --git a/test/test_data/test_forecasting_input_validator.py b/test/test_data/test_forecasting_input_validator.py index 1cc6f5274..566a6fb40 100644 --- a/test/test_data/test_forecasting_input_validator.py +++ b/test/test_data/test_forecasting_input_validator.py @@ -3,27 +3,8 @@ import pandas as pd from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator -""" -# Actual checks for the features -@pytest.mark.parametrize( - 'input_data_forecastingfeaturetest', - ( - 'numpy_nonan', - 'numpy_with_static', - 'numpy_with_seq_length', - 'pandas_wo_seriesid', - 'pandas_w_seriesid', - 'pandas_only_seriesid', - 'pandas_without_seriesid', - 'pandas_with_static_features', - 'pandas_multi_seq', - 'pandas_with_static_features_multi_series', - ), - indirect=True -) -""" -def test_uni_variant_validator(): +def test_uni_variant_validator_only_y(): validator = TimeSeriesForecastingInputValidator(is_classification=False) y_train = [[0.] * 5, [1.] * 10] y_test = [[0.] * 3, [1.] * 3] @@ -44,3 +25,94 @@ def test_uni_variant_validator(): assert isinstance(y_transformed, pd.DataFrame) assert np.all(sequence_lengths == [5, 10]) assert y_transformed.index.tolist() == sum([[i] * l_seq for i, l_seq in enumerate(sequence_lengths)], []) + + +@pytest.mark.parametrize( + 'input_data_forecastingfeaturetest', + ( + 'pandas_only_seriesid', + ), + indirect=True +) +def test_uni_variant_validator_with_series_id(input_data_forecastingfeaturetest): + data, series_idx, seq_lengths = input_data_forecastingfeaturetest + validator = TimeSeriesForecastingInputValidator(is_classification=False) + start_times = [pd.Timestamp('2000-01-01')] + x = [data] + y = [list(range(len(data)))] + validator.fit(x, y, start_times=start_times, series_idx=series_idx) + assert validator._is_uni_variant is True + assert validator.start_times == start_times + x_transformed, y_transformed, sequence_lengths = validator.transform(x, y) + assert x_transformed is None + # for uni_variant validator, setting X as None should not cause any issue + with pytest.raises(ValueError, match=r"X must be given as series_idx!"): + _ = validator.transform(None, y) + + +@pytest.mark.parametrize( + 'input_data_forecastingfeaturetest', + ( + 'pandas_w_seriesid', + ), + indirect=True +) +def test_multi_variant_validator_with_series_id(input_data_forecastingfeaturetest): + data, series_idx, seq_lengths = input_data_forecastingfeaturetest + validator = TimeSeriesForecastingInputValidator(is_classification=False) + start_times = [pd.Timestamp('2000-01-01')] + x = [data] + y = [list(range(len(data)))] + validator.fit(x, y, start_times=start_times, series_idx=series_idx) + x_transformed, y_transformed, sequence_lengths = validator.transform(x, y) + assert series_idx not in x_transformed + + +def 
test_forecasting_validator(): + df = pd.DataFrame([ + {'category': 'one', 'int': 1, 'float': 1.0, 'bool': True}, + {'category': 'two', 'int': 2, 'float': 2.0, 'bool': False}, + ]) + + for col in df.columns: + df[col] = df[col].astype(col) + + x = [df, df] + y = [[1., 2.], [1., 2.]] + + validator = TimeSeriesForecastingInputValidator() + validator.fit(x, y, start_times=[pd.Timestamp('1900-01-01')] * 2) + feature_names = ['category', 'bool', 'int', 'float'] + assert validator._is_uni_variant is False + assert validator.feature_names == feature_names + + for fea_name in feature_names: + assert fea_name in validator.feature_shapes + assert validator.feature_shapes[fea_name] == 1 + + x_transformed, y_transformed, sequence_lengths = validator.transform(x, y) + assert isinstance(x_transformed, pd.DataFrame) + assert isinstance(y_transformed, pd.DataFrame) + assert np.all(x_transformed.index == y_transformed.index) + assert len(x_transformed) == sum(sequence_lengths) + + # y is only allowed to be None if validate_for_future_features is True + x_transformed, y_transformed, sequence_lengths = validator.transform(x, None, validate_for_future_features=True) + with pytest.raises(ValueError, match=r"Targets must be given!"): + validator.transform(x) + with pytest.raises(ValueError, match=r"Multi Variant dataset requires X as input!"): + validator.transform(None, y) + + +def test_forecasting_handle_exception(): + validator = TimeSeriesForecastingInputValidator() + # if X and y has different lengths + X = [np.ones(3), np.ones(3)] + y = [[1], ] + with pytest.raises(ValueError, match="Inconsistent number of sequences for features and targets"): + validator.fit(X, y) + + y = [[1], [1]] + # test data must have the same shapes as they are attached to the tails of the datasets + with pytest.raises(ValueError, match="Inconsistent number of test datapoints for features and targets"): + validator.fit(X, y, X_test=X, y_test=y) From b7f51f20e0eaeab63e709228122de27f9df5c772 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 5 May 2022 23:40:45 +0200 Subject: [PATCH 244/347] test on TimeSeriesSequence --- autoPyTorch/datasets/resampling_strategy.py | 1 - autoPyTorch/datasets/time_series_dataset.py | 143 ++++++------- autoPyTorch/evaluation/abstract_evaluator.py | 2 +- ...time_series_forecasting_train_evaluator.py | 10 +- .../base_forecasting_decoder.py | 3 +- .../components/training/metrics/metrics.py | 12 +- .../components/training/metrics/utils.py | 6 +- .../forecasting_base_trainer.py | 2 +- .../test_forecasting_input_validator.py | 2 +- .../test_time_series_datasets.py | 198 ++++++++++++++++++ 10 files changed, 292 insertions(+), 87 deletions(-) create mode 100644 test/test_datasets/test_time_series_datasets.py diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 577b20bf7..c0522e7e9 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -181,7 +181,6 @@ def time_series_hold_out_validation(random_state: np.random.RandomState, """ n_prediction_steps = kwargs['n_prediction_steps'] n_repeat = kwargs['n_repeat'] - # TODO consider how we handle test size properly # Time Series prediction only requires on set of prediction for each # This implement needs to be combined with time series forecasting dataloader, where each time an entire # time series is used for prediction diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 00487adfa..92e7caf38 100644 --- 
a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -71,11 +71,31 @@ def extract_feature_index(feature_shapes: Dict[str, int], return tuple(feature_index) +def compute_time_features(start_time: pd.Timestamp, + date_period_length: int, + time_feature_length: int, + freq: str, + time_feature_transforms: List[TimeFeature]) -> np.ndarray: + date_info = pd.date_range(start=start_time, + periods=date_period_length, + freq=freq)[-time_feature_length:] + try: + time_features = np.vstack( + [transform(date_info).to_numpy(float) + if not isinstance(transform, ConstantTransform) else transform(date_info) + for transform in time_feature_transforms] + ).T + except OutOfBoundsDatetime: + # This is only a temporal solution TODO consider how to solve this! + time_features = np.zeros([time_feature_length, len(time_feature_transforms)]) + return time_features + + class TimeSeriesSequence(Dataset): def __init__(self, X: Optional[np.ndarray], - Y: Union[np.ndarray], - start_time_train: Optional[pd.DatetimeIndex] = None, + Y: np.ndarray, + start_time: Optional[pd.DatetimeIndex] = None, freq: str = '1Y', time_feature_transform: List[TimeFeature] = [], X_test: Optional[np.ndarray] = None, @@ -103,9 +123,9 @@ def __init__(self, self.Y = Y self.observed_target = ~np.isnan(self.Y) - if start_time_train is None: - start_time_train = pd.DatetimeIndex(pd.to_datetime(['1900-01-01']), freq=freq) - self.start_time_train = start_time_train + if start_time is None: + start_time = pd.Timestamp('1900-01-01') + self.start_time = start_time self.X_val = None self.Y_val = None @@ -122,13 +142,12 @@ def __init__(self, self.train_transform = train_transforms self.val_transform = val_transforms self.sp = sp + if compute_mase_coefficient_value: if is_test_set: - self.mase_coefficient = compute_mase_coefficient(self.Y, sp=self.sp, - n_prediction_steps=n_prediction_steps) + self.mase_coefficient = compute_mase_coefficient(self.Y, sp=self.sp) else: - self.mase_coefficient = compute_mase_coefficient(self.Y[:-n_prediction_steps], sp=self.sp, - n_prediction_steps=n_prediction_steps) + self.mase_coefficient = compute_mase_coefficient(self.Y[:-n_prediction_steps], sp=self.sp) else: self.mase_coefficient = 1.0 @@ -136,8 +155,25 @@ def __init__(self, self.transform_time_features = False self._cached_time_features: Optional[np.ndarray] = time_features + self._is_test_set = is_test_set self.is_test_set = is_test_set + @property + def is_test_set(self): + return self._is_test_set + + @is_test_set.setter + def is_test_set(self, value: bool): + if value != self._is_test_set and self.known_future_features_index: + if self.X_test is None: + raise ValueError("If future features are required, X_test must be given for" + " setting TimeSeriesSequence as test set!") + if value is True: + self.X = np.concatenate([self.X, self.X_test]) + else: + self.X = self.X[:-len(self.X_test)] + self._is_test_set = value + def __getitem__(self, index: int, train: bool = True) \ -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, torch.Tensor]]]: """ @@ -157,10 +193,7 @@ def __getitem__(self, index: int, train: bool = True) \ index = self.__len__() + index if self.X is not None: - if hasattr(self.X, 'iloc'): - past_features = self.X.iloc[:index + 1] - else: - past_features = self.X[:index + 1] + past_features = self.X[:index + 1] if self.known_future_features_index: future_features = self.X[ @@ -172,6 +205,15 @@ def __getitem__(self, index: int, train: bool = True) \ past_features = None future_features = None + if 
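`compute_time_features` above expands the start timestamp into a calendar with `pd.date_range` and maps each timestamp through the configured transforms. With plain callables standing in for the gluonts `TimeFeature` objects, it reduces to something like this sketch (the two toy transforms are made up):

import numpy as np
import pandas as pd

def compute_time_features(start_time, date_period_length, time_feature_length, freq, transforms):
    dates = pd.date_range(start=start_time, periods=date_period_length, freq=freq)[-time_feature_length:]
    return np.vstack([np.asarray(t(dates), dtype=float) for t in transforms]).T

day_of_week = lambda idx: idx.dayofweek / 6.0 - 0.5        # scaled roughly to [-0.5, 0.5]
month_of_year = lambda idx: (idx.month - 1) / 11.0 - 0.5

feats = compute_time_features(pd.Timestamp('2022-01-01'), 10, 4, '1D',
                              [day_of_week, month_of_year])
print(feats.shape)   # (4, 2): last 4 of the 10 generated dates, 2 features each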
self.train_transform is not None and train and past_features is not None: + past_features = self.train_transform(past_features) + if future_features is not None: + future_features = self.train_transform(future_features) + elif self.val_transform is not None and not train and past_features is not None: + past_features = self.val_transform(past_features) + if future_features is not None: + future_features = self.val_transform(future_features) + if self.transform_time_features: if self.time_feature_transform: self.compute_time_features() @@ -181,30 +223,16 @@ def __getitem__(self, index: int, train: bool = True) \ else: past_features = self._cached_time_features[:index + 1] if future_features is not None: - try: - future_features = np.hstack([ - future_features, - self._cached_time_features[index + 1:index + self.n_prediction_steps + 1] - ]) - except Exception: - import pdb - - pdb.set_trace() + future_features = np.hstack([ + future_features, + self._cached_time_features[index + 1:index + self.n_prediction_steps + 1] + ]) else: future_features = self._cached_time_features[index + 1:index + self.n_prediction_steps + 1] if future_features is not None and future_features.shape[0] == 0: future_features = None - if self.train_transform is not None and train and past_features is not None: - past_features = self.train_transform(past_features) - if future_features is not None: - future_features = self.train_transform(future_features) - elif self.val_transform is not None and not train and past_features is not None: - past_features = self.val_transform(past_features) - if future_features is not None: - future_features = self.val_transform(future_features) - # In case of prediction, the targets are not provided targets = self.Y if self.is_test_set: @@ -244,29 +272,16 @@ def compute_time_features(self, ): periods = self.Y.shape[0] if self.is_test_set: periods += self.n_prediction_steps + self._cached_time_features = compute_time_features(self.start_time, periods, + periods, self.freq, self.time_feature_transform) - date_info = pd.date_range(start=self.start_time_train, - periods=periods, - freq=self.freq) - - self._cached_time_features = np.vstack( - [transform(date_info).to_numpy(float) for transform in self.time_feature_transform] - ).T else: if self.is_test_set: if self._cached_time_features.shape[0] == self.Y.shape[0]: - try: - date_info = pd.date_range(start=self.start_time_train, - periods=self.n_prediction_steps + self.Y.shape[0], - freq=self.freq) - time_feature_future = np.vstack( - [transform(date_info).to_numpy(float) - if not isinstance(transform, ConstantTransform) else transform(date_info) - for transform in self.time_feature_transform] - ).T - except OutOfBoundsDatetime: - # This is only a temporal solution TODO consider how to solve this! 
- time_feature_future = np.zeros([self.n_prediction_steps, len(self.time_feature_transform)]) + time_feature_future = compute_time_features(self.start_time, + self.n_prediction_steps + self.Y.shape[0], + self.n_prediction_steps, + self.freq, self.time_feature_transform) self._cached_time_features = np.concatenate([self._cached_time_features, time_feature_future]) @@ -299,7 +314,7 @@ def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": raise ValueError("get_val_seq_set is not supported for the test sequences!") if index < 0: index = self.__len__() + index - if index == self.__len__() - 1: + if index >= self.__len__() - 1: return copy.copy(self) else: if self.X is not None: @@ -312,8 +327,8 @@ def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": cached_time_features = self._cached_time_features[:index + 1 + self.n_prediction_steps] return TimeSeriesSequence(X=X, - Y=self.Y[:index + 1], - start_time_train=self.start_time_train, + Y=self.Y[:index + 1 + self.n_prediction_steps], + start_time=self.start_time, freq=self.freq, time_feature_transform=self.time_feature_transform, train_transforms=self.train_transform, @@ -321,7 +336,6 @@ def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": n_prediction_steps=self.n_prediction_steps, known_future_features_index=self.known_future_features_index, sp=self.sp, - only_has_past_targets=True, compute_mase_coefficient_value=False, time_features=cached_time_features) @@ -336,7 +350,7 @@ def get_test_target(self, test_idx: int): def update_attribute(self, **kwargs): for key, value in kwargs.items(): if not hasattr(self, key): - raise ValueError('Trying to update invalid attribute for TimeSeriesSequence!!!') + raise ValueError('Trying to update invalid attribute for TimeSeriesSequence!') setattr(self, key, value) @@ -378,7 +392,6 @@ def __init__(self, if y values needs to be normalized with mean 0 and variance 1 if the dataset is trained with log_prob losses, this needs to be specified in the very beginning such that the header's configspace can be built beforehand. - :param static_features: statistic features, invariant across different """ # Preprocess time series data information assert X is not Y, "Training and Test data needs to belong two different object!!!" 
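As a rough illustration of what the shared compute_time_features helper consolidated in this patch is expected to produce, here is a minimal standalone sketch. It uses a hand-written stand-in transform instead of the real gluonts TimeFeature classes (such as DayOfMonth) so that it runs without gluonts; all names and values below are illustrative only and not part of the patch.

import numpy as np
import pandas as pd

def toy_time_feature(index: pd.DatetimeIndex) -> np.ndarray:
    # stand-in for a gluonts TimeFeature such as DayOfMonth()
    return index.day.to_numpy(float)

# build a date range of `periods` steps and keep only the trailing window,
# then stack one column per transform: one row per time step
date_info = pd.date_range(start=pd.Timestamp('2000-01-31'), periods=6, freq='1M')[-4:]
features = np.vstack([toy_time_feature(date_info)]).T
print(features.shape)  # (4, 1)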
@@ -601,17 +614,7 @@ def compute_time_features(start_times: List[pd.DatetimeIndex], series_lengths_max[start_t] = seq_l series_time_features = {} for start_t, max_l in series_lengths_max.items(): - try: - date_info = pd.date_range(start=start_t, - periods=max_l, - freq=freq) - series_time_features[start_t] = np.vstack( - [transform(date_info).to_numpy(float) - if not isinstance(transform, ConstantTransform) else transform(date_info) - for transform in time_feature_transform] - ).T - except OutOfBoundsDatetime: - series_time_features[start_t] = np.zeros([max_l, len(time_feature_transform)]) + series_time_features[start_t] = compute_time_features(start_t, max_l, max_l, freq, time_feature_transform) return series_time_features def _get_dataset_indices(self, idx: int, only_dataset_idx: bool = False) -> Union[int, Tuple[int, int]]: @@ -779,7 +782,7 @@ def make_sequences_datasets(X: pd.DataFrame, if X_test is not None: x_test_group = X_test.groupby(X_test.index) - for i_ser, (start_train, y) in enumerate(zip(start_times, y_group)): + for i_ser, (start_time, y) in enumerate(zip(start_times, y_group)): ser_id = y[0] y_ser = y[1].transform(np.array).values x_ser = x_group.get_group(ser_id).transform(np.array).values if X is not None else None @@ -794,10 +797,10 @@ def make_sequences_datasets(X: pd.DataFrame, sequence = TimeSeriesSequence( X=x_ser, Y=y_ser, - start_time_train=start_train, + start_time=start_time, X_test=x_test_ser, Y_test=y_test_ser, - time_features=time_features[start_train][:len(y_ser)], + time_features=time_features[start_time][:len(y_ser)], is_test_set=is_test_set, **sequences_kwargs) sequence_datasets.append(sequence) diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 617e6b0e0..4f082d6de 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -30,7 +30,7 @@ REGRESSION_TASKS, STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, - TABULAR_TASKS, TIMESERIES_TASKS, + TABULAR_TASKS, FORECASTING_TASKS, ) from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index d238881a0..acd59226d 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -104,11 +104,11 @@ def fit_predict_and_loss(self) -> None: test_indices=test_split, add_pipeline_to_self=True) - mase_cofficient = self.generate_mase_coefficient_for_validation(test_split) + mase_coefficient = self.generate_mase_coefficient_for_validation(test_split) forecasting_kwargs = {'sp': self.seasonality, 'n_prediction_steps': self.n_prediction_steps, - 'mase_cofficient': mase_cofficient, + 'mase_coefficient': mase_coefficient, } train_loss = None @@ -119,12 +119,12 @@ def fit_predict_and_loss(self) -> None: status = StatusType.SUCCESS - self.Y_optimization *= mase_cofficient + self.Y_optimization *= mase_coefficient self.finish_up( loss=loss, train_loss=train_loss, - opt_pred=y_opt_pred * mase_cofficient, + opt_pred=y_opt_pred * mase_coefficient, valid_pred=y_valid_pred, test_pred=y_test_pred, additional_run_info=additional_run_info, @@ -175,7 +175,7 @@ def fit_predict_and_loss(self) -> None: # the average. 
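For readers unfamiliar with the MASE rescaling used throughout this evaluator, a small self-contained sketch (illustrative numbers only, not taken from the patch) shows why predictions and ground truth are both multiplied by a per-series coefficient before an ordinary absolute error is computed: the coefficient is 1 / MAE of the seasonal naive forecast on the past targets, so a plain MAE on the scaled values equals the mean absolute scaled error.

import numpy as np

past = np.array([10., 12., 11., 13., 12.])
future_true = np.array([14., 13.])
future_pred = np.array([13., 15.])

sp = 1  # seasonality period
mase_denominator = np.abs(past[sp:] - past[:-sp]).mean()
mase_coefficient = 1.0 / mase_denominator

# scaling both predictions and targets first ...
scaled_mae = np.abs(future_pred * mase_coefficient - future_true * mase_coefficient).mean()
# ... gives the same value as the usual MASE definition
mase = np.abs(future_pred - future_true).mean() / mase_denominator
assert np.isclose(scaled_mae, mase)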
train_fold_weights[i] = len(train_split) - forecasting_kwargs = {'mase_cofficient': mase_coefficient_all[i], + forecasting_kwargs = {'mase_coefficient': mase_coefficient_all[i], 'sp': self.seasonality, 'n_prediction_steps': self.n_prediction_steps, } diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index ef7461285..725bdcb43 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -84,7 +84,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: if 'n_decoder_output_features' not in X: future_features = X['dataset_properties']['known_future_features'] feature_shapes = X['dataset_properties']['feature_shapes'] - future_in_features = sum([feature_shapes[fu_feat] for fu_feat in future_features]).item() + + future_in_features = sum([feature_shapes[fu_feat] for fu_feat in future_features]) if X['transform_time_features']: n_time_feature_transform = len(X['dataset_properties']['time_feature_transform']) diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index 4548dfef0..b950d8ce0 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -57,7 +57,7 @@ # To avoid storing unnecessary scale values here, we scale all the values under # AutoPytorch.evaluation.time_series_forecasting_train_evaluator -def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int, n_prediction_steps: int) -> float: +def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> float: """ compute mase coefficient, then mase value is computed as mase_coefficient * mse_error, this function aims at reducing the memroy requirement @@ -72,9 +72,13 @@ def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int, n_pr if sp >= len(past_target): # in this case, we simply consider the mean value of the entire sequence # TODO condsider if there is a better way of handling this - mase_denominator = forecasting_metrics.mean_absolute_error(past_target, - np.zeros_like(past_target), - multioutput="raw_values") + try: + mase_denominator = forecasting_metrics.mean_absolute_error(past_target, + np.zeros_like(past_target), + multioutput="raw_values") + except ValueError: + return 1 + else: mase_denominator = forecasting_metrics.mean_absolute_error(past_target[sp:], past_target[:-sp], diff --git a/autoPyTorch/pipeline/components/training/metrics/utils.py b/autoPyTorch/pipeline/components/training/metrics/utils.py index 62be56098..5d4b70d58 100644 --- a/autoPyTorch/pipeline/components/training/metrics/utils.py +++ b/autoPyTorch/pipeline/components/training/metrics/utils.py @@ -126,9 +126,9 @@ def calculate_score( if task_type in FORECASTING_TASKS: cprediction = sanitize_array(prediction) for metric_ in metrics: - if metric_ in MASE_LOSSES and 'mase_cofficient' in score_kwargs: - target_scaled = target * score_kwargs['mase_cofficient'] - cprediction_scaled = cprediction * score_kwargs['mase_cofficient'] + if metric_ in MASE_LOSSES and 'mase_coefficient' in score_kwargs: + target_scaled = target * 
score_kwargs['mase_coefficient'] + cprediction_scaled = cprediction * score_kwargs['mase_coefficient'] score_dict[metric_.name] = metric_._sign * metric_(target_scaled, cprediction_scaled, **score_kwargs) else: score_dict[metric_.name] = metric_._sign * metric_(target, cprediction, **score_kwargs) diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 17549d7cb..956064531 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -296,7 +296,7 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, # mase_coefficent has the shape [B, 1, 1] # to be compatible with outputs_data with shape [B, n_prediction_steps, num_output] mase_coefficients = np.expand_dims(torch.cat(mase_coefficients, dim=0).numpy(), axis=[1]) - self.metrics_kwargs.update({'mase_cofficient': mase_coefficients}) + self.metrics_kwargs.update({'mase_coefficient': mase_coefficients}) self._scheduler_step(step_interval=StepIntervalUnit.valid, loss=loss_sum / N) diff --git a/test/test_data/test_forecasting_input_validator.py b/test/test_data/test_forecasting_input_validator.py index 566a6fb40..7f2bb852c 100644 --- a/test/test_data/test_forecasting_input_validator.py +++ b/test/test_data/test_forecasting_input_validator.py @@ -97,7 +97,7 @@ def test_forecasting_validator(): assert len(x_transformed) == sum(sequence_lengths) # y is only allowed to be None if validate_for_future_features is True - x_transformed, y_transformed, sequence_lengths = validator.transform(x, None, validate_for_future_features=True) + _ = validator.transform(x, None, validate_for_future_features=True) with pytest.raises(ValueError, match=r"Targets must be given!"): validator.transform(x) with pytest.raises(ValueError, match=r"Multi Variant dataset requires X as input!"): diff --git a/test/test_datasets/test_time_series_datasets.py b/test/test_datasets/test_time_series_datasets.py new file mode 100644 index 000000000..f34a939cc --- /dev/null +++ b/test/test_datasets/test_time_series_datasets.py @@ -0,0 +1,198 @@ +import numpy as np +import torch +import pandas as pd +import pytest +import unittest +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence +from gluonts.time_feature import Constant as ConstantTransform, DayOfMonth + + +class ZeroTransformer: + def __call__(self, x: np.ndarray): + return np.zeros_like(x) + + +class TestTimeSeriesSequence(unittest.TestCase): + def setUp(self) -> None: + rng = np.random.RandomState(1) + self.data_length = 10 + self.n_prediction_steps = 3 + + n_features = 5 + + self.x_data = rng.rand(self.data_length, n_features) + self.y = rng.rand(self.data_length, 1) + + self.x_test_data = rng.rand(self.n_prediction_steps, 5) + self.y_test = rng.rand(self.n_prediction_steps, 1) + self.time_feature_transform = [DayOfMonth(), ConstantTransform(10.0)] + self.known_future_features_index = [0, 2] + self.seq_uni = TimeSeriesSequence(X=None, Y=self.y, Y_test=self.y_test, + n_prediction_steps=self.n_prediction_steps, + time_feature_transform=self.time_feature_transform) + self.seq_multi = TimeSeriesSequence(X=self.x_data, + Y=self.y, + X_test=self.x_test_data, + Y_test=self.y_test, n_prediction_steps=self.n_prediction_steps, + 
time_feature_transform=self.time_feature_transform, + freq="1M") + self.seq_multi_with_future = TimeSeriesSequence(X=self.x_data, + Y=self.y, + X_test=self.x_test_data, + Y_test=self.y_test, n_prediction_steps=self.n_prediction_steps, + time_feature_transform=self.time_feature_transform, + known_future_features_index=self.known_future_features_index, + freq="1M") + + def test_sequence_uni_variant_base(self): + self.assertEqual(len(self.seq_uni), self.data_length - self.n_prediction_steps) + idx = 6 + data, target = self.seq_uni[idx] + self.assertTrue(isinstance(data['past_targets'], torch.Tensor)) + self.assertEqual(len(data['past_targets']), idx + 1) + self.assertEqual(data['decoder_lengths'], self.n_prediction_steps) + self.assertEqual(self.seq_uni.start_time, pd.Timestamp('1900-01-01')) + mase_coefficient_1 = data['mase_coefficient'] + self.assertEqual(mase_coefficient_1.size, 1) + # all data is observed + self.assertTrue(data['past_observed_targets'].all()) + + self.assertTrue(np.allclose(data['past_targets'].numpy(), + self.y[:idx + 1])) + self.assertTrue(np.allclose(target['future_targets'].numpy(), + self.y[idx + 1:1 + idx + self.n_prediction_steps])) + + self.assertTrue(target['future_observed_targets'].all()) + + self.assertTrue(self.seq_uni[-2][0]["past_targets"].size, self.data_length - self.n_prediction_steps - 2 + 1) + + def test_get_val_seq_and_test_targets(self): + val_seq = self.seq_uni.get_val_seq_set(-1) + self.assertEqual(len(val_seq), len(self.seq_uni)) + + val_seq = self.seq_uni.get_val_seq_set(5) + self.assertEqual(len(val_seq), 5 + 1) + + test_targets = self.seq_uni.get_test_target(-1) + self.assertTrue(np.all(self.y[-self.n_prediction_steps:] == test_targets)) + + test_targets = self.seq_uni.get_test_target(5) + self.assertTrue(np.all(self.y[5 + 1: 5 + 1 + self.n_prediction_steps] == test_targets)) + + def test_uni_get_update_time_faetures(self): + self.seq_uni.update_attribute(transform_time_features=True) + + data, target = self.seq_uni[3] + past_features = data["past_features"] + future_features = data["future_features"] + + self.assertEqual(len(self.seq_uni._cached_time_features), len(self.y)) + self.assertTrue(list(past_features.shape) == [3 + 1, len(self.time_feature_transform)]) + self.assertTrue(list(future_features.shape) == [self.n_prediction_steps, len(self.time_feature_transform)]) + self.assertTrue(torch.all(past_features[:, 1] == 10.)) + self.assertTrue(torch.all(future_features[:, 1] == 10.)) + + def test_uni_to_test_set(self): + # For test set, its length should equal to y's length + self.seq_uni.is_test_set = True + self.assertEqual(len(self.seq_uni), len(self.y)) + + self.seq_uni.transform_time_features = True + + data, target = self.seq_uni[-1] + self.assertTrue(target is None) + self.assertEqual(len(data["past_targets"]), len(self.y)) + self.assertEqual(len(data["past_features"]), len(self.y)) + self.assertEqual(len(self.seq_uni._cached_time_features), len(self.y) + self.n_prediction_steps) + + def test_observed_values(self): + y_with_nan = self.seq_uni.Y.copy() + y_with_nan[[3, -2]] = np.nan + seq_1 = TimeSeriesSequence(X=None, Y=y_with_nan, n_prediction_steps=self.n_prediction_steps) + data, target = seq_1[-1] + self.assertFalse(data["past_observed_targets"][3]) + self.assertTrue(target["future_observed_targets"][2]) + + def test_compute_mase_coefficient(self): + seq_2 = TimeSeriesSequence(X=None, Y=self.y, n_prediction_steps=self.n_prediction_steps, is_test_set=True) + self.assertNotEqual(self.seq_uni.mase_coefficient, seq_2.mase_coefficient) 
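The window semantics exercised by these assertions can be summarised with a short standalone sketch in plain numpy (no autoPyTorch objects; the numbers are arbitrary): for a training sequence of length T with forecasting horizon H, item i exposes the first i + 1 observations as past targets and the following H observations as future targets, so the number of valid training indices is T - H.

import numpy as np

T, H, i = 10, 3, 6
y = np.arange(T)
past, future = y[:i + 1], y[i + 1:i + 1 + H]
assert len(past) == i + 1
assert len(future) == H
assert T - H == 7  # number of valid training indices for this toy sequence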
+ + def test_sequence_multi_variant_base(self): + data, _ = self.seq_multi[-1] + self.assertEqual(list(data["past_features"].shape), [len(self.seq_multi), self.x_data.shape[-1]]) + self.assertTrue(data['future_features'] is None) + + data, _ = self.seq_multi[-1] + + def test_multi_known_future_variant(self): + data, _ = self.seq_multi_with_future[-1] + num_future_var = len(self.known_future_features_index) + future_features = data['future_features'] + self.assertEqual(list(future_features.shape), [self.n_prediction_steps, num_future_var]) + self.assertTrue(np.allclose( + future_features.numpy(), + self.x_data[-self.n_prediction_steps:, self.known_future_features_index]) + ) + + def test_multi_transform_features(self): + self.seq_multi_with_future.transform_time_features = True + num_future_var = len(self.known_future_features_index) + + data, _ = self.seq_multi_with_future[-1] + past_features = data["past_features"] + self.assertEqual(list(past_features.shape), + [len(self.seq_multi_with_future), self.x_data.shape[-1] + len(self.time_feature_transform)]) + + self.assertTrue(np.allclose( + past_features[:, -len(self.time_feature_transform):].numpy(), + self.seq_multi_with_future._cached_time_features[:-self.n_prediction_steps] + )) + + future_features = data["future_features"] + self.assertEqual(list(future_features.shape), + [self.n_prediction_steps, num_future_var + len(self.time_feature_transform)]) + + self.assertTrue(np.allclose( + future_features[:, -len(self.time_feature_transform):].numpy(), + self.seq_multi_with_future._cached_time_features[-self.n_prediction_steps:] + )) + + def test_multi_to_test_set(self): + self.seq_multi_with_future.is_test_set = True + self.assertEqual(len(self.seq_multi_with_future.X), len(self.x_data) + len(self.x_test_data)) + data, _ = self.seq_multi_with_future[-1] + + self.assertTrue(np.allclose(data["past_features"].numpy(), self.x_data)) + self.assertTrue( + np.allclose(data["future_features"].numpy(), self.x_test_data[:, self.known_future_features_index]) + ) + + self.seq_multi_with_future.is_test_set = False + self.assertEqual(len(self.seq_multi_with_future.X), len(self.x_data)) + + seq_2 = self.seq_multi_with_future.get_val_seq_set(6) + self.assertEqual(len(seq_2), 6 + 1) + + def test_transformation(self): + self.seq_multi.update_transform(ZeroTransformer(), train=True) + data, _ = self.seq_multi[-1] + self.assertTrue(torch.all(data['past_features'][:, :-len(self.time_feature_transform)] == 0.)) + + self.seq_multi.update_transform(ZeroTransformer(), train=False) + data, _ = self.seq_multi.__getitem__(-1, False) + self.assertTrue(torch.all(data['past_features'][:, :-len(self.time_feature_transform)] == 0.)) + + def test_exception(self): + seq_1 = TimeSeriesSequence(X=self.x_data, Y=self.y, X_test=None, + known_future_features_index=self.known_future_features_index) + with self.assertRaises(ValueError): + seq_1.is_test_set = True + + seq_2 = TimeSeriesSequence(X=self.x_data, Y=self.y, X_test=None, + is_test_set=True) + + with self.assertRaises(ValueError): + seq_2.get_val_seq_set(5) + + with self.assertRaises(ValueError): + seq_2.get_test_target(5) From 08bfe1894ded6773db5a3f1a1dc59c033f9f6976 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 5 May 2022 23:53:39 +0200 Subject: [PATCH 245/347] maint --- autoPyTorch/datasets/time_series_dataset.py | 1 - test/test_datasets/test_time_series_datasets.py | 7 +++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py 
b/autoPyTorch/datasets/time_series_dataset.py index 92e7caf38..6b2f45c2d 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -282,7 +282,6 @@ def compute_time_features(self, ): self.n_prediction_steps + self.Y.shape[0], self.n_prediction_steps, self.freq, self.time_feature_transform) - self._cached_time_features = np.concatenate([self._cached_time_features, time_feature_future]) def update_transform(self, transform: Optional[torchvision.transforms.Compose], diff --git a/test/test_datasets/test_time_series_datasets.py b/test/test_datasets/test_time_series_datasets.py index f34a939cc..903e699ed 100644 --- a/test/test_datasets/test_time_series_datasets.py +++ b/test/test_datasets/test_time_series_datasets.py @@ -70,8 +70,10 @@ def test_get_val_seq_and_test_targets(self): val_seq = self.seq_uni.get_val_seq_set(-1) self.assertEqual(len(val_seq), len(self.seq_uni)) + self.seq_uni.compute_time_features() val_seq = self.seq_uni.get_val_seq_set(5) self.assertEqual(len(val_seq), 5 + 1) + self.assertEqual(len(val_seq._cached_time_features), 5+1 + self.n_prediction_steps) test_targets = self.seq_uni.get_test_target(-1) self.assertTrue(np.all(self.y[-self.n_prediction_steps:] == test_targets)) @@ -79,7 +81,7 @@ def test_get_val_seq_and_test_targets(self): test_targets = self.seq_uni.get_test_target(5) self.assertTrue(np.all(self.y[5 + 1: 5 + 1 + self.n_prediction_steps] == test_targets)) - def test_uni_get_update_time_faetures(self): + def test_uni_get_update_time_features(self): self.seq_uni.update_attribute(transform_time_features=True) data, target = self.seq_uni[3] @@ -93,11 +95,12 @@ def test_uni_get_update_time_faetures(self): self.assertTrue(torch.all(future_features[:, 1] == 10.)) def test_uni_to_test_set(self): + self.seq_uni.transform_time_features = True + self.seq_uni.compute_time_features() # For test set, its length should equal to y's length self.seq_uni.is_test_set = True self.assertEqual(len(self.seq_uni), len(self.y)) - self.seq_uni.transform_time_features = True data, target = self.seq_uni[-1] self.assertTrue(target is None) From 478ad68af936c5b7d92b7c3606c40031a7a638ff Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 6 May 2022 13:27:16 +0200 Subject: [PATCH 246/347] test for resampling --- autoPyTorch/datasets/resampling_strategy.py | 41 +++++---- autoPyTorch/datasets/time_series_dataset.py | 12 +-- ...time_series_forecasting_train_evaluator.py | 1 - .../test_resampling_strategies.py | 83 ++++++++++++++++++- .../test_time_series_datasets.py | 3 +- 5 files changed, 107 insertions(+), 33 deletions(-) diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index c0522e7e9..2629671d2 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -48,7 +48,7 @@ def holdout_split_forecasting(holdout: TimeSeriesSplit, indices: np.ndarray, n_p try: train, val = list(holdout.split(indices))[-1] val = [val[-1 - i * n_prediction_steps] for i in reversed(range(n_repeat))] - except ValueError: + except (ValueError, IndexError): train = np.array([], dtype=indices.dtype) val = [-1] return indices[train], indices[val] @@ -267,6 +267,9 @@ def time_series_cross_validation(random_state: np.random.RandomState, Args: indices (np.ndarray): array of indices to be split num_splits (int): number of cross validation splits + n_prediction_steps(int): forecsting horizon, to ensure that there is no overlapping between splits + n_repeat (int): number of 
sequences inside each split, e.g., inside each split, we could ask the model to + predict n_reapet times Returns: splits (List[Tuple[List, List]]): list of tuples of training and validation indices @@ -298,12 +301,14 @@ def time_series_ts_cross_validation(random_state: np.random.RandomState, The first holdout setting: trend setting, simply consider the tail of the sequence as validation sets and the part before as training set The second holdout setting: seasonality setting, ensures that the distance between validation sets and test sets - is the multiple of seasonality period. Thus we could ensure that validation and test sets are at the same + is a multiple of seasonality period. We could thus ensure that validation and test sets are at the same position of the period Args: indices (np.ndarray): array of indices to be split num_splits (int): number of cross validation splits + seasonality_h_value (int): distance between the start of the validation set and the test set, this value + need to be (roughly) a multiple of freq_value Returns: splits (List[Tuple[List, List]]): list of tuples of training and validation indices @@ -311,8 +316,9 @@ def time_series_ts_cross_validation(random_state: np.random.RandomState, n_prediction_steps = kwargs['n_prediction_steps'] seasonality_h_value = kwargs['seasonality_h_value'] n_repeat = kwargs["n_repeat"] + assert seasonality_h_value >= n_prediction_steps - cv = TimeSeriesSplit(n_splits=2, test_size=1, gap=n_prediction_steps - 1) + cv = TimeSeriesSplit(n_splits=num_splits, test_size=n_prediction_steps * n_repeat, gap=0) train_t, val_t = holdout_split_forecasting(holdout=cv, indices=indices, @@ -320,25 +326,16 @@ def time_series_ts_cross_validation(random_state: np.random.RandomState, n_repeat=n_repeat) splits = [(train_t, val_t)] - if len(indices) < seasonality_h_value - n_prediction_steps: - if len(indices) == 1: - train_s = train_t - val_s = val_t - else: - train_s, val_s = holdout_split_forecasting(cv, indices[:-1], - n_prediction_steps=n_prediction_steps, - n_repeat=n_repeat) - else: - train_s, val_s = holdout_split_forecasting(cv, indices[:-seasonality_h_value + n_prediction_steps], - n_prediction_steps=n_prediction_steps, - n_repeat=n_repeat) - splits.append((train_s, val_s)) - if num_splits > 2: - freq_value = int(kwargs['freq_value']) - for i_split in range(2, num_splits): - n_tail = (i_split - 1) * freq_value + seasonality_h_value - n_prediction_steps - train_s, val_s = holdout_split_forecasting(cv, indices[:-n_tail], - n_prediction_steps=n_prediction_steps, + if num_splits > 1: + for i_split in range(1, num_splits): + n_tail = i_split * seasonality_h_value + if n_tail > len(indices): + # normally this should not happen as seasonality_h_value is carefully computed by ForecastingDataset + indices_split = indices + else: + indices_split = indices[:-n_tail] + train_s, val_s = holdout_split_forecasting(cv, indices_split, + n_prediction_steps=seasonality_h_value, n_repeat=n_repeat) splits.append((train_s, val_s)) return splits diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 6b2f45c2d..f4555d7fb 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -989,7 +989,7 @@ def get_split_strategy(sequence_lengths: List[int], seasonality_h_value = int( np.round((n_prediction_steps // int(freq_value) + 1) * freq_value)) - while minimal_seq_length < (num_splits - 1) * freq_value + seasonality_h_value - n_prediction_steps: + while minimal_seq_length 
< (num_splits - 1) * seasonality_h_value: if num_splits <= 2: break num_splits -= 1 @@ -1016,15 +1016,11 @@ def get_split_strategy(sequence_lengths: List[int], n_repeat = min(n_repeat, minimal_seq_length // (5 * n_prediction_steps * num_splits)) elif resampling_strategy == CrossValTypes.time_series_ts_cross_validation: seasonality_h_value = int(np.round( - (n_prediction_steps * n_repeat // int(freq_value) + 1) * freq_value) + (n_prediction_steps // int(freq_value) + 1) * freq_value) ) - - while minimal_seq_length // 5 < (num_splits - 1) * n_repeat * freq_value \ - + seasonality_h_value - n_repeat * n_prediction_steps: + while minimal_seq_length // 5 < (num_splits - 1) * n_repeat * seasonality_h_value: n_repeat -= 1 - seasonality_h_value = int(np.round( - (n_prediction_steps * n_repeat // int(freq_value) + 1) * freq_value) - ) + elif resampling_strategy == HoldoutValTypes.time_series_hold_out_validation: n_repeat = min(n_repeat, minimal_seq_length // (5 * n_prediction_steps) - 1) diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index acd59226d..089e036b3 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -326,7 +326,6 @@ def _predict(self, pipeline: BaseEstimator, opt_pred = opt_pred.reshape(-1, self.num_targets) - # TODO we consider X_valid and X_test as a multiple sequences??? if self.X_valid is not None: valid_sets = [] for val_seq in enumerate(self.datamanager.datasets): diff --git a/test/test_datasets/test_resampling_strategies.py b/test/test_datasets/test_resampling_strategies.py index 7f14275a3..6300b6e5c 100644 --- a/test/test_datasets/test_resampling_strategies.py +++ b/test/test_datasets/test_resampling_strategies.py @@ -1,5 +1,4 @@ import numpy as np - from autoPyTorch.datasets.resampling_strategy import CrossValFuncs, HoldOutFuncs @@ -20,6 +19,42 @@ def test_holdoutfuncs(): assert 0 in y[val] assert 0 in y[train] + # Forecasting + n_prediction_steps = 3 + n_repeats = 1 + train, val = split.time_series_hold_out_validation(0, 0, X, n_prediction_steps=n_prediction_steps, + n_repeat=n_repeats) + # val must start n_predictions_steps after train + assert val[0] - train[-1] == n_prediction_steps + assert len(val) == n_repeats + + n_prediction_steps = 2 + n_repeats = 2 + train, val = split.time_series_hold_out_validation(0, 0, X, n_prediction_steps=n_prediction_steps, + n_repeat=n_repeats) + assert val[0] - train[-1] == n_prediction_steps + assert len(val) == n_repeats + # No overlapping between different splits + assert val[1] - val[0] == n_prediction_steps + + # Failure case + # Forecasting + n_prediction_steps = 10 + n_repeats = 1 + train, val = split.time_series_hold_out_validation(0, 0, X, n_prediction_steps=n_prediction_steps, + n_repeat=n_repeats) + # n_prediction steps is larger than the length of the sequence + assert len(train) == 0 + assert val == 9 + + # TODO Theoretically, this should work properly, we need to write our own spliter + n_prediction_steps = 2 + n_repeats = 3 + train, val = split.time_series_hold_out_validation(0, 0, X, n_prediction_steps=n_prediction_steps, + n_repeat=n_repeats) + assert len(train) == 0 + assert val == 9 + def test_crossvalfuncs(): split = CrossValFuncs() @@ -40,3 +75,49 @@ def test_crossvalfuncs(): splits = split.stratified_k_fold_cross_validation(0, 10, X, stratify=y) assert len(splits) == 10 assert all([0 in y[s[1]] for s in splits]) + + def 
eval_ts_cv(num_splits, n_prediction_steps, n_repeats): + splits = split.time_series_cross_validation(0, num_splits, X, + n_prediction_steps=n_prediction_steps, n_repeat=n_repeats) + assert len(splits) == num_splits + for i, sp in enumerate(splits): + assert len(sp[1]) == n_repeats + assert sp[1][0] - sp[0][-1] == n_prediction_steps + if i > 0: + assert sp[1][0] - splits[i - 1][1][-1] == n_prediction_steps + + eval_ts_cv(2, 10, 1) + eval_ts_cv(3, 10, 3) + + def eval_ts_sea_cv(num_splits, n_prediction_steps, n_repeats, freq_value): + seasonality_h_value = int(np.round((n_prediction_steps // int(freq_value) + 1) * freq_value)) + splits = split.time_series_ts_cross_validation(0, num_splits=num_splits, + indices=X, + n_prediction_steps=n_prediction_steps, + n_repeat=n_repeats, + seasonality_h_value=seasonality_h_value) + assert len(splits) == num_splits + assert splits[0][1][-1] == len(X) - 1 + if num_splits > 1: + for i in range(1, num_splits): + dis_val_start_to_test = len(X) - 1 - (splits[i][1]) + assert np.all(dis_val_start_to_test % freq_value == 0) + + eval_ts_sea_cv(2, 10, 2, 6) + eval_ts_sea_cv(2, 10, 1, 12) + eval_ts_sea_cv(3, 10, 1, 6) + + n_prediction_steps = 10 + freq_value = 24 + n_repeats = 1 + num_splits = 2 + seasonality_h_value = int(np.round((n_prediction_steps // int(freq_value) + 1) * freq_value)) + + sp2 = split.time_series_ts_cross_validation(0, num_splits=num_splits, + indices=X[:20], + n_prediction_steps=n_prediction_steps, + n_repeat=n_repeats, + seasonality_h_value=seasonality_h_value) + # We cannot do a split, thus the two splits are the same + + assert np.all(sp2[1][1] == sp2[0][1]) diff --git a/test/test_datasets/test_time_series_datasets.py b/test/test_datasets/test_time_series_datasets.py index 903e699ed..4be0a1a64 100644 --- a/test/test_datasets/test_time_series_datasets.py +++ b/test/test_datasets/test_time_series_datasets.py @@ -101,7 +101,6 @@ def test_uni_to_test_set(self): self.seq_uni.is_test_set = True self.assertEqual(len(self.seq_uni), len(self.y)) - data, target = self.seq_uni[-1] self.assertTrue(target is None) self.assertEqual(len(data["past_targets"]), len(self.y)) @@ -199,3 +198,5 @@ def test_exception(self): with self.assertRaises(ValueError): seq_2.get_test_target(5) + + From 1986593d57a730671145bc0e89750bac56e31c02 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sat, 7 May 2022 22:15:25 +0200 Subject: [PATCH 247/347] test for dataset 1 --- autoPyTorch/datasets/time_series_dataset.py | 84 +++++++++---------- .../setup/network/forecasting_network.py | 3 +- .../time_series_forecasting_data_loader.py | 6 +- test/conftest.py | 73 +++++++++------- .../test_time_series_datasets.py | 37 +++++++- 5 files changed, 125 insertions(+), 78 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index f4555d7fb..97019eb92 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -216,7 +216,7 @@ def __getitem__(self, index: int, train: bool = True) \ if self.transform_time_features: if self.time_feature_transform: - self.compute_time_features() + self.cache_time_features() if past_features is not None: past_features = np.hstack([past_features, self._cached_time_features[:index + 1]]) @@ -267,7 +267,7 @@ def __getitem__(self, index: int, train: bool = True) \ def __len__(self) -> int: return self.Y.shape[0] if self.is_test_set else self.Y.shape[0] - self.n_prediction_steps - def compute_time_features(self, ): + def cache_time_features(self, ): if 
self._cached_time_features is None: periods = self.Y.shape[0] if self.is_test_set: @@ -395,35 +395,11 @@ def __init__(self, # Preprocess time series data information assert X is not Y, "Training and Test data needs to belong two different object!!!" - if freq is None: - self.freq = None - self.freq_value = None - - if isinstance(freq, str): - if freq not in SEASONALITY_MAP: - Warning("The given freq name is not supported by our dataset, we will use the default " - "configuration space on the hyperparameter window_size, if you want to adapt this value" - "you could pass freq with a numerical value") - freq_value = SEASONALITY_MAP.get(freq, None) - else: - freq_value = freq - - if isinstance(freq_value, list): - min_base_size = min(n_prediction_steps, MAX_WINDOW_SIZE_BASE) - if np.max(freq_value) < min_base_size: - tmp_freq = max(freq_value) - else: - tmp_freq = min([freq_value_item for - freq_value_item in freq_value if freq_value_item >= min_base_size]) - freq_value = tmp_freq - - seasonality = SEASONALITY_MAP.get(freq, 1) - if isinstance(seasonality, list): - seasonality = min(seasonality) # Use to calculate MASE + seasonality, freq, freq_value = self.compute_freq_values(freq, n_prediction_steps) self.seasonality = int(seasonality) - self.freq: Optional[str] = freq - self.freq_value: Optional[int] = freq_value + self.freq: str = freq + self.freq_value: int = freq_value self.n_prediction_steps = n_prediction_steps @@ -442,8 +418,7 @@ def __init__(self, if not self.validator._is_fitted: self.validator.fit(X_train=X, y_train=Y, X_test=X_test, y_test=Y_test, - start_times=start_times, - n_prediction_steps=n_prediction_steps) + start_times=start_times) self.is_uni_variant = self.validator._is_uni_variant @@ -498,7 +473,7 @@ def __init__(self, sequence_datasets, train_tensors, sequence_lengths = self.transform_data_into_time_series_sequence( X, Y, - start_times=start_times, + start_times=self.start_times, X_test=X_test, Y_test=Y_test, ) @@ -597,6 +572,39 @@ def __init__(self, self.lagged_value = lagged_value + def compute_freq_values(self, + freq: Optional[Union[str, int, List[int]]], + n_prediction_steps: int) -> Tuple[Real, str, Real]: + """ + Compute frequency related values + """ + if freq is None: + freq = '1Y' + + if isinstance(freq, str): + if freq not in SEASONALITY_MAP: + Warning("The given freq name is not supported by our dataset, we will use the default " + "configuration space on the hyperparameter window_size, if you want to adapt this value" + "you could pass freq with a numerical value") + freq_value = SEASONALITY_MAP.get(freq, None) + else: + freq_value = freq + freq = '1Y' + + if isinstance(freq_value, list): + min_base_size = min(n_prediction_steps, MAX_WINDOW_SIZE_BASE) + if np.max(freq_value) < min_base_size: + tmp_freq = max(freq_value) + else: + tmp_freq = min([freq_value_item for + freq_value_item in freq_value if freq_value_item >= min_base_size]) + freq_value = tmp_freq + + seasonality = SEASONALITY_MAP.get(freq, 1) + if isinstance(seasonality, list): + seasonality = min(seasonality) # Use to calculate MASE + return seasonality, freq, freq_value + @staticmethod def compute_time_features(start_times: List[pd.DatetimeIndex], seq_lengths: List[int], @@ -729,7 +737,7 @@ def transform_data_into_time_series_sequence(self, return sequence_datasets, train_tensors, sequence_lengths @staticmethod - def make_sequences_datasets(X: pd.DataFrame, + def make_sequences_datasets(X: Optional[pd.DataFrame], Y: pd.DataFrame, start_times: List[pd.DatetimeIndex], time_features: 
Optional[Dict[pd.Timestamp, np.ndarray]] = None, @@ -923,18 +931,8 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> dataset_properties = super().get_dataset_properties(dataset_requirements=dataset_requirements) dataset_properties.update({'n_prediction_steps': self.n_prediction_steps, 'sp': self.seasonality, # For metric computation, - 'freq': self.freq, - 'sequence_lengths_train': self.sequence_lengths_train, - 'seq_length_max': self.seq_length_max, 'input_shape': self.input_shape, - 'lagged_value': self.lagged_value, - 'feature_names': self.feature_names, - 'feature_shapes': self.feature_shapes, - 'known_future_features': self.known_future_features, - 'static_features': self.static_features, 'time_feature_transform': self.time_feature_transform, - 'time_feature_names': self.time_feature_names, - 'future_feature_shapes': self.future_feature_shapes, 'uni_variant': self.is_uni_variant, 'targets_have_missing_values': self.train_tensors[1].isnull().values.any(), 'features_have_missing_values': False if self.train_tensors[0] is None diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 8fac068e3..3f6008bc4 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -55,7 +55,8 @@ def _required_fit_requirements(self): FitRequirement("feature_shapes", (Iterable,), user_defined=False, dataset_property=True), FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), FitRequirement('static_features', (Tuple,), user_defined=True, dataset_property=False), - FitRequirement('time_feature_names', (Iterable,), user_defined=False, dataset_property=True) + FitRequirement('time_feature_names', (Iterable,), user_defined=True, dataset_property=True), + FitRequirement("static_features", (Tuple, ), user_defined=True, dataset_property=True) ] def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 1710ddba0..baddedb9c 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -92,7 +92,9 @@ def __init__(self, self.add_fit_requirements( [FitRequirement("known_future_features", (Tuple,), user_defined=True, dataset_property=True), FitRequirement("feature_shapes", (Dict,), user_defined=True, dataset_property=True), - FitRequirement("feature_names", (Tuple, ), user_defined=True, dataset_property=True)]) + FitRequirement("feature_names", (Tuple, ), user_defined=True, dataset_property=True), + FitRequirement("sequence_lengths_train", (List,), user_defined=True, dataset_property=True), + FitRequirement("freq", (str, ), user_defined=True, dataset_property=True)]) def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: """ @@ -339,7 +341,7 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd x_seq.update_attribute(**update_dict) if self.transform_time_features: - x_seq.compute_time_features() + x_seq.cache_time_features() x_seq.freq = self.freq if not self.dataset_small_preprocess: diff --git a/test/conftest.py 
b/test/conftest.py index a8f5860d6..232289915 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -621,7 +621,23 @@ def input_data_featuretest(request): # Forecasting tasks -def get_forecasting_data(uni_variant, with_missing_value=False, type_X='pd', with_series_id=False): +def get_forecasting_data(request): + uni_variant = False + targets_with_missing_value = False + features_with_missing_value = False + type_X = 'pd' + with_series_id = False + if request == 'uni_variant_wo_missing': + uni_variant = True + elif request == 'uni_variant_w_missing': + uni_variant = True + targets_with_missing_value = True + elif request == 'multi_variant_wo_missing': + with_missing_value = False + elif request == 'multi_variant_w_missing': + features_with_missing_value = True + else: + raise NotImplementedError generator = check_random_state(0) n_seq = 10 base_length = 50 @@ -635,9 +651,9 @@ def get_forecasting_data(uni_variant, with_missing_value=False, type_X='pd', wit # for categorical features, the following character indicate how the feature is stored: # s: stored as string; n: stored as if type_X == 'pd': - feature_columns = ['n1', 'cs2_10', 'f3', 'cn4_5', 's5'] + feature_columns = ['n1', 'cs2_10', 'n3', 'cn4_5', 'n5'] else: - feature_columns = ['n1', 'cn2_5', 'f3', 'cn4_5', 's5'] + feature_columns = ['n1', 'cn2_5', 'n3', 'cn4_5', 'n5'] def generate_forecasting_features(feature_type, length): feature_type_content = list(feature_type) @@ -645,7 +661,7 @@ def generate_forecasting_features(feature_type, length): # numerical features return generator.rand(length) elif feature_type_content[0] == 'c': - num_class = int(feature_type.split("_")) + num_class = int(feature_type.split("_")[-1]) if feature_type_content[1] == 's': return generator.choice([f'value_{feature_id}' for feature_id in range(num_class)], size=length, replace=True) @@ -656,34 +672,43 @@ def generate_forecasting_features(feature_type, length): else: raise NotImplementedError + features = [] for i in range(n_seq): - new_seq = np.arange(i * 1000, base_length + i * 1010) + new_seq = np.arange(i * 1000, base_length + i * 1010).astype(np.float) series_length = base_length + i * 10 targets.append(np.arange(i * 1000, series_length + i * 1000)) if not uni_variant: if type_X == 'np': - features = np.asarray([generate_forecasting_features(col, series_length) for col in feature_columns]) + feature = np.asarray([generate_forecasting_features(col, series_length) for col in feature_columns]) elif type_X == 'pd': - features = {col: generate_forecasting_features(col, series_length) for col in feature_columns} + feature = {col: generate_forecasting_features(col, series_length) for col in feature_columns} if with_series_id: - features["series_id"] = [i] * series_length - features = pd.DataFrame( - features + feature["series_id"] = [i] * series_length + feature = pd.DataFrame( + feature ) + + for col in feature.columns: + if col.startswith("n"): + feature[col] = feature[col].astype('float') + elif col.startswith("cs"): + feature[col] = feature[col].astype('category') + elif col.startswith("cn"): + feature[col] = feature[col].astype('int') else: raise NotImplementedError - features.append(features) + features.append(feature) - if with_missing_value: + if targets_with_missing_value: new_seq[5] = np.NAN new_seq[-5] = np.NAN - start_time = datetime.strptime(f'1900{i // 5}-01-01 00-00-00', '%Y-%m-%d %H-%M-%S') + start_time = datetime.datetime.strptime(f'190{i // 5}-01-01 00-00-00', '%Y-%m-%d %H-%M-%S') start_times.append(start_time) input_validator = 
TimeSeriesForecastingInputValidator(is_classification=False) - features = features if features else None - return features, targets, input_validator.fit(features, targets, start_times=start_times), feature_columns + features = features if len(features) > 0 else None + return features, targets, input_validator.fit(features, targets, start_times=start_times) def get_forecasting_fit_dictionary(X, y, validator, backend, budget_type='epochs', forecast_horizon=5, freq='1D'): @@ -789,21 +814,9 @@ def input_data_forecastingfeaturetest(request): @pytest.fixture -def fit_dictionary_uni_variant_wo_missing(): - x, y, validator = get_forecasting_data(uni_variant=True, with_missing_value=False) - return get_forecasting_fit_dictionary(x, y, validator) - - -@pytest.fixture -def fit_dictionary_uni_variant_w_missing(): - x, y, validator = get_forecasting_data(uni_variant=True, with_missing_value=True) - return get_forecasting_fit_dictionary(x, y, validator) - - -@pytest.fixture -def fit_dictionary_uni_variant_wo_missing(): - x, y, validator = get_forecasting_data(uni_variant=False, with_missing_value=False) - return get_forecasting_fit_dictionary(x, y, validator) +def get_fit_dictionary_forecasting(request, backend): + X, y, validator = get_forecasting_data(request.param) + return get_forecasting_fit_dictionary(X, y, validator, backend) # Fixtures for forecasting validators. diff --git a/test/test_datasets/test_time_series_datasets.py b/test/test_datasets/test_time_series_datasets.py index 4be0a1a64..6711cfef5 100644 --- a/test/test_datasets/test_time_series_datasets.py +++ b/test/test_datasets/test_time_series_datasets.py @@ -1,3 +1,5 @@ +from typing import List, Callable, Tuple + import numpy as np import torch import pandas as pd @@ -5,6 +7,7 @@ import unittest from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence from gluonts.time_feature import Constant as ConstantTransform, DayOfMonth +from autoPyTorch.utils.pipeline import get_dataset_requirements class ZeroTransformer: @@ -70,7 +73,7 @@ def test_get_val_seq_and_test_targets(self): val_seq = self.seq_uni.get_val_seq_set(-1) self.assertEqual(len(val_seq), len(self.seq_uni)) - self.seq_uni.compute_time_features() + self.seq_uni.cache_time_features() val_seq = self.seq_uni.get_val_seq_set(5) self.assertEqual(len(val_seq), 5 + 1) self.assertEqual(len(val_seq._cached_time_features), 5+1 + self.n_prediction_steps) @@ -96,7 +99,7 @@ def test_uni_get_update_time_features(self): def test_uni_to_test_set(self): self.seq_uni.transform_time_features = True - self.seq_uni.compute_time_features() + self.seq_uni.cache_time_features() # For test set, its length should equal to y's length self.seq_uni.is_test_set = True self.assertEqual(len(self.seq_uni), len(self.y)) @@ -200,3 +203,33 @@ def test_exception(self): seq_2.get_test_target(5) +@pytest.mark.parametrize("get_fit_dictionary_forecasting", ['uni_variant_wo_missing', + 'uni_variant_w_missing', + 'multi_variant_wo_missing', + 'uni_variant_w_missing'], indirect=True) +def test_dataset_properties(backend, get_fit_dictionary_forecasting): + # The fixture creates a datamanager by itself + datamanager: TimeSeriesForecastingDataset = backend.load_datamanager() + info = {'task_type': datamanager.task_type, + 'numerical_features': datamanager.numerical_features, + 'categorical_features': datamanager.categorical_features, + 'output_type': datamanager.output_type, + 'numerical_columns': datamanager.numerical_columns, + 'categorical_columns': datamanager.categorical_columns, 
+ 'target_columns': (1,), + 'issparse': False} + + dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info)) + assert dataset_properties['n_prediction_steps'] == datamanager.n_prediction_steps + assert dataset_properties['sp'] == datamanager.seasonality + assert dataset_properties['freq'] == datamanager.freq + assert isinstance(dataset_properties['input_shape'], Tuple) + assert isinstance(dataset_properties['time_feature_transform'], List) + for item in dataset_properties['time_feature_transform']: + assert isinstance(item, Callable) + assert dataset_properties['uni_variant'] == (get_fit_dictionary_forecasting['X_train'] is None) + assert dataset_properties['targets_have_missing_values'] == \ + get_fit_dictionary_forecasting['y_train'].isnull().values.any() + if get_fit_dictionary_forecasting['X_train'] is not None: + assert dataset_properties['features_have_missing_values'] == \ + get_fit_dictionary_forecasting['X_train'].isnull().values.any() \ No newline at end of file From 112c8765b0b78333884432cb0f6dfd1a4dfd928b Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 8 May 2022 20:01:52 +0200 Subject: [PATCH 248/347] test for datasets --- .../data/time_series_forecasting_validator.py | 2 +- autoPyTorch/datasets/resampling_strategy.py | 34 ++-- autoPyTorch/datasets/time_series_dataset.py | 95 ++++----- .../scaling/utils.py | 2 +- test/conftest.py | 16 +- .../test_resampling_strategies.py | 19 +- .../test_time_series_datasets.py | 188 +++++++++++++++++- 7 files changed, 279 insertions(+), 77 deletions(-) diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 0ff0d0883..ee39c11db 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -135,7 +135,7 @@ def transform( """ if not self._is_fitted: raise NotFittedError("Cannot call transform on a validator that is not fitted") - if validate_for_future_features: + if validate_for_future_features and y is None: if X is None: return None, None, [] if isinstance(X, List): diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 2629671d2..ef073827f 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -40,14 +40,14 @@ def __call__(self, random_state: np.random.RandomState, val_share: float, def holdout_split_forecasting(holdout: TimeSeriesSplit, indices: np.ndarray, n_prediction_steps: int, - n_repeat: int = 1) -> Tuple[np.ndarray, np.ndarray]: + n_repeats: int = 1) -> Tuple[np.ndarray, np.ndarray]: """ A function that do holdout split without raising an error: When the target sequence is too short to be split into training and validation set, the training set will simply ignore that and we only consider the validation set. 
""" try: train, val = list(holdout.split(indices))[-1] - val = [val[-1 - i * n_prediction_steps] for i in reversed(range(n_repeat))] + val = [val[-1 - i * n_prediction_steps] for i in reversed(range(n_repeats))] except (ValueError, IndexError): train = np.array([], dtype=indices.dtype) val = [-1] @@ -180,16 +180,16 @@ def time_series_hold_out_validation(random_state: np.random.RandomState, Returns: """ n_prediction_steps = kwargs['n_prediction_steps'] - n_repeat = kwargs['n_repeat'] + n_repeats = kwargs['n_repeats'] # Time Series prediction only requires on set of prediction for each # This implement needs to be combined with time series forecasting dataloader, where each time an entire # time series is used for prediction - cv = TimeSeriesSplit(n_splits=2, test_size=1 + n_prediction_steps * (n_repeat - 1), gap=n_prediction_steps - 1) + cv = TimeSeriesSplit(n_splits=2, test_size=1 + n_prediction_steps * (n_repeats - 1), gap=n_prediction_steps - 1) train, val = holdout_split_forecasting(holdout=cv, indices=indices, n_prediction_steps=n_prediction_steps, - n_repeat=n_repeat) + n_repeats=n_repeats) return train, val @classmethod @@ -268,7 +268,7 @@ def time_series_cross_validation(random_state: np.random.RandomState, indices (np.ndarray): array of indices to be split num_splits (int): number of cross validation splits n_prediction_steps(int): forecsting horizon, to ensure that there is no overlapping between splits - n_repeat (int): number of sequences inside each split, e.g., inside each split, we could ask the model to + n_repeats (int): number of sequences inside each split, e.g., inside each split, we could ask the model to predict n_reapet times Returns: @@ -283,11 +283,11 @@ def time_series_cross_validation(random_state: np.random.RandomState, """ test_size = kwargs['n_prediction_steps'] - n_repeat = kwargs['n_repeat'] - cv = TimeSeriesSplit(n_splits=num_splits, test_size=test_size * n_repeat, gap=0) + n_repeats = kwargs['n_repeats'] + cv = TimeSeriesSplit(n_splits=num_splits, test_size=test_size * n_repeats, gap=0) splits = [( indices[split[0]], - indices[split[1][[-1 - n * test_size for n in reversed(range(n_repeat))]]]) for split in cv.split(indices)] + indices[split[1][[-1 - n * test_size for n in reversed(range(n_repeats))]]]) for split in cv.split(indices)] return splits @staticmethod @@ -315,20 +315,22 @@ def time_series_ts_cross_validation(random_state: np.random.RandomState, """ n_prediction_steps = kwargs['n_prediction_steps'] seasonality_h_value = kwargs['seasonality_h_value'] - n_repeat = kwargs["n_repeat"] + n_repeats = kwargs["n_repeats"] assert seasonality_h_value >= n_prediction_steps - cv = TimeSeriesSplit(n_splits=num_splits, test_size=n_prediction_steps * n_repeat, gap=0) + cv = TimeSeriesSplit(n_splits=2, test_size=n_prediction_steps * n_repeats, gap=0) train_t, val_t = holdout_split_forecasting(holdout=cv, indices=indices, n_prediction_steps=n_prediction_steps, - n_repeat=n_repeat) + n_repeats=n_repeats) splits = [(train_t, val_t)] if num_splits > 1: + cv = TimeSeriesSplit(n_splits=2, test_size=seasonality_h_value * n_repeats, gap=0) + n_tail = - n_prediction_steps for i_split in range(1, num_splits): - n_tail = i_split * seasonality_h_value + n_tail += seasonality_h_value if n_tail > len(indices): # normally this should not happen as seasonality_h_value is carefully computed by ForecastingDataset indices_split = indices @@ -336,7 +338,11 @@ def time_series_ts_cross_validation(random_state: np.random.RandomState, indices_split = indices[:-n_tail] train_s, val_s = 
holdout_split_forecasting(cv, indices_split, n_prediction_steps=seasonality_h_value, - n_repeat=n_repeat) + n_repeats=n_repeats) + if len(train_s) > 0: + train_s = np.concatenate( + [train_s, np.arange(seasonality_h_value - n_prediction_steps) + train_s[-1] + 1] + ) splits.append((train_s, val_s)) return splits diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 97019eb92..bbef75c13 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -1,4 +1,5 @@ import os +import pdb from typing import Any, Dict, List, Optional, Tuple, Union, cast from numbers import Real import uuid @@ -437,7 +438,7 @@ def __init__(self, self.static_features = self.validator.feature_validator.static_features - self._transform_time_feature = False + self._transform_time_features = False if not time_feature_transform: time_feature_transform = time_features_from_frequency_str(self.freq) if not time_feature_transform: @@ -460,6 +461,8 @@ def __init__(self, self.feature_names, queried_features=known_future_features) + self.known_future_features = tuple(known_future_features) + # initialize datasets self.sequences_builder_kwargs = {"freq": self.freq, "time_feature_transform": self.time_feature_transform, @@ -481,8 +484,6 @@ def __init__(self, ConcatDataset.__init__(self, datasets=sequence_datasets) - self.known_future_features = tuple(known_future_features) - self.num_sequences = len(Y) self.sequence_lengths_train = np.asarray(sequence_lengths) - n_prediction_steps @@ -572,8 +573,8 @@ def __init__(self, self.lagged_value = lagged_value - def compute_freq_values(self, - freq: Optional[Union[str, int, List[int]]], + @staticmethod + def compute_freq_values(freq: Optional[Union[str, int, List[int]]], n_prediction_steps: int) -> Tuple[Real, str, Real]: """ Compute frequency related values @@ -586,11 +587,12 @@ def compute_freq_values(self, Warning("The given freq name is not supported by our dataset, we will use the default " "configuration space on the hyperparameter window_size, if you want to adapt this value" "you could pass freq with a numerical value") - freq_value = SEASONALITY_MAP.get(freq, None) + freq_value = SEASONALITY_MAP.get(freq, 1) else: freq_value = freq freq = '1Y' + seasonality = freq_value if isinstance(freq_value, list): min_base_size = min(n_prediction_steps, MAX_WINDOW_SIZE_BASE) if np.max(freq_value) < min_base_size: @@ -600,7 +602,6 @@ def compute_freq_values(self, freq_value_item in freq_value if freq_value_item >= min_base_size]) freq_value = tmp_freq - seasonality = SEASONALITY_MAP.get(freq, 1) if isinstance(seasonality, list): seasonality = min(seasonality) # Use to calculate MASE return seasonality, freq, freq_value @@ -638,6 +639,9 @@ def _get_dataset_indices(self, idx: int, only_dataset_idx: bool = False) -> Unio sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] return dataset_idx, sample_idx + def __len__(self): + return ConcatDataset.__len__(self) + def __getitem__(self, idx, train=True): dataset_idx, sample_idx = self._get_dataset_indices(idx) return self.datasets[dataset_idx].__getitem__(sample_idx, train) @@ -702,7 +706,7 @@ def transform_data_into_time_series_sequence(self, train_tensors: Tuple[List[np.ndarray], List[np.ndarray]] training tensors """ - dataset_with_future_features = is_test_set and X is not None and len(self.known_future_features) > 0 + dataset_with_future_features = X is not None and len(self.known_future_features) > 0 if dataset_with_future_features and 
X_test is None: raise ValueError('When constructing test sets and known future features exist, X_test must be given!') X, Y, sequence_lengths = self.validator.transform(X, Y) @@ -711,7 +715,7 @@ def transform_data_into_time_series_sequence(self, self.freq, self.time_feature_transform) - if Y_test is not None: + if Y_test is not None or X_test is not None: X_test, Y_test, _ = self.validator.transform(X_test, Y_test, validate_for_future_features=dataset_with_future_features) @@ -720,7 +724,8 @@ def transform_data_into_time_series_sequence(self, mean = y_groups.agg("mean") std = y_groups.agg("std") std[std == 0] = 1. - Y = (Y[mean] - mean) / std + std.fillna(1.) + Y = (Y - mean) / std self.y_mean = mean self.y_std = std if Y_test is not None: @@ -797,10 +802,6 @@ def make_sequences_datasets(X: Optional[pd.DataFrame], y_test_ser = y_test_group.get_group(ser_id).transform(np.array).values if Y_test is not None else None x_test_ser = x_test_group.get_group(ser_id).transform(np.array).values if X_test is not None else None - if dataset_with_future_features: - x_ser = np.concatenate([x_ser, x_test_ser]) - x_test_ser = None - sequence = TimeSeriesSequence( X=x_ser, Y=y_ser, @@ -867,6 +868,7 @@ def transform_time_features(self): @transform_time_features.setter def transform_time_features(self, value: bool): + self._transform_time_feature = value for seq in self.datasets: seq.transform_time_features = value @@ -885,26 +887,26 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] 'val_share', None) if self.resampling_strategy_args is not None: val_share = self.resampling_strategy_args.get('val_share', val_share) - n_repeat = self.resampling_strategy_args.get("n_repeat", 1) + n_repeats = self.resampling_strategy_args.get("n_repeats", 1) else: - n_repeat = 1 + n_repeats = 1 splits.append(self.create_holdout_val_split(holdout_val_type=self.resampling_strategy, val_share=val_share, - n_repeat=n_repeat)) + n_repeats=n_repeats)) elif isinstance(self.resampling_strategy, CrossValTypes): num_splits = DEFAULT_RESAMPLING_PARAMETERS[self.resampling_strategy].get( 'num_splits', None) if self.resampling_strategy_args is not None: num_splits = self.resampling_strategy_args.get('num_splits', num_splits) - n_repeat = self.resampling_strategy_args.get("n_repeat", 1) + n_repeats = self.resampling_strategy_args.get("n_repeats", 1) else: - n_repeat = 1 + n_repeats = 1 # Create the split if it was not created before splits.extend(self.create_cross_val_splits( cross_val_type=self.resampling_strategy, num_splits=cast(int, num_splits), - n_repeat=n_repeat + n_repeats=n_repeats )) elif self.resampling_strategy is None: splits.append(self.create_refit_split()) @@ -964,9 +966,9 @@ def get_split_strategy(sequence_lengths: List[int], minimal_seq_length = np.min(sequence_lengths) - n_prediction_steps if isinstance(resampling_strategy, CrossValTypes): num_splits = DEFAULT_RESAMPLING_PARAMETERS[resampling_strategy].get( - 'num_splits', None) + 'num_splits', 5) if resampling_strategy_args is not None: - num_splits = resampling_strategy_args.get('num_split', num_splits) + num_splits = resampling_strategy_args.get('num_splits', num_splits) if resampling_strategy != CrossValTypes.time_series_ts_cross_validation: while minimal_seq_length - n_prediction_steps * num_splits <= 0: @@ -998,48 +1000,50 @@ def get_split_strategy(sequence_lengths: List[int], num_seqs = len(sequence_lengths) - if resampling_strategy_args is not None and "n_repeat" not in resampling_strategy_args: - n_repeat = 
resampling_strategy_args["n_repeat"] + if resampling_strategy_args is not None and "n_repeats" in resampling_strategy_args: + n_repeats = resampling_strategy_args["n_repeats"] else: - n_repeat = None + n_repeats = None if (num_seqs < 100 and minimal_seq_length > 10 * n_prediction_steps) or \ minimal_seq_length > 50 * n_prediction_steps: - if n_repeat is None: + if n_repeats is None: if num_seqs < 100: - n_repeat = int(np.ceil(100.0 / num_seqs)) + n_repeats = int(np.ceil(100.0 / num_seqs)) else: - n_repeat = int(np.round(minimal_seq_length / (50 * n_prediction_steps))) - + n_repeats = int(np.round(minimal_seq_length / (50 * n_prediction_steps))) + else: + if n_repeats is None: + n_repeats = 1 if resampling_strategy == CrossValTypes.time_series_cross_validation: - n_repeat = min(n_repeat, minimal_seq_length // (5 * n_prediction_steps * num_splits)) + n_repeats = min(n_repeats, minimal_seq_length // (5 * n_prediction_steps * num_splits)) elif resampling_strategy == CrossValTypes.time_series_ts_cross_validation: seasonality_h_value = int(np.round( (n_prediction_steps // int(freq_value) + 1) * freq_value) ) - while minimal_seq_length // 5 < (num_splits - 1) * n_repeat * seasonality_h_value: - n_repeat -= 1 + while minimal_seq_length // 5 < (num_splits - 1) * n_repeats * seasonality_h_value - n_prediction_steps: + n_repeats -= 1 elif resampling_strategy == HoldoutValTypes.time_series_hold_out_validation: - n_repeat = min(n_repeat, minimal_seq_length // (5 * n_prediction_steps) - 1) + n_repeats = min(n_repeats, minimal_seq_length // (5 * n_prediction_steps) - 1) else: - n_repeat = 1 + n_repeats = 1 - n_repeat = max(n_repeat, 1) - if n_repeat is None: - n_repeat = 1 + n_repeats = max(n_repeats, 1) + if n_repeats is None: + n_repeats = 1 if resampling_strategy_args is None: - resampling_strategy_args = {'n_repeat': n_repeat} + resampling_strategy_args = {'n_repeats': n_repeats} else: - resampling_strategy_args.update({'n_repeat': n_repeat}) + resampling_strategy_args.update({'n_repeats': n_repeats}) return resampling_strategy, resampling_strategy_args def create_cross_val_splits( self, cross_val_type: CrossValTypes, num_splits: int, - n_repeat=1, + n_repeats=1, ) -> List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]: """ This function creates the cross validation split for the given task. @@ -1048,7 +1052,7 @@ def create_cross_val_splits( Args: cross_val_type (CrossValTypes): num_splits (int): number of splits to be created - n_repeat (int): how many n_prediction_steps to repeat in the validation set + n_repeats (int): how many n_prediction_steps to repeat in the validation set Returns: (List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]): @@ -1069,7 +1073,7 @@ def create_cross_val_splits( seasonality_h_value = int(np.round((self.n_prediction_steps // int(self.freq_value) + 1) * self.freq_value)) kwargs.update({'seasonality_h_value': seasonality_h_value, 'freq_value': self.freq_value}) - kwargs["n_repeat"] = n_repeat + kwargs["n_repeats"] = n_repeats splits = [[() for _ in range(len(self.datasets))] for _ in range(num_splits)] @@ -1097,7 +1101,7 @@ def create_holdout_val_split( self, holdout_val_type: HoldoutValTypes, val_share: float, - n_repeat: int = 1, + n_repeats: int = 1, ) -> Tuple[np.ndarray, np.ndarray]: """ This function creates the holdout split for the given task. 
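# --- Illustrative sketch (editorial note, not part of the patch above): how the
# forecasting holdout split behaves when n_repeats > 1, using only sklearn's
# TimeSeriesSplit in the same way as holdout_split_forecasting. The concrete
# numbers below are example values, not defaults taken from autoPyTorch.
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

indices = np.arange(20)
n_prediction_steps, n_repeats = 3, 2
cv = TimeSeriesSplit(n_splits=2,
                     test_size=1 + n_prediction_steps * (n_repeats - 1),
                     gap=n_prediction_steps - 1)
train, val = list(cv.split(indices))[-1]
# keep one validation index per repeat, spaced n_prediction_steps apart
val = [val[-1 - i * n_prediction_steps] for i in reversed(range(n_repeats))]
# train covers indices 0..13 and val is [16, 19]; the gap of n_prediction_steps
# between train[-1] and val[0] leaves room for a full forecasting window.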
@@ -1106,7 +1110,7 @@ def create_holdout_val_split( Args: holdout_val_type (HoldoutValTypes): val_share (float): share of the validation data - n_repeat (int): how many n_prediction_steps to repeat in the validation set + n_repeats (int): how many n_prediction_steps to repeat in the validation set Returns: (Tuple[np.ndarray, np.ndarray]): Tuple containing (train_indices, val_indices) @@ -1121,7 +1125,7 @@ def create_holdout_val_split( if not isinstance(holdout_val_type, HoldoutValTypes): raise NotImplementedError(f'The specified `holdout_val_type` "{holdout_val_type}" is not supported.') kwargs = {"n_prediction_steps": self.n_prediction_steps, - "n_repeat": n_repeat} + "n_repeats": n_repeats} splits = [[() for _ in range(len(self.datasets))] for _ in range(2)] idx_start = 0 @@ -1181,3 +1185,4 @@ def generate_test_seqs(self) -> List[TimeSeriesSequence]: raise ValueError("If future features are required, X_test must be given!") test_seq.X = np.concatenate([test_seq.X, test_seq.X_test]) return test_sets + diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index 9b383f563..91e6d2215 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -40,7 +40,7 @@ def transform(self, X: pd.DataFrame) -> Tuple[np.ndarray, ...]: X_grouped = X.groupby(X.index) self.loc = X_grouped.agg("mean") - self.scale = X_grouped.agg("std") + self.scale = X_grouped.agg("std").fillna(0.0) # for static features, if we do normalization w.r.t. each group, then they will become the same values, # thus we treat them differently: normalize with the entire dataset diff --git a/test/conftest.py b/test/conftest.py index 232289915..b32aa6392 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -633,7 +633,7 @@ def get_forecasting_data(request): uni_variant = True targets_with_missing_value = True elif request == 'multi_variant_wo_missing': - with_missing_value = False + targets_with_missing_value = False elif request == 'multi_variant_w_missing': features_with_missing_value = True else: @@ -712,11 +712,25 @@ def generate_forecasting_features(feature_type, length): def get_forecasting_fit_dictionary(X, y, validator, backend, budget_type='epochs', forecast_horizon=5, freq='1D'): + if X is not None: + X_test = [] + for x in X: + if hasattr(x, 'iloc'): + X_test.append(x.iloc[-forecast_horizon:].copy()) + else: + X_test.append(x[-forecast_horizon:].copy()) + known_future_features = tuple(X[0].columns) if isinstance(X[0], pd.DataFrame) else \ + np.arange(X[0].shape[-1]).tolist() + else: + X_test = None + known_future_features = None datamanager = TimeSeriesForecastingDataset( X=X, Y=y, + X_test=X_test, validator=validator, freq=freq, n_prediction_steps=forecast_horizon, + known_future_features=known_future_features ) info = datamanager.get_required_dataset_info() diff --git a/test/test_datasets/test_resampling_strategies.py b/test/test_datasets/test_resampling_strategies.py index 6300b6e5c..2228db0dd 100644 --- a/test/test_datasets/test_resampling_strategies.py +++ b/test/test_datasets/test_resampling_strategies.py @@ -23,7 +23,7 @@ def test_holdoutfuncs(): n_prediction_steps = 3 n_repeats = 1 train, val = split.time_series_hold_out_validation(0, 0, X, n_prediction_steps=n_prediction_steps, - n_repeat=n_repeats) + n_repeats=n_repeats) # val must start 
n_predictions_steps after train assert val[0] - train[-1] == n_prediction_steps assert len(val) == n_repeats @@ -31,7 +31,7 @@ def test_holdoutfuncs(): n_prediction_steps = 2 n_repeats = 2 train, val = split.time_series_hold_out_validation(0, 0, X, n_prediction_steps=n_prediction_steps, - n_repeat=n_repeats) + n_repeats=n_repeats) assert val[0] - train[-1] == n_prediction_steps assert len(val) == n_repeats # No overlapping between different splits @@ -42,7 +42,7 @@ def test_holdoutfuncs(): n_prediction_steps = 10 n_repeats = 1 train, val = split.time_series_hold_out_validation(0, 0, X, n_prediction_steps=n_prediction_steps, - n_repeat=n_repeats) + n_repeats=n_repeats) # n_prediction steps is larger than the length of the sequence assert len(train) == 0 assert val == 9 @@ -51,7 +51,7 @@ def test_holdoutfuncs(): n_prediction_steps = 2 n_repeats = 3 train, val = split.time_series_hold_out_validation(0, 0, X, n_prediction_steps=n_prediction_steps, - n_repeat=n_repeats) + n_repeats=n_repeats) assert len(train) == 0 assert val == 9 @@ -78,7 +78,7 @@ def test_crossvalfuncs(): def eval_ts_cv(num_splits, n_prediction_steps, n_repeats): splits = split.time_series_cross_validation(0, num_splits, X, - n_prediction_steps=n_prediction_steps, n_repeat=n_repeats) + n_prediction_steps=n_prediction_steps, n_repeats=n_repeats) assert len(splits) == num_splits for i, sp in enumerate(splits): assert len(sp[1]) == n_repeats @@ -94,13 +94,13 @@ def eval_ts_sea_cv(num_splits, n_prediction_steps, n_repeats, freq_value): splits = split.time_series_ts_cross_validation(0, num_splits=num_splits, indices=X, n_prediction_steps=n_prediction_steps, - n_repeat=n_repeats, + n_repeats=n_repeats, seasonality_h_value=seasonality_h_value) assert len(splits) == num_splits assert splits[0][1][-1] == len(X) - 1 if num_splits > 1: for i in range(1, num_splits): - dis_val_start_to_test = len(X) - 1 - (splits[i][1]) + dis_val_start_to_test = len(X) - 1 - (splits[i][1] - n_prediction_steps) assert np.all(dis_val_start_to_test % freq_value == 0) eval_ts_sea_cv(2, 10, 2, 6) @@ -114,10 +114,11 @@ def eval_ts_sea_cv(num_splits, n_prediction_steps, n_repeats, freq_value): seasonality_h_value = int(np.round((n_prediction_steps // int(freq_value) + 1) * freq_value)) sp2 = split.time_series_ts_cross_validation(0, num_splits=num_splits, - indices=X[:20], + indices=X[:10], n_prediction_steps=n_prediction_steps, - n_repeat=n_repeats, + n_repeats=n_repeats, seasonality_h_value=seasonality_h_value) # We cannot do a split, thus the two splits are the same assert np.all(sp2[1][1] == sp2[0][1]) + diff --git a/test/test_datasets/test_time_series_datasets.py b/test/test_datasets/test_time_series_datasets.py index 6711cfef5..f92f09bf5 100644 --- a/test/test_datasets/test_time_series_datasets.py +++ b/test/test_datasets/test_time_series_datasets.py @@ -5,8 +5,12 @@ import pandas as pd import pytest import unittest -from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence from gluonts.time_feature import Constant as ConstantTransform, DayOfMonth +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence +from autoPyTorch.datasets.resampling_strategy import ( + CrossValTypes, + HoldoutValTypes +) from autoPyTorch.utils.pipeline import get_dataset_requirements @@ -76,7 +80,7 @@ def test_get_val_seq_and_test_targets(self): self.seq_uni.cache_time_features() val_seq = self.seq_uni.get_val_seq_set(5) self.assertEqual(len(val_seq), 5 + 1) - 
self.assertEqual(len(val_seq._cached_time_features), 5+1 + self.n_prediction_steps) + self.assertEqual(len(val_seq._cached_time_features), 5 + 1 + self.n_prediction_steps) test_targets = self.seq_uni.get_test_target(-1) self.assertTrue(np.all(self.y[-self.n_prediction_steps:] == test_targets)) @@ -204,9 +208,9 @@ def test_exception(self): @pytest.mark.parametrize("get_fit_dictionary_forecasting", ['uni_variant_wo_missing', - 'uni_variant_w_missing', - 'multi_variant_wo_missing', - 'uni_variant_w_missing'], indirect=True) + 'uni_variant_w_missing', + 'multi_variant_wo_missing', + 'uni_variant_w_missing'], indirect=True) def test_dataset_properties(backend, get_fit_dictionary_forecasting): # The fixture creates a datamanager by itself datamanager: TimeSeriesForecastingDataset = backend.load_datamanager() @@ -232,4 +236,176 @@ def test_dataset_properties(backend, get_fit_dictionary_forecasting): get_fit_dictionary_forecasting['y_train'].isnull().values.any() if get_fit_dictionary_forecasting['X_train'] is not None: assert dataset_properties['features_have_missing_values'] == \ - get_fit_dictionary_forecasting['X_train'].isnull().values.any() \ No newline at end of file + get_fit_dictionary_forecasting['X_train'].isnull().values.any() + + +def test_freq_valeus(): + freq = '1H' + n_prediction_steps = 12 + + seasonality, freq, freq_value = TimeSeriesForecastingDataset.compute_freq_values(freq, n_prediction_steps) + assert seasonality == 24 + assert freq == '1H' + assert freq_value == 24 + + n_prediction_steps = 36 + seasonality, freq, freq_value = TimeSeriesForecastingDataset.compute_freq_values(freq, n_prediction_steps) + assert seasonality == 24 + assert freq_value == 168 + + freq = [2, 3, 4] + n_prediction_steps = 10 + seasonality, freq, freq_value = TimeSeriesForecastingDataset.compute_freq_values(freq, n_prediction_steps) + assert seasonality == 2 + assert freq == '1Y' + assert freq_value == 4 + + +def test_target_normalization(): + Y = [[1, 2], [3, 4, 5]] + dataset = TimeSeriesForecastingDataset(None, Y, normalize_y=True) + + assert np.allclose(dataset.y_mean.values, np.vstack([np.mean(y) for y in Y])) + assert np.allclose(dataset.y_std.values, np.vstack([np.std(y, ddof=1) for y in Y])) + assert np.allclose(dataset.train_tensors[1].values.flatten(), + np.hstack([(y - np.mean(y))/np.std(y, ddof=1) for y in Y])) + + +@pytest.mark.parametrize("get_fit_dictionary_forecasting", ['uni_variant_wo_missing'], indirect=True) +def test_dataset_index(backend, get_fit_dictionary_forecasting): + datamanager: TimeSeriesForecastingDataset = backend.load_datamanager() + assert np.allclose(datamanager[5][0]['past_targets'][-1].numpy(), 5.0) + assert np.allclose(datamanager[50][0]['past_targets'][-1].numpy(), 1005.0) + assert np.allclose(datamanager[150][0]['past_targets'][-1].numpy(), 2050.0) + assert np.allclose(datamanager[-1][0]['past_targets'][-1].numpy(), 9134.0) + + assert datamanager.get_time_series_seq(50) == datamanager.datasets[1] + + # test for validation indices + val_indices = datamanager.splits[0][1] + val_set = [datamanager.get_validation_set(val_idx) for val_idx in val_indices] + val_targets = np.concatenate([val_seq[-1][1]['future_targets'].numpy() for val_seq in val_set]) + assert np.allclose(val_targets, datamanager.get_test_target(val_indices)) + + +@pytest.mark.parametrize("get_fit_dictionary_forecasting", ['multi_variant_wo_missing'], indirect=True) +def test_update_dataset(backend, get_fit_dictionary_forecasting): + datamanager: TimeSeriesForecastingDataset = 
backend.load_datamanager() + X = datamanager.train_tensors[0] + for col in X.columns: + X[col] = X.index + datamanager.replace_data(X, None) + for i, data in enumerate(datamanager.datasets): + assert np.allclose(data.X, np.ones_like(data.X) * i) + + datamanager.update_transform(ZeroTransformer(), train=True) + assert np.allclose(datamanager[0][0]['past_features'].numpy(), np.zeros(len(X.columns))) + assert datamanager.transform_time_features is False + + datamanager.transform_time_features = True + for dataset in datamanager.datasets: + assert dataset.transform_time_features is True + seq_lengths = datamanager.sequence_lengths_train + new_test_seq = datamanager.generate_test_seqs() + for seq_len, test_seq in zip(seq_lengths, new_test_seq): + # seq_len is len(y) - n_prediction_steps, here we expand X_test with another n_prediction_steps + assert test_seq.X.shape[0] - seq_len == 2 * datamanager.n_prediction_steps + + +def test_splits(): + + y = [np.arange(100 + i * 10) for i in range(10)] + resampling_strategy_args = {'num_splits': 5} + dataset = TimeSeriesForecastingDataset(None, y, + resampling_strategy=CrossValTypes.time_series_ts_cross_validation, + resampling_strategy_args=resampling_strategy_args, + n_prediction_steps=10, + freq='1M') + assert len(dataset.splits) == 5 + assert dataset.splits[0][1][0] == (100 - 10 - 1) + for split in dataset.splits: + # We need to ensure that the training indices only interrupt at where the validation sets start, e.g., + # the tail of each sequence + assert len(np.unique(split[0] - np.arange(len(split[0])))) == len(y) + assert np.all(split[1][1:] - split[1][:-1] == [100 + i * 10 for i in range(9)]) + assert len(split[1]) == len(y) + + y = [np.arange(100) for _ in range(10)] + resampling_strategy_args = {'num_splits': 5, + 'n_repeats': 2} + dataset = TimeSeriesForecastingDataset(None, y, + resampling_strategy=CrossValTypes.time_series_ts_cross_validation, + resampling_strategy_args=resampling_strategy_args, + n_prediction_steps=10, + freq='1M') + assert len(dataset.splits) == 5 + for split in dataset.splits: + assert len(split[1]) == len(y) * 1 + + y = [np.arange(40) for _ in range(10)] + resampling_strategy_args = {'num_splits': 5} + dataset = TimeSeriesForecastingDataset(None, y, + resampling_strategy=CrossValTypes.time_series_ts_cross_validation, + resampling_strategy_args=resampling_strategy_args, + n_prediction_steps=10, + freq='1M') + # the length of each sequence does not support 5 splitions + assert len(dataset.splits) == 3 + + # datasets with long but little sequence + y = [np.arange(4000) for _ in range(2)] + dataset = TimeSeriesForecastingDataset(None, y, + resampling_strategy=CrossValTypes.time_series_ts_cross_validation, + n_prediction_steps=10, + freq='1M') + # the length of each sequence does not support 5 splits + assert len(dataset.splits) == 2 + for split in dataset.splits: + assert len(split[1]) == len(y) * 50 + + resampling_strategy = CrossValTypes.time_series_cross_validation + + y = [np.arange(40) for _ in range(10)] + resampling_strategy_args = {'num_splits': 5, + 'n_repeats': 5} + + resampling_strategy, resampling_strategy_args = TimeSeriesForecastingDataset.get_split_strategy( + [60] * 10, + 10, + 25, + CrossValTypes.time_series_ts_cross_validation, + resampling_strategy_args=resampling_strategy_args, + ) + assert resampling_strategy_args['num_splits'] == 3 + assert resampling_strategy_args['n_repeats'] == 1 + + resampling_strategy, resampling_strategy_args = TimeSeriesForecastingDataset.get_split_strategy( + [15] * 10, + 10, + 
25, + CrossValTypes.time_series_cross_validation, + ) + assert resampling_strategy == HoldoutValTypes.time_series_hold_out_validation + + resampling_strategy_args = {'num_splits': 5, + 'n_repeats': 5} + resampling_strategy, resampling_strategy_args = TimeSeriesForecastingDataset.get_split_strategy( + [60] * 10, + 10, + 25, + CrossValTypes.time_series_cross_validation, + resampling_strategy_args=resampling_strategy_args, + ) + assert resampling_strategy_args['num_splits'] == 4 + assert resampling_strategy_args['n_repeats'] == 1 + + y = [np.arange(60) for _ in range(10)] + dataset = TimeSeriesForecastingDataset(None, y, + resampling_strategy=CrossValTypes.time_series_cross_validation, + resampling_strategy_args=resampling_strategy_args, + n_prediction_steps=10, + freq='1M') + assert len(dataset.splits) == 4 + + refit_set = dataset.create_refit_set() + assert len(refit_set.splits[0][0]) == len(refit_set) From 235e310670c6d8b90ee2e550bbe30927e62d8484 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 9 May 2022 20:47:26 +0200 Subject: [PATCH 249/347] test on tae --- .../data/time_series_forecasting_validator.py | 4 - autoPyTorch/datasets/time_series_dataset.py | 2 +- autoPyTorch/evaluation/tae.py | 4 +- ...time_series_forecasting_train_evaluator.py | 4 +- test/conftest.py | 26 ++- .../test_time_series_datasets.py | 24 ++- test/test_evaluation/evaluation_util.py | 31 ++- test/test_evaluation/test_evaluation.py | 30 +++ .../test_forecasting_evaluators.py | 192 ++++++++++++++++++ 9 files changed, 297 insertions(+), 20 deletions(-) create mode 100644 test/test_evaluation/test_forecasting_evaluators.py diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index ee39c11db..7be693e8a 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -32,7 +32,6 @@ def __init__(self, self.target_validator = TimeSeriesTargetValidator(is_classification=self.is_classification, logger=self.logger) self._is_uni_variant = False - self.known_future_features = None self.start_times = None self.feature_shapes: Dict[str, int] = {} self.feature_names: List[str] = [] @@ -46,7 +45,6 @@ def fit( X_test: Optional[Union[List, pd.DataFrame]] = None, y_test: Optional[Union[List, pd.DataFrame]] = None, start_times: Optional[List[pd.DatetimeIndex]] = None, - known_future_features: Optional[List[Union[int, str]]] = None, ) -> BaseEstimator: """ fit the validator with the training data, (optionally) start times and other information @@ -60,7 +58,6 @@ def fit( y_test (Optional[Union[List, pd.DataFrame]]): target in the future start_times (Optional[List[pd.DatetimeIndex]]): start times on which the first element of each series is sampled - known_future_features (Optional[List[Union[int, str]]]): which features are known even in the future """ self.series_idx = series_idx @@ -91,7 +88,6 @@ def fit( self._is_fitted = True else: - self.known_future_features = known_future_features # Check that the data is valid if len(X_train) != len(y_train): raise ValueError("Inconsistent number of sequences for features and targets," diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index bbef75c13..70100b169 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -364,7 +364,7 @@ def __init__(self, X_test: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, Y_test: 
Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, start_times: Optional[List[pd.DatetimeIndex]] = None, - known_future_features: Optional[Tuple[str]] = None, + known_future_features: Optional[Tuple[Union[str, int]]] = None, time_feature_transform: Optional[List[TimeFeature]] = None, freq: Optional[Union[str, int, List[int]]] = None, resampling_strategy: Optional[Union[ diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index d7366cac1..c58383b0d 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -28,7 +28,6 @@ HoldoutValTypes, NoResamplingStrategyTypes ) -from autoPyTorch.evaluation.test_evaluator import eval_test_function from autoPyTorch.evaluation.train_evaluator import eval_train_function from autoPyTorch.evaluation.utils import ( DisableFileOutputParameters, @@ -150,8 +149,7 @@ def __init__( eval_function = functools.partial(eval_train_function, **eval_func_kwargs) self.output_y_hat_optimization = output_y_hat_optimization elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): - # TODO check eval_test for forecasting tasks - eval_function = eval_test_function + eval_function = functools.partial(eval_train_function, **eval_func_kwargs) self.output_y_hat_optimization = False self.worst_possible_result = cost_for_crash diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 089e036b3..47d7274af 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -10,6 +10,7 @@ from smac.tae import StatusType from autoPyTorch.evaluation.train_evaluator import TrainEvaluator +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.metrics import MASE_LOSSES from autoPyTorch.automl_common.common.utils.backend import Backend @@ -30,7 +31,7 @@ def __init__(self, backend: Backend, queue: Queue, num_run: Optional[int] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, - disable_file_output: Union[bool, List] = False, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, keep_models: Optional[bool] = None, @@ -322,6 +323,7 @@ def _predict(self, pipeline: BaseEstimator, self.n_prediction_steps, self.num_targets]) y_opt_full[test_split_subset_idx] = opt_pred.reshape([-1, self.n_prediction_steps, self.num_targets]) + opt_pred = y_opt_full opt_pred = opt_pred.reshape(-1, self.num_targets) diff --git a/test/conftest.py b/test/conftest.py index b32aa6392..73daa9c65 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -623,8 +623,7 @@ def input_data_featuretest(request): # Forecasting tasks def get_forecasting_data(request): uni_variant = False - targets_with_missing_value = False - features_with_missing_value = False + with_missing_values = False type_X = 'pd' with_series_id = False if request == 'uni_variant_wo_missing': @@ -633,9 +632,9 @@ def get_forecasting_data(request): uni_variant = True targets_with_missing_value = True elif request == 'multi_variant_wo_missing': - targets_with_missing_value = False + with_missing_values = False elif request == 'multi_variant_w_missing': - features_with_missing_value = True + 
with_missing_values = True else: raise NotImplementedError generator = check_random_state(0) @@ -643,8 +642,6 @@ def get_forecasting_data(request): base_length = 50 targets = [] - features = [] - start_times = [] # the first character indicates the type of the feature: # n: numerical, c: categorical, s: static @@ -700,7 +697,7 @@ def generate_forecasting_features(feature_type, length): raise NotImplementedError features.append(feature) - if targets_with_missing_value: + if with_missing_values: new_seq[5] = np.NAN new_seq[-5] = np.NAN @@ -711,7 +708,7 @@ def generate_forecasting_features(feature_type, length): return features, targets, input_validator.fit(features, targets, start_times=start_times) -def get_forecasting_fit_dictionary(X, y, validator, backend, budget_type='epochs', forecast_horizon=5, freq='1D'): +def get_forecasting_datamangaer(X, y, validator, forecast_horizon=5, freq='1D'): if X is not None: X_test = [] for x in X: @@ -732,7 +729,10 @@ def get_forecasting_fit_dictionary(X, y, validator, backend, budget_type='epochs n_prediction_steps=forecast_horizon, known_future_features=known_future_features ) + return datamanager + +def get_forecasting_fit_dictionary(datamanager, backend, budget_type='epochs'): info = datamanager.get_required_dataset_info() dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info)) @@ -827,10 +827,18 @@ def input_data_forecastingfeaturetest(request): ValueError("Unsupported indirect fixture {}".format(request.param)) +@pytest.fixture(scope="class") +def get_forecasting_datamanager(request): + X, y, validator = get_forecasting_data(request.param) + datamanager = get_forecasting_datamangaer(X, y, validator) + return datamanager + + @pytest.fixture def get_fit_dictionary_forecasting(request, backend): X, y, validator = get_forecasting_data(request.param) - return get_forecasting_fit_dictionary(X, y, validator, backend) + datamanager = get_forecasting_datamangaer(X, y, validator) + return get_forecasting_fit_dictionary(datamanager, backend) # Fixtures for forecasting validators. 
diff --git a/test/test_datasets/test_time_series_datasets.py b/test/test_datasets/test_time_series_datasets.py index f92f09bf5..f5300cda2 100644 --- a/test/test_datasets/test_time_series_datasets.py +++ b/test/test_datasets/test_time_series_datasets.py @@ -6,7 +6,11 @@ import pytest import unittest from gluonts.time_feature import Constant as ConstantTransform, DayOfMonth -from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence +from autoPyTorch.datasets.time_series_dataset import ( + TimeSeriesForecastingDataset, + TimeSeriesSequence, + extract_feature_index +) from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes @@ -409,3 +413,21 @@ def test_splits(): refit_set = dataset.create_refit_set() assert len(refit_set.splits[0][0]) == len(refit_set) + + +def test_extract_time_features(): + feature_shapes = {'b': 5, 'a': 3, 'c': 7, 'd': 12} + feature_names = ['a', 'b', 'c', 'd'] + queried_features = ('b', 'd') + feature_index = extract_feature_index(feature_shapes, feature_names, queried_features) + feature_index2 = [] + idx_tracker = 0 + for fea_name in feature_names: + feature_s = feature_shapes[fea_name] + if fea_name in queried_features: + feature_index2.append(list(range(idx_tracker, idx_tracker + feature_s))) + idx_tracker += feature_s + + assert feature_index == tuple(sum(feature_index2, [])) + # the value should not be relevant with the order of queried_features + assert feature_index == extract_feature_index(feature_shapes, feature_names, ('d', 'b')) diff --git a/test/test_evaluation/evaluation_util.py b/test/test_evaluation/evaluation_util.py index 088726963..02dab602d 100644 --- a/test/test_evaluation/evaluation_util.py +++ b/test/test_evaluation/evaluation_util.py @@ -1,6 +1,7 @@ import functools import traceback import unittest +import pytest import numpy as np from numpy.linalg import LinAlgError @@ -12,8 +13,11 @@ from sklearn import preprocessing from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes from autoPyTorch.datasets.tabular_dataset import TabularDataset +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset + from autoPyTorch.pipeline.components.training.metrics.metrics import ( accuracy, balanced_accuracy, @@ -241,9 +245,34 @@ def get_500_classes_datamanager(resampling_strategy=HoldoutValTypes.holdout_vali return dataset +def get_forecasting_dataset(n_seq=10, + n_prediction_steps=5, + resampling_strategy=HoldoutValTypes.time_series_hold_out_validation): + base_length = 50 + X = [] + targets = [] + X_test = [] + + for i in range(n_seq): + series_length = base_length + i * 10 + + targets.append(np.arange(i * 1000, series_length + i * 1000)) + X.append(targets[-1] - 1) + X_test.append(np.arange(X[-1][-1] + 1, X[-1][-1] + 1 + n_prediction_steps)) + input_validator = TimeSeriesForecastingInputValidator(is_classification=False).fit(X, targets) + return TimeSeriesForecastingDataset(X=X, Y=targets, X_test=X_test, + known_future_features=(0,), + validator=input_validator, + resampling_strategy=resampling_strategy, + n_prediction_steps=n_prediction_steps + ) + + def get_dataset_getters(): return [get_binary_classification_datamanager, get_multiclass_classification_datamanager, get_500_classes_datamanager, get_abalone_datamanager, - get_regression_datamanager] + get_regression_datamanager, + 
get_forecasting_dataset] + diff --git a/test/test_evaluation/test_evaluation.py b/test/test_evaluation/test_evaluation.py index 2cabb6a73..8222efe70 100644 --- a/test/test_evaluation/test_evaluation.py +++ b/test/test_evaluation/test_evaluation.py @@ -18,6 +18,7 @@ from smac.utils.constants import MAXINT from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash +from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy, log_loss this_directory = os.path.dirname(__file__) @@ -432,6 +433,35 @@ def test_eval_with_simple_intensification(self): run_info_out, _ = ta.run_wrapper(run_info) self.assertEqual(run_info_out.budget, budget) + def test_eval_with_addition_eval_func_kwargs(self): + config = unittest.mock.Mock(spec=int) + config.config_id = 198 + + ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, + stats=self.stats, + memory_limit=3072, + multi_objectives=["cost"], + metric=accuracy, + cost_for_crash=get_cost_of_crash(accuracy), + abort_on_first_run_crash=False, + logger_port=self.logger_port, + pynisher_context='fork', + budget_type='runtime', + evaluator_class=TimeSeriesForecastingTrainEvaluator + ) + + ta.pynisher_logger = unittest.mock.Mock() + run_info = RunInfo(config=config, cutoff=3000, instance=None, + instance_specific=None, seed=1, capped=False) + + for budget in [0.0, 50.0]: + # Simple intensification always returns budget = 0 + # Other intensifications return a non-zero value + self.stats.submitted_ta_runs += 1 + run_info = run_info._replace(budget=budget) + run_info_out, _ = ta.run_wrapper(run_info) + self.assertEqual(run_info_out.budget, budget) + @pytest.mark.parametrize("metric,expected", [(accuracy, 1.0), (log_loss, MAXINT)]) def test_get_cost_of_crash(metric, expected): diff --git a/test/test_evaluation/test_forecasting_evaluators.py b/test/test_evaluation/test_forecasting_evaluators.py new file mode 100644 index 000000000..2afa56395 --- /dev/null +++ b/test/test_evaluation/test_forecasting_evaluators.py @@ -0,0 +1,192 @@ +import multiprocessing +import os +import queue +import shutil +import sys +import unittest +import unittest.mock +import pytest + +from ConfigSpace import Configuration + +import numpy as np + +from sklearn.base import BaseEstimator + +from smac.tae import StatusType + +from autoPyTorch.automl_common.common.utils.backend import create +from autoPyTorch.datasets.resampling_strategy import CrossValTypes, NoResamplingStrategyTypes +from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator +from autoPyTorch.evaluation.utils import read_queue +from autoPyTorch.pipeline.base_pipeline import BasePipeline +from autoPyTorch.pipeline.components.training.metrics.metrics import mean_MASE_forecasting + +this_directory = os.path.dirname(__file__) +sys.path.append(this_directory) +from evaluation_util import ( # noqa (E402: module level import not at top of file) + BaseEvaluatorTest, + get_binary_classification_datamanager, + get_multiclass_classification_datamanager, + get_regression_datamanager, + get_forecasting_dataset +) # noqa (E402: module level import not at top of file) + +from test_evaluators import TestTrainEvaluator + +class BackendMock(object): + def load_datamanager(self): + return get_multiclass_classification_datamanager() + + +class TestTimeSeriesForecastingTrainEvaluator(unittest.TestCase): + def setUp(self): + 
TestTrainEvaluator.setUp(self) + + def tearDown(self): + TestTrainEvaluator.tearDown(self) + + @unittest.mock.patch('autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline') + def test_holdout(self, pipeline_mock): + pipeline_mock.fit_dictionary = {'budget_type': 'epochs', 'epochs': 50} + D = get_forecasting_dataset() + n_prediction_steps = D.n_prediction_steps + pipeline_mock.predict.side_effect = \ + lambda X, batch_size=None: np.tile([0.], (len(X), n_prediction_steps)) + pipeline_mock.side_effect = lambda **kwargs: pipeline_mock + pipeline_mock.get_additional_run_info.return_value = None + + configuration = unittest.mock.Mock(spec=Configuration) + backend_api = create(self.tmp_dir, self.output_dir, prefix='autoPyTorch') + backend_api.load_datamanager = lambda: D + queue_ = multiprocessing.Queue() + + evaluator = TimeSeriesForecastingTrainEvaluator(backend_api, + queue_, + configuration=configuration, + metric=mean_MASE_forecasting, budget=0, + pipeline_config={'budget_type': 'epochs', 'epochs': 50}, + min_num_test_instances=100) + evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) + evaluator.file_output.return_value = (None, {}) + + evaluator.fit_predict_and_loss() + + rval = read_queue(evaluator.queue) + + self.assertEqual(len(rval), 1) + result = rval[0]['loss'] + self.assertEqual(len(rval[0]), 3) + self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) + + self.assertEqual(evaluator.file_output.call_count, 1) + self.assertEqual(result, 4592.0) + self.assertEqual(pipeline_mock.fit.call_count, 1) + # As forecasting inference could be quite expensive, we only allow one validation prediction + self.assertEqual(pipeline_mock.predict.call_count, 1) + + self.assertEqual(evaluator.file_output.call_count, 1) + self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], len(D.splits[0][1]) * n_prediction_steps) + self.assertIsNone(evaluator.file_output.call_args[0][1]) + self.assertIsNone(evaluator.file_output.call_args[0][2]) + self.assertEqual(evaluator.pipeline.fit.call_count, 1) + + res = evaluator.file_output.call_args[0][0].reshape(-1, n_prediction_steps, evaluator.num_targets) + assert np.all(res == 0.) 
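# --- Illustrative sketch (editorial note, not part of the patch): why
# test_holdout above expects file_output to receive
# len(D.splits[0][1]) * n_prediction_steps rows. The mocked predict returns one
# all-zero forecasting window per validation sequence, which is then flattened
# to one row per target before being written out. Numbers are example values.
import numpy as np

n_prediction_steps, num_targets = 5, 1
n_val_sequences = 10                                  # one validation index per sequence
opt_pred = np.tile([0.], (n_val_sequences, n_prediction_steps))
opt_pred = opt_pred.reshape(-1, num_targets)          # shape (50, 1)
assert opt_pred.shape[0] == n_val_sequences * n_prediction_steps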
+ + + + @unittest.mock.patch('autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline') + def test_cv(self, pipeline_mock): + D = get_forecasting_dataset(resampling_strategy=CrossValTypes.time_series_cross_validation) + assert D.resampling_strategy_args['num_splits'] == 3 + + n_prediction_steps = D.n_prediction_steps + + pipeline_mock.predict.side_effect = \ + lambda X, batch_size=None: np.tile([0.], (len(X), n_prediction_steps)) + + pipeline_mock.side_effect = lambda **kwargs: pipeline_mock + pipeline_mock.get_additional_run_info.return_value = None + + configuration = unittest.mock.Mock(spec=Configuration) + backend_api = create(self.tmp_dir, self.output_dir, prefix='autoPyTorch') + backend_api.load_datamanager = lambda: D + queue_ = multiprocessing.Queue() + + evaluator = TimeSeriesForecastingTrainEvaluator(backend_api, + queue_, + configuration=configuration, + metric=mean_MASE_forecasting, budget=0, + pipeline_config={'budget_type': 'epochs', 'epochs': 50}) + + evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) + evaluator.file_output.return_value = (None, {}) + + evaluator.fit_predict_and_loss() + + rval = read_queue(evaluator.queue) + self.assertEqual(len(rval), 1) + result = rval[0]['loss'] + self.assertEqual(len(rval[0]), 3) + self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) + + self.assertEqual(evaluator.file_output.call_count, 1) + self.assertAlmostEqual(result, 4587.208333333334) + self.assertEqual(pipeline_mock.fit.call_count, 3) + # 3 calls because of the 3 times validation evaluations + self.assertEqual(pipeline_mock.predict.call_count, 3) + # as the optimisation preds in cv is concatenation of the 5 folds, + # so it is 5*splits + self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], + 3 * len(D.splits[0][1]) * n_prediction_steps, evaluator.file_output.call_args) + self.assertIsNone(evaluator.file_output.call_args[0][1]) + # we do not have test sets + self.assertIsNone(evaluator.file_output.call_args[0][2]) + + res = evaluator.file_output.call_args[0][0].reshape(-1, n_prediction_steps, evaluator.num_targets) + assert np.all(res == 0.) 
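# --- Illustrative sketch (editorial note, not part of the patch): in the
# cross-validation case above, each of the num_splits folds contributes one
# forecasting window per sequence, and the optimisation predictions handed to
# file_output are the concatenation over folds, hence the factor of 3 in the
# shape assertion of test_cv. Example values only.
import numpy as np

num_splits, n_seq, n_prediction_steps, num_targets = 3, 10, 5, 1
fold_preds = [np.zeros((n_seq * n_prediction_steps, num_targets))
              for _ in range(num_splits)]
Y_opt_pred = np.concatenate(fold_preds)
assert Y_opt_pred.shape[0] == num_splits * n_seq * n_prediction_steps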
+ + @unittest.mock.patch('autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline') + def test_proxy_val_set(self, pipeline_mock): + pipeline_mock.fit_dictionary = {'budget_type': 'epochs', 'epochs': 0.1} + D = get_forecasting_dataset() + n_prediction_steps = D.n_prediction_steps + pipeline_mock.predict.side_effect = \ + lambda X, batch_size=None: np.tile([0.], (len(X), n_prediction_steps)) + pipeline_mock.side_effect = lambda **kwargs: pipeline_mock + pipeline_mock.get_additional_run_info.return_value = None + + configuration = unittest.mock.Mock(spec=Configuration) + backend_api = create(self.tmp_dir, self.output_dir, prefix='autoPyTorch') + backend_api.load_datamanager = lambda: D + queue_ = multiprocessing.Queue() + + evaluator = TimeSeriesForecastingTrainEvaluator(backend_api, + queue_, + configuration=configuration, + metric=mean_MASE_forecasting, budget=0.3, + pipeline_config={'budget_type': 'epochs', 'epochs': 50}, + min_num_test_instances=1) + evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) + evaluator.file_output.return_value = (None, {}) + + evaluator.fit_predict_and_loss() + + rval = read_queue(evaluator.queue) + + self.assertEqual(len(rval), 1) + result = rval[0]['loss'] + + self.assertEqual(result, 925.2) + res = evaluator.file_output.call_args[0][0].reshape(-1, n_prediction_steps, evaluator.num_targets) + + n_evaluated_pip_mock = 0 + + for i_seq, seq_output in enumerate(res): + if i_seq % 3 == 0 and n_evaluated_pip_mock < 3: + n_evaluated_pip_mock += 1 + assert np.all(seq_output == 0.) + else: + # predict with dummy predictor + assert np.all(seq_output == D.datasets[i_seq][-1][0]['past_targets'][-1].numpy()) From 9d8dd0b9c280a21ed92a0760c161557a8a85b08a Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 9 May 2022 20:58:30 +0200 Subject: [PATCH 250/347] maint --- ...time_series_forecasting_train_evaluator.py | 41 ++++--------------- 1 file changed, 9 insertions(+), 32 deletions(-) diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 47d7274af..61b2a8869 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -1,3 +1,4 @@ +import warnings from multiprocessing.queues import Queue from typing import Any, Dict, List, Optional, Tuple, Union, Sequence @@ -222,24 +223,12 @@ def fit_predict_and_loss(self) -> None: ]) if self.X_valid is not None: - Y_valid_preds = np.array([Y_valid_pred[i] - for i in range(self.num_folds) - if Y_valid_pred[i] is not None]) - # Average the predictions of several pipelines - if len(Y_valid_preds.shape) == 3: - Y_valid_preds = np.nanmean(Y_valid_preds, axis=0) - else: - Y_valid_preds = None + warnings.warn('valid_pred is current unsuported for fore casting tasks!') + Y_valid_preds = None if self.X_test is not None: - Y_test_preds = np.array([Y_test_pred[i] - for i in range(self.num_folds) - if Y_test_pred[i] is not None]) - # Average the predictions of several pipelines - if len(Y_test_preds.shape) == 3: - Y_test_preds = np.nanmean(Y_test_preds, axis=0) - else: - Y_test_preds = None + warnings.warn('test_pred is current unsuported for fore casting tasks!') + Y_test_preds = None self.Y_optimization = Y_targets self.Y_actual_train = Y_train_targets @@ -329,23 +318,11 @@ def _predict(self, pipeline: BaseEstimator, opt_pred = opt_pred.reshape(-1, self.num_targets) if self.X_valid is not None: - valid_sets = [] - for 
val_seq in enumerate(self.datamanager.datasets): - valid_sets.append(val_seq.X_val) - valid_pred = self.predict_function(valid_sets, pipeline).flatten() - - valid_pred = valid_pred.reshape(-1, self.num_targets) - - else: - valid_pred = None + warnings.warn('valid_pred is current unsuported for fore casting tasks!') + valid_pred = None if self.X_test is not None: - test_sets = [] - for test_seq in enumerate(self.datamanager.datasets): - test_sets.append(test_seq.X_test) - test_pred = self.predict_function(valid_sets, pipeline).flatten() - test_pred = test_pred.reshape(-1, self.num_targets) - else: - test_pred = None + warnings.warn('test_pred is current unsuported for fore casting tasks!') + test_pred = None return np.empty(1), opt_pred, valid_pred, test_pred From dc4b602c21840de4e08c6f91adab1ab49365b45a Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 10 May 2022 10:34:12 +0200 Subject: [PATCH 251/347] all evaluator to evalaute test sets --- autoPyTorch/datasets/time_series_dataset.py | 38 +++++++++++-------- autoPyTorch/evaluation/abstract_evaluator.py | 17 ++++++--- ...time_series_forecasting_train_evaluator.py | 31 ++++++++++----- test/conftest.py | 12 +++++- .../test_time_series_datasets.py | 14 ++++++- test/test_evaluation/evaluation_util.py | 4 ++ .../test_forecasting_evaluators.py | 31 +++++++-------- 7 files changed, 97 insertions(+), 50 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 70100b169..f010412ac 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -427,7 +427,7 @@ def __init__(self, self.categorical_columns = self.validator.feature_validator.categorical_columns self.num_features = self.validator.feature_validator.num_features # type: int - self.num_target = self.validator.target_validator.out_dimensionality # type: int + self.num_targets = self.validator.target_validator.out_dimensionality # type: int self.categories = self.validator.feature_validator.categories @@ -474,7 +474,7 @@ def __init__(self, self.normalize_y = normalize_y - sequence_datasets, train_tensors, sequence_lengths = self.transform_data_into_time_series_sequence( + sequence_datasets, train_tensors, test_tensors, sequence_lengths = self.transform_data_into_time_series_sequence( X, Y, start_times=self.start_times, X_test=X_test, @@ -498,7 +498,7 @@ def __init__(self, self.train_tensors = train_tensors - self.test_tensors = None # Test tensor is not applied to forecasting tasks + self.test_tensors = test_tensors self.val_tensors = None self.task_type: Optional[str] = None @@ -519,12 +519,12 @@ def __init__(self, self.output_type = "continuous" if STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS: - num_target = len(np.unique(Y)) + num_targets = len(np.unique(Y)) # self.output_shape = len(np.unique(Y)) else: # self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1 - num_target = Y.shape[-1] if Y.ndim > 1 else 1 - self.output_shape = [self.n_prediction_steps, num_target] + num_targets = Y.shape[-1] if Y.ndim > 1 else 1 + self.output_shape = [self.n_prediction_steps, num_targets] else: raise ValueError('Forecasting dataset must contain target values!') @@ -656,7 +656,7 @@ def get_time_series_seq(self, idx) -> TimeSeriesSequence: def get_test_target(self, test_indices: np.ndarray) -> np.ndarray: test_indices = np.where(test_indices < 0, test_indices + len(self), test_indices) - y_test = np.ones([len(test_indices), 
self.n_prediction_steps, self.num_target]) + y_test = np.ones([len(test_indices), self.n_prediction_steps, self.num_targets]) y_test_argsort = np.argsort(test_indices) dataset_idx = self._get_dataset_indices(test_indices[y_test_argsort[0]], only_dataset_idx=True) @@ -668,7 +668,7 @@ def get_test_target(self, test_indices: np.ndarray) -> np.ndarray: test_idx = test_idx - self.cumulative_sizes[dataset_idx - 1] y_test[y_i] = self.datasets[dataset_idx].get_test_target(test_idx) - return y_test.reshape([-1, self.num_target]) + return y_test.reshape([-1, self.num_targets]) def transform_data_into_time_series_sequence(self, X: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]], @@ -678,7 +678,9 @@ def transform_data_into_time_series_sequence(self, Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, Y_test: Optional[ Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, - is_test_set: bool = False): + is_test_set: bool = False) ->[ + + ]: """ Transform the raw data into a list of TimeSeriesSequence that can be processed by AutoPyTorch Time Series build a series time sequence datasets @@ -689,8 +691,6 @@ def transform_data_into_time_series_sequence(self, flattened train target array with size N_all (the sum of all the series sequences) and number of targets start_times: List[pd.DatetimeIndex] start time of each training series - time_features_train: Dict[pd.Timestamp, np.ndarray]: - time features for each possible start training times X_test: Optional[np.ndarray (N_all_test, N_feature)] flattened test feature array with size N_all_test (the sum of all the series sequences) and N_feature, number of features @@ -731,7 +731,7 @@ def transform_data_into_time_series_sequence(self, if Y_test is not None: Y_test = (Y_test[mean.columns] - mean) / std - sequence_datasets, train_tensors = self.make_sequences_datasets( + sequence_datasets, train_tensors, test_tensors = self.make_sequences_datasets( X=X, Y=Y, X_test=X_test, Y_test=Y_test, start_times=start_times, @@ -739,7 +739,7 @@ def transform_data_into_time_series_sequence(self, is_test_set=is_test_set, dataset_with_future_features=dataset_with_future_features, **self.sequences_builder_kwargs) - return sequence_datasets, train_tensors, sequence_lengths + return sequence_datasets, train_tensors, test_tensors, sequence_lengths @staticmethod def make_sequences_datasets(X: Optional[pd.DataFrame], @@ -752,7 +752,8 @@ def make_sequences_datasets(X: Optional[pd.DataFrame], dataset_with_future_features: bool = False, **sequences_kwargs: Optional[Dict]) -> Tuple[ List[TimeSeriesSequence], - Tuple[Optional[pd.DataFrame], pd.DataFrame] + Tuple[Optional[pd.DataFrame], pd.DataFrame], + Optional[Tuple[Optional[pd.DataFrame], pd.DataFrame]] ]: """ build a series time sequence datasets @@ -780,7 +781,9 @@ def make_sequences_datasets(X: Optional[pd.DataFrame], Returns: sequence_datasets : List[TimeSeriesSequence] a list of datasets - train_tensors: Tuple[List[np.ndarray], List[np.ndarray]] + train_tensors: Tuple[pd.DataFrame, pd.DataFrame] + training tensors + train_tensors: Optional[Tuple[pd.DataFrame, pd.DataFrame]] training tensors """ @@ -791,6 +794,7 @@ def make_sequences_datasets(X: Optional[pd.DataFrame], x_group = X.groupby(X.index) if Y_test is not None: y_test_group = Y_test.groupby(Y_test.index) + if X_test is not None: x_test_group = X_test.groupby(X_test.index) @@ -814,8 +818,10 @@ def make_sequences_datasets(X: Optional[pd.DataFrame], sequence_datasets.append(sequence) train_tensors = (X, Y) + # we could guarantee 
that Y_test has shape [len(seq) * n_prediction_steps, num_targets] + test_tensors = (None, Y_test.values) if Y_test is not None else None - return sequence_datasets, train_tensors + return sequence_datasets, train_tensors, test_tensors def replace_data(self, X_train: pd.DataFrame, diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 4f082d6de..27d4ed7f4 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -727,7 +727,7 @@ def _get_pipeline(self) -> BaseEstimator: raise ValueError("Invalid configuration entered") return pipeline - def _loss(self, y_true: np.ndarray, y_hat: np.ndarray, **loss_kwargs: Dict) -> Dict[str, float]: + def _loss(self, y_true: np.ndarray, y_hat: np.ndarray, **metric_kwargs: Dict) -> Dict[str, float]: """SMAC follows a minimization goal, so the make_scorer sign is used as a guide to obtain the value to reduce. The calculate_loss internally translate a score function to @@ -753,12 +753,12 @@ def _loss(self, y_true: np.ndarray, y_hat: np.ndarray, **loss_kwargs: Dict) -> D else: metrics = [self.metric] return calculate_loss( - y_true, y_hat, self.task_type, metrics, **loss_kwargs) + y_true, y_hat, self.task_type, metrics, **metric_kwargs) def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], opt_pred: np.ndarray, valid_pred: Optional[np.ndarray], test_pred: Optional[np.ndarray], additional_run_info: Optional[Dict], - file_output: bool, status: StatusType + file_output: bool, status: StatusType, **metric_kwargs: Dict ) -> Optional[Tuple[float, float, int, Dict]]: """This function does everything necessary after the fitting is done: @@ -788,6 +788,8 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], Whether or not this pipeline should output information to disk status (StatusType) The status of the run, following SMAC StatusType syntax. 
+ metric_kwargs (Dict) + Additional arguments for computing metrics Returns: duration (float): @@ -810,7 +812,7 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], additional_run_info_ = {} validation_loss, test_loss = self.calculate_auxiliary_losses( - valid_pred, test_pred + valid_pred, test_pred, **metric_kwargs ) if loss_ is not None: @@ -842,6 +844,7 @@ def calculate_auxiliary_losses( self, Y_valid_pred: np.ndarray, Y_test_pred: np.ndarray, + **metric_kwargs: Dict ) -> Tuple[Optional[Dict[str, float]], Optional[Dict[str, float]]]: """ A helper function to calculate the performance estimate of the @@ -854,6 +857,8 @@ def calculate_auxiliary_losses( Y_test_pred (np.ndarray): predictions on a test set provided by the user, matching self.y_test + metric_kwargs (Dict) + additional argument for evaluating the loss metric Returns: validation_loss_dict (Optional[Dict[str, float]]): @@ -866,12 +871,12 @@ def calculate_auxiliary_losses( if Y_valid_pred is not None: if self.y_valid is not None: - validation_loss_dict = self._loss(self.y_valid, Y_valid_pred) + validation_loss_dict = self._loss(self.y_valid, Y_valid_pred, **metric_kwargs) test_loss_dict: Optional[Dict[str, float]] = None if Y_test_pred is not None: if self.y_test is not None: - test_loss_dict = self._loss(self.y_test, Y_test_pred) + test_loss_dict = self._loss(self.y_test, Y_test_pred, **metric_kwargs) return validation_loss_dict, test_loss_dict diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 61b2a8869..9a5225ecd 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -73,7 +73,7 @@ def __init__(self, backend: Backend, queue: Queue, self.datamanager = backend.load_datamanager() self.n_prediction_steps = self.datamanager.n_prediction_steps self.num_sequences = self.datamanager.num_sequences - self.num_targets = self.datamanager.num_target + self.num_targets = self.datamanager.num_targets self.seq_length_min = np.min(self.num_sequences) seasonality = SEASONALITY_MAP.get(self.datamanager.freq, 1) if isinstance(seasonality, list): @@ -82,6 +82,7 @@ def __init__(self, backend: Backend, queue: Queue, self.max_budget = max_budget self.min_num_test_instances = min_num_test_instances + self.eval_test_tensors = True def fit_predict_and_loss(self) -> None: """Fit, predict and compute the loss for cross-validation and @@ -132,6 +133,7 @@ def fit_predict_and_loss(self) -> None: additional_run_info=additional_run_info, file_output=True, status=status, + **forecasting_kwargs ) else: @@ -157,6 +159,8 @@ def fit_predict_and_loss(self) -> None: mase_coefficient_all.append(mase_coefficient) for i, (train_split, test_split) in enumerate(self.splits): + if i > 0: + self.eval_test_tensors = False pipeline = self.pipelines[i] train_pred, opt_pred, valid_pred, test_pred = self._fit_and_predict(pipeline, i, @@ -222,13 +226,19 @@ def fit_predict_and_loss(self) -> None: if Y_targets[i] is not None ]) - if self.X_valid is not None: + if self.y_valid is not None: warnings.warn('valid_pred is current unsuported for fore casting tasks!') Y_valid_preds = None - if self.X_test is not None: - warnings.warn('test_pred is current unsuported for fore casting tasks!') - Y_test_preds = None + if self.y_test is not None: + Y_test_preds = np.array([Y_test_pred[i] * mase_coefficient_all[0] + for i in range(self.num_folds) + if Y_test_pred[i] is not 
None]) + # Average the predictions of several pipelines + if len(Y_test_preds.shape) == 3: + Y_test_preds = np.nanmean(Y_test_preds, axis=0) + else: + Y_test_preds = None self.Y_optimization = Y_targets self.Y_actual_train = Y_train_targets @@ -246,6 +256,7 @@ def fit_predict_and_loss(self) -> None: additional_run_info=additional_run_info, file_output=True, status=status, + **forecasting_kwargs, ) def generate_mase_coefficient_for_validation(self, test_split: Sequence) -> np.ndarray: @@ -317,12 +328,14 @@ def _predict(self, pipeline: BaseEstimator, opt_pred = opt_pred.reshape(-1, self.num_targets) - if self.X_valid is not None: + if self.y_valid is not None: warnings.warn('valid_pred is current unsuported for fore casting tasks!') valid_pred = None - if self.X_test is not None: - warnings.warn('test_pred is current unsuported for fore casting tasks!') - test_pred = None + if self.y_test is not None and self.eval_test_tensors: + test_seq = self.datamanager.generate_test_seqs() + test_pred = self.predict_function(test_seq, pipeline).reshape(-1, self.num_targets) + else: + test_pred = None return np.empty(1), opt_pred, valid_pred, test_pred diff --git a/test/conftest.py b/test/conftest.py index 73daa9c65..6e7ba471d 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -708,7 +708,7 @@ def generate_forecasting_features(feature_type, length): return features, targets, input_validator.fit(features, targets, start_times=start_times) -def get_forecasting_datamangaer(X, y, validator, forecast_horizon=5, freq='1D'): +def get_forecasting_datamangaer(X, y, validator, with_y_test=True, forecast_horizon=5, freq='1D'): if X is not None: X_test = [] for x in X: @@ -721,9 +721,19 @@ def get_forecasting_datamangaer(X, y, validator, forecast_horizon=5, freq='1D'): else: X_test = None known_future_features = None + if with_y_test: + y_test = [] + for y_seq in y: + if hasattr(y_seq, 'iloc'): + y_test.append(y_seq.iloc[-forecast_horizon:].copy() + 1) + else: + y_test.append(y_seq[-forecast_horizon:].copy() + 1) + else: + y_test = None datamanager = TimeSeriesForecastingDataset( X=X, Y=y, X_test=X_test, + Y_test=y_test, validator=validator, freq=freq, n_prediction_steps=forecast_horizon, diff --git a/test/test_datasets/test_time_series_datasets.py b/test/test_datasets/test_time_series_datasets.py index f5300cda2..55457d149 100644 --- a/test/test_datasets/test_time_series_datasets.py +++ b/test/test_datasets/test_time_series_datasets.py @@ -316,8 +316,20 @@ def test_update_dataset(backend, get_fit_dictionary_forecasting): assert test_seq.X.shape[0] - seq_len == 2 * datamanager.n_prediction_steps -def test_splits(): +@pytest.mark.parametrize("get_fit_dictionary_forecasting", ['multi_variant_wo_missing'], indirect=True) +def test_test_tensors(backend, get_fit_dictionary_forecasting): + datamanager: TimeSeriesForecastingDataset = backend.load_datamanager() + test_tensors = datamanager.test_tensors + forecast_horizon = datamanager.n_prediction_steps + n_seq = len(datamanager.datasets) + assert test_tensors[0] is None + assert test_tensors[1].shape == (n_seq * forecast_horizon, datamanager.num_targets) + datamanager2 = TimeSeriesForecastingDataset(X=None, Y=[[1, 2]]) + assert datamanager2.test_tensors is None + + +def test_splits(): y = [np.arange(100 + i * 10) for i in range(10)] resampling_strategy_args = {'num_splits': 5} dataset = TimeSeriesForecastingDataset(None, y, diff --git a/test/test_evaluation/evaluation_util.py b/test/test_evaluation/evaluation_util.py index 02dab602d..71c6844f5 100644 --- 
a/test/test_evaluation/evaluation_util.py +++ b/test/test_evaluation/evaluation_util.py @@ -252,6 +252,7 @@ def get_forecasting_dataset(n_seq=10, X = [] targets = [] X_test = [] + Y_test = [] for i in range(n_seq): series_length = base_length + i * 10 @@ -259,8 +260,11 @@ def get_forecasting_dataset(n_seq=10, targets.append(np.arange(i * 1000, series_length + i * 1000)) X.append(targets[-1] - 1) X_test.append(np.arange(X[-1][-1] + 1, X[-1][-1] + 1 + n_prediction_steps)) + Y_test.append(np.arange(targets[-1][-1] + 1, targets[-1][-1] + 1 + n_prediction_steps)) + input_validator = TimeSeriesForecastingInputValidator(is_classification=False).fit(X, targets) return TimeSeriesForecastingDataset(X=X, Y=targets, X_test=X_test, + Y_test=Y_test, known_future_features=(0,), validator=input_validator, resampling_strategy=resampling_strategy, diff --git a/test/test_evaluation/test_forecasting_evaluators.py b/test/test_evaluation/test_forecasting_evaluators.py index 2afa56395..2423d400f 100644 --- a/test/test_evaluation/test_forecasting_evaluators.py +++ b/test/test_evaluation/test_forecasting_evaluators.py @@ -1,25 +1,19 @@ import multiprocessing import os import queue -import shutil import sys import unittest import unittest.mock -import pytest from ConfigSpace import Configuration import numpy as np -from sklearn.base import BaseEstimator - -from smac.tae import StatusType from autoPyTorch.automl_common.common.utils.backend import create -from autoPyTorch.datasets.resampling_strategy import CrossValTypes, NoResamplingStrategyTypes +from autoPyTorch.datasets.resampling_strategy import CrossValTypes from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator from autoPyTorch.evaluation.utils import read_queue -from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.training.metrics.metrics import mean_MASE_forecasting this_directory = os.path.dirname(__file__) @@ -34,6 +28,7 @@ from test_evaluators import TestTrainEvaluator + class BackendMock(object): def load_datamanager(self): return get_multiclass_classification_datamanager() @@ -77,25 +72,26 @@ def test_holdout(self, pipeline_mock): self.assertEqual(len(rval), 1) result = rval[0]['loss'] self.assertEqual(len(rval[0]), 3) + self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(evaluator.file_output.call_count, 1) self.assertEqual(result, 4592.0) self.assertEqual(pipeline_mock.fit.call_count, 1) - # As forecasting inference could be quite expensive, we only allow one validation prediction - self.assertEqual(pipeline_mock.predict.call_count, 1) + # As forecasting inference could be quite expensive, we only allow one opt prediction and test prediction + self.assertEqual(pipeline_mock.predict.call_count, 2) self.assertEqual(evaluator.file_output.call_count, 1) self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], len(D.splits[0][1]) * n_prediction_steps) self.assertIsNone(evaluator.file_output.call_args[0][1]) - self.assertIsNone(evaluator.file_output.call_args[0][2]) + + self.assertEqual(evaluator.file_output.call_args[0][2].shape[0], + D.test_tensors[1].shape[0]) self.assertEqual(evaluator.pipeline.fit.call_count, 1) res = evaluator.file_output.call_args[0][0].reshape(-1, n_prediction_steps, evaluator.num_targets) assert np.all(res == 0.) 
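For illustration of the cross-validation handling introduced in the evaluator changes above: per-fold test predictions are rescaled by the MASE coefficient and then reduced over the fold axis with np.nanmean before being written out. The following minimal sketch uses hypothetical sizes and random arrays (none of these names come from the patch) and only demonstrates that aggregation step, ignoring the fact that folds whose test prediction is None are filtered out first.

import numpy as np

# Hypothetical sizes: 3 folds, 4 sequences, horizon of 5, 2 targets.
n_folds, n_seq, horizon, n_targets = 3, 4, 5, 2

# Per-fold test predictions, already flattened to [n_seq * horizon, n_targets].
fold_preds = [np.random.rand(n_seq * horizon, n_targets) for _ in range(n_folds)]

# Rescaling factors (here all ones) playing the role of the first fold's MASE coefficients.
mase_coefficient = np.ones((n_seq * horizon, n_targets))

# Stack the rescaled folds -> shape [n_folds, n_seq * horizon, n_targets] ...
stacked = np.array([pred * mase_coefficient for pred in fold_preds])

# ... and average over the fold axis, ignoring NaN entries from individual folds.
averaged = np.nanmean(stacked, axis=0)
assert averaged.shape == (n_seq * horizon, n_targets)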
- - @unittest.mock.patch('autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline') def test_cv(self, pipeline_mock): D = get_forecasting_dataset(resampling_strategy=CrossValTypes.time_series_cross_validation) @@ -134,15 +130,16 @@ def test_cv(self, pipeline_mock): self.assertEqual(evaluator.file_output.call_count, 1) self.assertAlmostEqual(result, 4587.208333333334) self.assertEqual(pipeline_mock.fit.call_count, 3) - # 3 calls because of the 3 times validation evaluations - self.assertEqual(pipeline_mock.predict.call_count, 3) - # as the optimisation preds in cv is concatenation of the 5 folds, - # so it is 5*splits + # 3 calls because of the 3 times validation evaluations, however, we only evaluate test target once + self.assertEqual(pipeline_mock.predict.call_count, 4) + # as the optimisation preds in cv is concatenation of the 3 folds, + # so it is 3*splits self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 3 * len(D.splits[0][1]) * n_prediction_steps, evaluator.file_output.call_args) self.assertIsNone(evaluator.file_output.call_args[0][1]) # we do not have test sets - self.assertIsNone(evaluator.file_output.call_args[0][2]) + self.assertEqual(evaluator.file_output.call_args[0][2].shape[0], + D.test_tensors[1].shape[0]) res = evaluator.file_output.call_args[0][0].reshape(-1, n_prediction_steps, evaluator.num_targets) assert np.all(res == 0.) From e8cf8cbe67e79e1e6b48f135486a449ac3fe7375 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 10 May 2022 12:58:29 +0200 Subject: [PATCH 252/347] tests on losses --- autoPyTorch/datasets/time_series_dataset.py | 4 +- .../pipeline/components/training/losses.py | 88 ++++++------ test/conftest.py | 10 ++ test/test_pipeline/test_losses.py | 130 +++++++++++++----- 4 files changed, 154 insertions(+), 78 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index f010412ac..5a290039e 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -516,13 +516,13 @@ def __init__(self, self.output_type: str = type_of_target(self.train_tensors[1][0].fillna(method="pad")) if self.output_type in ["binary", "multiclass"]: + # TODO in the future we also want forecasting classification task, we need to find a way to distinguish + # TODO these tasks with the integral forecasting tasks! 
self.output_type = "continuous" if STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS: num_targets = len(np.unique(Y)) - # self.output_shape = len(np.unique(Y)) else: - # self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1 num_targets = Y.shape[-1] if Y.ndim > 1 else 1 self.output_shape = [self.n_prediction_steps, num_targets] else: diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index f896fa3cb..27b692cef 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -16,7 +16,7 @@ MASELoss: supports continuous output types L1Loss: supports continuous output types """ -from typing import Any, Dict, Optional, Type, List +from typing import Any, Dict, Optional, Type, List, Union import torch from torch.nn.modules.loss import ( @@ -31,73 +31,77 @@ FORECASTING_TASKS, STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, TASK_TYPES_TO_STRING -class LogProbLoss(Loss): +class AbstractForecastingLoss(Loss): __constants__ = ['reduction'] def __init__(self, reduction: str = 'mean') -> None: - super(LogProbLoss, self).__init__(reduction=reduction) + super(AbstractForecastingLoss, self).__init__(reduction=reduction) - def forward(self, input_dist: torch.distributions.Distribution, target_tensor: torch.Tensor) -> torch.Tensor: - scores = input_dist.log_prob(target_tensor) + def aggregate_loss(self, loss_values: torch.Tensor) -> torch.Tensor: if self.reduction == 'mean': - return - scores.mean() + return loss_values.mean() elif self.reduction == 'sum': - return - scores.sum() + return loss_values.sum() else: - return -scores + return loss_values -class MAPELoss(Loss): - __constants__ = ['reduction'] +class LogProbLoss(AbstractForecastingLoss): + def forward(self, input_dist: torch.distributions.Distribution, target_tensor: torch.Tensor) -> torch.Tensor: + scores = input_dist.log_prob(target_tensor) + return self.aggregate_loss(-scores) - def __init__(self, reduction: str = 'mean') -> None: - super(MAPELoss, self).__init__(reduction=reduction) - def forward(self, predictions: torch.distributions.Distribution, target_tensor: torch.Tensor) -> torch.Tensor: +class MAPELoss(AbstractForecastingLoss): + def forward(self, predictions: torch.Tensor, target_tensor: torch.Tensor) -> torch.Tensor: # https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/model/n_beats/_network.py denominator = torch.abs(target_tensor) diff = torch.abs(predictions - target_tensor) flag = (denominator == 0).float() - loss = (diff * (1 - flag)) / (denominator + flag) - - if self.reduction == 'mean': - return loss.mean() - elif self.reduction == 'sum': - return loss.sum() - else: - return loss + mape = (diff * (1 - flag)) / (denominator + flag) + return self.aggregate_loss(mape) -class MASELoss(Loss): - __constants__ = ['reduction'] +class MASELoss(AbstractForecastingLoss): def __init__(self, reduction: str = 'mean') -> None: - super(MASELoss, self).__init__(reduce=reduction) - self._mase_coefficient = 1.0 + super(MASELoss, self).__init__(reduction=reduction) + self._mase_coefficient: Union[float, torch.Tensor] = 1.0 def set_mase_coefficient(self, mase_coefficient: torch.Tensor) -> 'MASELoss': + """ + set mase coefficient for computing MASE losses + Args: + mase_coefficient (torch.Tensor): mase coefficient, its dimensions corresponds to [B, L, N] and can be + broadcasted + + Returns: + + """ if len(mase_coefficient.shape) == 2: mase_coefficient = 
mase_coefficient.unsqueeze(1) + self._mase_coefficient = mase_coefficient return self def forward(self, - predictions: torch.distributions.Distribution, + predictions: torch.Tensor, target_tensor: torch.Tensor) -> torch.Tensor: - loss = torch.abs(predictions - target_tensor) * self._mase_coefficient - if self.reduction == 'mean': - return loss.mean() - elif self.reduction == 'sum': - return loss.sum() - else: - return loss - - -class QuantileLoss(Loss): - __constants__ = ['reduction'] - + if isinstance(self._mase_coefficient, torch.Tensor): + mase_shape = self._mase_coefficient.shape + pred_shape = predictions.shape + if len(mase_shape) == len(pred_shape): + if mase_shape[0] != pred_shape[0] or mase_shape[-1] != pred_shape[-1]: + raise ValueError(f"If self._mase_coefficient is a Tensor, it must have the same batch size and " + f"num_targets as the predictions, However, their shapes are {mase_shape}" + f"(self._mase_coefficient) and {pred_shape}(pred_shape)") + loss_values = torch.abs(predictions - target_tensor) * self._mase_coefficient + return self.aggregate_loss(loss_values) + + +class QuantileLoss(AbstractForecastingLoss): def __init__(self, reduction: str = 'mean', quantiles: List[float] = [0.5], loss_weights=None) -> None: super(QuantileLoss, self).__init__(reduction=reduction) self.quantiles = quantiles @@ -118,12 +122,8 @@ def forward(self, losses_all = torch.mean(torch.concat(losses_all, dim=-1), dim=-1) - if self.reduction == 'mean': - return losses_all.mean() - elif self.reduction == 'sum': - return losses_all.sum() - else: - return losses_all + return self.aggregate_loss(losses_all) + losses = dict( classification=dict( diff --git a/test/conftest.py b/test/conftest.py index 6e7ba471d..b58f56765 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -463,6 +463,16 @@ def loss_mse(): return dataset_properties, predictions, name, targets, labels +@pytest.fixture +def loss_mape(): + dataset_properties = {'task_type': 'time_series_forecasting', 'output_type': 'continuous'} + predictions = torch.randn(4) + name = 'MAPELoss' + targets = torch.randn(4) + labels = None + return dataset_properties, predictions, name, targets, labels + + @pytest.fixture def loss_details(request): return request.getfixturevalue(request.param) diff --git a/test/test_pipeline/test_losses.py b/test/test_pipeline/test_losses.py index 3eeba6a70..43bbcb587 100644 --- a/test/test_pipeline/test_losses.py +++ b/test/test_pipeline/test_losses.py @@ -6,7 +6,14 @@ from torch import nn from torch.nn.modules.loss import _Loss as Loss -from autoPyTorch.pipeline.components.training.losses import get_loss, losses +from autoPyTorch.pipeline.components.training.losses import ( + get_loss, + losses, + LogProbLoss, + MAPELoss, + MASELoss, + QuantileLoss +) from autoPyTorch.utils.implementations import ( LossWeightStrategyWeighted, LossWeightStrategyWeightedBinary, @@ -45,7 +52,8 @@ def test_get_name_error(): @pytest.mark.parametrize('loss_details', ['loss_cross_entropy_multiclass', 'loss_cross_entropy_binary', 'loss_bce', - 'loss_mse'], indirect=True) + 'loss_mse', + 'loss_mape'], indirect=True) def test_losses(weighted, loss_details): dataset_properties, predictions, name, targets, labels = loss_details loss = get_loss(dataset_properties=dataset_properties, name=name) @@ -66,6 +74,7 @@ def test_losses(weighted, loss_details): def test_loss_dict(): assert 'classification' in losses.keys() assert 'regression' in losses.keys() + assert 'forecasting' in losses.keys() for task in losses.values(): for loss in task.values(): assert 
'module' in loss.keys() @@ -76,29 +85,29 @@ def test_loss_dict(): @pytest.mark.parametrize('target,expected_weights', [ ( - # Expected 4 classes where first one is majority one - np.array([[1, 0, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]), - # We reduce the contribution of the first class which has double elements - np.array([0.5, 1., 1., 1.]), + # Expected 4 classes where first one is majority one + np.array([[1, 0, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]), + # We reduce the contribution of the first class which has double elements + np.array([0.5, 1., 1., 1.]), ), ( - # Expected 2 classes -- multilable format - np.array([[1, 0], [1, 0], [1, 0], [0, 1]]), - # We reduce the contribution of the first class which 3 to 1 ratio - np.array([2 / 3, 2]), + # Expected 2 classes -- multilable format + np.array([[1, 0], [1, 0], [1, 0], [0, 1]]), + # We reduce the contribution of the first class which 3 to 1 ratio + np.array([2 / 3, 2]), ), ( - # Expected 2 classes -- (-1, 1) format - np.array([[1], [1], [1], [0]]), - # We reduce the contribution of the second class, which has a 3 to 1 ratio - np.array([2, 2 / 3]), + # Expected 2 classes -- (-1, 1) format + np.array([[1], [1], [1], [0]]), + # We reduce the contribution of the second class, which has a 3 to 1 ratio + np.array([2, 2 / 3]), ), ( - # Expected 2 classes -- single column - # We have to reduce the contribution of the second class with 5 to 1 ratio - np.array([1, 1, 1, 1, 1, 0]), - # We reduce the contribution of the first class which has double elements - np.array([3, 6 / 10]), + # Expected 2 classes -- single column + # We have to reduce the contribution of the second class with 5 to 1 ratio + np.array([1, 1, 1, 1, 1, 0]), + # We reduce the contribution of the first class which has double elements + np.array([3, 6 / 10]), ), ]) def test_lossweightstrategyweighted(target, expected_weights): @@ -113,23 +122,23 @@ def test_lossweightstrategyweighted(target, expected_weights): @pytest.mark.parametrize('target,expected_weights', [ ( - # Expected 2 classes -- multilable format - np.array([[1, 0], [1, 0], [1, 0], [0, 1]]), - # We reduce the contribution of the first class which 3 to 1 ratio - np.array([1 / 3, 3]), + # Expected 2 classes -- multilable format + np.array([[1, 0], [1, 0], [1, 0], [0, 1]]), + # We reduce the contribution of the first class which 3 to 1 ratio + np.array([1 / 3, 3]), ), ( - # Expected 2 classes -- (-1, 1) format - np.array([[1], [1], [1], [0]]), - # We reduce the contribution of the second class, which has a 3 to 1 ratio - np.array([1 / 3]), + # Expected 2 classes -- (-1, 1) format + np.array([[1], [1], [1], [0]]), + # We reduce the contribution of the second class, which has a 3 to 1 ratio + np.array([1 / 3]), ), ( - # Expected 2 classes -- single column - # We have to reduce the contribution of the second class with 5 to 1 ratio - np.array([1, 1, 1, 1, 1, 0]), - # We reduce the contribution of the first class which has double elements - np.array([0.2]), + # Expected 2 classes -- single column + # We have to reduce the contribution of the second class with 5 to 1 ratio + np.array([1, 1, 1, 1, 1, 0]), + # We reduce the contribution of the first class which has double elements + np.array([0.2]), ), ]) def test_lossweightstrategyweightedbinary(target, expected_weights): @@ -139,3 +148,60 @@ def test_lossweightstrategyweightedbinary(target, expected_weights): torch.from_numpy(target).float(), torch.from_numpy(target).float(), ) > 0 + + +def test_forecasting_losses(): + target_dims = [2, 3, 1] + targets = 
torch.Tensor([[0.0, 1.0, 2.0], + [0.0, 0.0, 0.0]]).reshape(target_dims) + prediction_prob = torch.distributions.normal.Normal( + torch.zeros(2, 3, 1), + torch.ones(2, 3, 1) + ) + prediction_value = torch.Tensor([[[0.0, 0.0, 0.0], + [0.5, 0.5, 0.5]]] + ).reshape(target_dims) + + log_prob_loss_raw = LogProbLoss(reduction="raw") + loss_prob_raw = log_prob_loss_raw(prediction_prob, targets) + assert torch.allclose(loss_prob_raw, - prediction_prob.log_prob(targets)) + + log_prob_loss_mean = LogProbLoss(reduction="mean") + loss_prob_mean = log_prob_loss_mean(prediction_prob, targets) + assert loss_prob_mean == torch.mean(loss_prob_raw) + + log_prob_loss_sum = LogProbLoss(reduction="sum") + loss_prob_sum = log_prob_loss_sum(prediction_prob, targets) + assert loss_prob_sum == torch.sum(loss_prob_raw) + + mape_loss = MAPELoss(reduction="raw") + loss_mape = mape_loss(prediction_value, targets) + assert torch.allclose(loss_mape, torch.Tensor([[0., 1., 1.], [0., 0., 0.]]).reshape(target_dims)) + + mase_loss = MASELoss(reduction="raw") + loss_mase_1 = mase_loss(prediction_value, targets) + assert torch.allclose(loss_mase_1, torch.Tensor([[0., 1., 2.], [0.5, 0.5, 0.5]]).reshape(target_dims)) + + mase_loss.set_mase_coefficient(torch.Tensor([[2.0], [1.0]])) + loss_mase_2 = mase_loss(prediction_value, targets) + assert torch.allclose(loss_mase_2, torch.Tensor([[0., 2., 4.], [0.5, 0.5, 0.5]]).reshape(target_dims)) + + mase_loss.set_mase_coefficient(torch.Tensor([[2.0, 2.0]])) + with pytest.raises(ValueError, match="If self._mase_coefficient is a Tensor"): + _ = mase_loss(prediction_value, targets) + + quantile_loss = QuantileLoss(reduction="raw") + diff = 0.5 + quantile_prediction = [ + targets + diff + ] + loss_quantile_1 = quantile_loss(quantile_prediction, targets) + assert torch.all(loss_quantile_1 == diff / 2) + + quantiles = [0.1, 0.5, 0.8] + quantile_loss.set_quantiles([0.1, 0.5, 0.8]) + quantile_prediction = [ + targets - diff, targets - diff, targets - diff + ] + loss_quantile_2 = quantile_loss(quantile_prediction, targets) + assert torch.allclose(loss_quantile_2, torch.ones_like(loss_quantile_2) * diff * np.mean(quantiles)) From e5b1c473d6444fee8ddf2c5618bdc17d2bd471c0 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 10 May 2022 15:17:47 +0200 Subject: [PATCH 253/347] test for metrics --- .../components/training/metrics/base.py | 8 +- .../components/training/metrics/metrics.py | 12 +- .../components/training/metrics/utils.py | 6 + test/test_pipeline/test_metrics.py | 104 +++++++++++++++++- 4 files changed, 119 insertions(+), 11 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/metrics/base.py b/autoPyTorch/pipeline/components/training/metrics/base.py index 3b102416e..56432407f 100644 --- a/autoPyTorch/pipeline/components/training/metrics/base.py +++ b/autoPyTorch/pipeline/components/training/metrics/base.py @@ -201,16 +201,16 @@ def __call__( sample_weight: Optional[List[float]] = None, **kwarg: Dict, ) -> float: - """Evaluate time series forecastin losses given input data + """Evaluate time series forecasting losses given input data The description is nearly the same as the one defined under https://www.sktime.org/en/stable/api_reference/performance_metrics.html Parameters ---------- - y_true : array-like + y_true : array-like, [n_seq x n_prediction_steps, n_output] Ground truth (correct) target values. - y_pred : array-like, [n_samples x n_classes] + y_pred : array-like, [n_seq x n_prediction_steps, n_output] Forecasted values. 
sp: int @@ -271,7 +271,7 @@ def __call__( elif agg == 'median': return self._sign * np.median(losses_all) else: - raise ValueError(f'Unsupported aggregation type {agg}') + raise NotImplementedError(f'Unsupported aggregation type {agg}') def make_metric( diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index b950d8ce0..d8ade9f28 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -53,11 +53,10 @@ # Standard Forecasting Scores - # To avoid storing unnecessary scale values here, we scale all the values under # AutoPytorch.evaluation.time_series_forecasting_train_evaluator -def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> float: +def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> np.ndarray: """ compute mase coefficient, then mase value is computed as mase_coefficient * mse_error, this function aims at reducing the memroy requirement @@ -69,15 +68,18 @@ def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> f mase_coefficient: inverse of mase_denominator """ past_target = np.nan_to_num(past_target) + max_past_target_abs = np.max(np.abs(past_target)) + if max_past_target_abs == 0.: + return np.asarray(1.) if sp >= len(past_target): # in this case, we simply consider the mean value of the entire sequence - # TODO condsider if there is a better way of handling this + # TODO consider if there is a better way of handling this try: mase_denominator = forecasting_metrics.mean_absolute_error(past_target, np.zeros_like(past_target), multioutput="raw_values") except ValueError: - return 1 + return np.asarray(1.) else: mase_denominator = forecasting_metrics.mean_absolute_error(past_target[sp:], @@ -85,7 +87,7 @@ def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> f multioutput="raw_values") return np.where(mase_denominator == 0.0, - np.zeros_like(mase_denominator), + np.min([1., 1. / max_past_target_abs]), 1.0 / np.maximum(mase_denominator, forecasting_metrics._functions.EPS) ) diff --git a/autoPyTorch/pipeline/components/training/metrics/utils.py b/autoPyTorch/pipeline/components/training/metrics/utils.py index 5d4b70d58..b6b781884 100644 --- a/autoPyTorch/pipeline/components/training/metrics/utils.py +++ b/autoPyTorch/pipeline/components/training/metrics/utils.py @@ -127,6 +127,12 @@ def calculate_score( cprediction = sanitize_array(prediction) for metric_ in metrics: if metric_ in MASE_LOSSES and 'mase_coefficient' in score_kwargs: + mase_coe_shape = score_kwargs['mase_coefficient'].shape + target_shape = target.shape + if mase_coe_shape[0] != target_shape[0] or mase_coe_shape[-1] != target_shape[-1]: + raise ValueError(f"the shape of MASE coefficient and target_shape must be consistent in the " + f"first and last dimension. 
However, their shapes are {mase_coe_shape}" + f"(MASE coefficient) and {target_shape} (targets)") target_scaled = target * score_kwargs['mase_coefficient'] cprediction_scaled = cprediction * score_kwargs['mase_coefficient'] score_dict[metric_.name] = metric_._sign * metric_(target_scaled, cprediction_scaled, **score_kwargs) diff --git a/test/test_pipeline/test_metrics.py b/test/test_pipeline/test_metrics.py index 1f9889807..b3b3cd386 100644 --- a/test/test_pipeline/test_metrics.py +++ b/test/test_pipeline/test_metrics.py @@ -3,7 +3,7 @@ import pytest import sklearn.metrics - +import sktime.performance_metrics.forecasting as forecasting_metrics from autoPyTorch.constants import ( BINARY, @@ -12,13 +12,20 @@ STRING_TO_TASK_TYPES, TABULAR_CLASSIFICATION, TABULAR_REGRESSION, + TIMESERIES_FORECASTING, TASK_TYPES_TO_STRING ) -from autoPyTorch.metrics import accuracy, balanced_accuracy, mean_squared_error +from autoPyTorch.metrics import (accuracy, + balanced_accuracy, + mean_squared_error, + compute_mase_coefficient, + median_MAPE_forecasting) from autoPyTorch.pipeline.components.training.metrics.base import ( _PredictMetric, _ThresholdMetric, + _ForecastingMetric, autoPyTorchMetric, + ForecastingMetricMixin, make_metric, ) from autoPyTorch.pipeline.components.training.metrics.utils import ( @@ -48,6 +55,15 @@ def test_get_no_name_regression(output_type): assert isinstance(metric, autoPyTorchMetric) +@pytest.mark.parametrize('output_type', ['continuous', 'continuous-multioutput']) +def test_get_no_name_regression(output_type): + dataset_properties = {'task_type': 'time_series_forecasting', + 'output_type': output_type} + metrics = get_metrics(dataset_properties) + for metric in metrics: + assert isinstance(metric, ForecastingMetricMixin) + + @pytest.mark.parametrize('metric', ['accuracy', 'average_precision', 'balanced_accuracy', 'f1']) def test_get_name(metric): @@ -96,6 +112,37 @@ def test_regression_metrics(): assert isinstance(score, float) +def test_forecasting_metric(): + # test of all regression metrics + dataset_properties = {'task_type': TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING], + 'output_type': OUTPUT_TYPES_TO_STRING[CONTINUOUS]} + n_prediction_steps = 5 + n_seq = 2 + n_targets = 2 + + y_target = np.zeros([n_seq, n_prediction_steps, n_targets]) + y_pred = np.ones([n_seq, n_prediction_steps, n_targets]) + mase_coefficient = np.ones([n_seq, n_prediction_steps, n_targets]) * 2 + metrics = get_metrics(dataset_properties=dataset_properties, all_supported_metrics=True) + forecasting_kwargs = {'sp': 4, + 'n_prediction_steps': n_prediction_steps, + 'mase_coefficient': mase_coefficient, + } + score_dict = calculate_score(y_pred, y_target, STRING_TO_TASK_TYPES[dataset_properties['task_type']], metrics, + **forecasting_kwargs) + assert isinstance(score_dict, dict) + for name, score in score_dict.items(): + assert isinstance(name, str) + assert isinstance(score, float) + forecasting_kwargs = {'sp': 4, + 'n_prediction_steps': n_prediction_steps, + 'mase_coefficient': np.ones([1, n_prediction_steps, n_targets]), + } + with pytest.raises(ValueError, match="the shape of MASE coefficient and target_shape must be consistent"): + score_dict = calculate_score(y_pred, y_target, STRING_TO_TASK_TYPES[dataset_properties['task_type']], metrics, + **forecasting_kwargs) + + def test_predictmetric_binary(): y_true = np.array([0, 0, 1, 1]) y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) @@ -155,6 +202,42 @@ def test_threshold_scorer_binary(): assert score == pytest.approx(-1.0) +def 
test_forecastingcomputation(): + scorer_mean = _ForecastingMetric( + 'mean_mape', forecasting_metrics.mean_absolute_percentage_error, 0.0, np.finfo(np.float64).max, 1, + kwargs=dict(aggregation='mean'), + ) + scorer_median = _ForecastingMetric( + 'median_mape', forecasting_metrics.mean_absolute_percentage_error, 0.0, np.finfo(np.float64).max, 1, + kwargs=dict(aggregation='median'), + ) + + n_seq = 3 + n_prediction_steps = 5 + n_targets = 2 + + y_true = np.expand_dims([np.arange(n_prediction_steps) + i * 10 for i in range(n_seq)], -1).repeat(n_targets, axis=-1) + y_pred = y_true + 1 + score_mean = scorer_mean(y_true=y_true, y_pred=y_pred, sp=1, n_prediction_steps=n_prediction_steps) + score_median = scorer_median(y_true=y_true, y_pred=y_pred, sp=1, n_prediction_steps=n_prediction_steps) + + score_all = [] + for true_seq, pred_seq in zip(y_true, y_pred): + score_all.append(forecasting_metrics.mean_absolute_percentage_error(y_true=true_seq, y_pred=pred_seq)) + assert score_mean == np.mean(score_all) + assert score_median == np.median(score_all) + + # Additional parameters + horizon_weight = [0.1, 0.2, 0.3, 0.4, 0.5] + score_mean = scorer_mean(y_true=y_true, y_pred=y_pred, sp=1, + n_prediction_steps=n_prediction_steps, horizon_weight=horizon_weight) + score_all = [] + for true_seq, pred_seq in zip(y_true, y_pred): + score_all.append(forecasting_metrics.mean_absolute_percentage_error(y_true=true_seq, y_pred=pred_seq, + horizon_weight=horizon_weight)) + assert score_mean == np.mean(score_all) + + def test_sign_flip(): y_true = np.arange(0, 1.01, 0.1) y_pred = y_true.copy() @@ -184,6 +267,7 @@ def test_sign_flip(): assert score == pytest.approx(-1.0) + def test_classification_only_metric(): y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) y_pred = \ @@ -255,3 +339,19 @@ def test_calculate_loss(): task_type=TABULAR_REGRESSION, metrics=[mean_squared_error], )['mean_squared_error'] + + +def test_compute_mase_coefficient(): + past_target = np.arange(12) + mase_value_1 = compute_mase_coefficient(past_target, 15) + assert mase_value_1 == 1 / np.mean(past_target) + mase_value_2 = compute_mase_coefficient(past_target, 5) + assert mase_value_2 == 0.2 + + past_target = np.ones(12) * 2 + assert compute_mase_coefficient(past_target, 15) == 0.5 + assert compute_mase_coefficient(past_target, 5) == 0.5 + + past_target = np.zeros(12) + assert compute_mase_coefficient(past_target, 15) == 1. + assert compute_mase_coefficient(past_target, 5) == 1. 
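The metrics patch above computes the MASE denominator once per series and stores only its inverse; at scoring time calculate_score multiplies both targets and predictions by that coefficient, so an absolute-error metric on the scaled arrays yields a MASE-style value. The snippet below is a minimal sketch of that relationship under simplifying assumptions (a single target column, seasonal period sp=1, and none of the zero-denominator safeguards from compute_mase_coefficient); all variable names are illustrative only.

import numpy as np

# Hypothetical training history and forecasts for one series.
past_target = np.arange(12.0)            # in-sample values used for the denominator
y_true = np.array([12.0, 13.0, 14.0])    # future ground truth
y_pred = np.array([13.0, 13.0, 13.0])    # forecasts

# MASE denominator with sp=1: mean absolute one-step naive error on the history.
mase_denominator = np.mean(np.abs(past_target[1:] - past_target[:-1]))
mase_coefficient = 1.0 / mase_denominator  # the quantity the patch stores per series

# Scaling both arrays by the coefficient turns a plain mean absolute error into MASE,
# mirroring how calculate_score applies score_kwargs['mase_coefficient'].
mase = np.mean(np.abs(y_true * mase_coefficient - y_pred * mase_coefficient))
assert np.isclose(mase, np.mean(np.abs(y_true - y_pred)) / mase_denominator)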
From 3f47489444deb52ebbc647452a91b7a219627ed4 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 10 May 2022 22:55:37 +0200 Subject: [PATCH 254/347] forecasting preprocessing --- .../encoding/NoEncoder.py | 12 + .../encoding/OneHotEncoder.py | 4 +- .../imputation/TimeSeriesImputer.py | 68 +++-- .../imputation/base_time_series_imputer.py | 41 --- .../scaling/base_scaler.py | 2 +- .../scaling/utils.py | 63 ++-- test/conftest.py | 2 +- .../preprocessing/forecasting/__init__.py | 0 .../preprocessing/forecasting/base.py | 42 +++ .../forecasting/test_encoder_choice.py | 25 ++ .../forecasting/test_encoders.py | 94 ++++++ .../preprocessing/forecasting/test_imputer.py | 274 ++++++++++++++++++ .../preprocessing/forecasting/test_scaling.py | 197 +++++++++++++ .../test_time_series_transformer.py | 45 +++ 14 files changed, 775 insertions(+), 94 deletions(-) delete mode 100644 autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/base_time_series_imputer.py create mode 100644 test/test_pipeline/components/preprocessing/forecasting/__init__.py create mode 100644 test/test_pipeline/components/preprocessing/forecasting/base.py create mode 100644 test/test_pipeline/components/preprocessing/forecasting/test_encoder_choice.py create mode 100644 test/test_pipeline/components/preprocessing/forecasting/test_encoders.py create mode 100644 test/test_pipeline/components/preprocessing/forecasting/test_imputer.py create mode 100644 test/test_pipeline/components/preprocessing/forecasting/test_scaling.py create mode 100644 test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py index 72f49183c..cb48b4134 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py @@ -30,3 +30,15 @@ def get_properties( 'name': 'Time Series No Encoder', 'handles_sparse': True } + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the self into the 'X' dictionary and returns it. 
+ Args: + X (Dict[str, Any]): 'X' dictionary + + Returns: + (Dict[str, Any]): the updated 'X' dictionary + """ + return NoEncoder.transform(self, X) + diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py index 807a6ca19..cb616dfb9 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py @@ -25,8 +25,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> TimeSeriesBaseEncoder: if len(n_features_cat) == 0: n_features_cat = self.preprocessor['categorical'].categories - for cat_column in categorical_columns: - feature_shapes[feature_names[cat_column]] = len(n_features_cat[cat_column]) + for i, cat_column in enumerate(categorical_columns): + feature_shapes[feature_names[cat_column]] = len(n_features_cat[i]) self.feature_shapes = feature_shapes return self diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py index b81978db1..16837f640 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py @@ -1,23 +1,23 @@ from typing import Any, Dict, List, Optional import numpy as np +from sklearn.base import BaseEstimator from sktime.transformations.series.impute import Imputer from ConfigSpace import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation. \ - base_time_series_imputer import BaseTimeSeriesImputer from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( autoPyTorchTimeSeriesPreprocessingComponent, autoPyTorchTimeSeriesTargetPreprocessingComponent ) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.utils.common import HyperparameterSearchSpace +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter -class TimeSeriesFeatureImputer(BaseTimeSeriesImputer, autoPyTorchTimeSeriesPreprocessingComponent): +class TimeSeriesFeatureImputer(autoPyTorchTimeSeriesPreprocessingComponent): def __init__(self, random_state: Optional[np.random.RandomState] = None, imputation_strategy: str = 'mean'): @@ -27,7 +27,7 @@ def __init__(self, self.add_fit_requirements([ FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)]) - def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseTimeSeriesImputer: + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseEstimator: """ Builds the preprocessor based on the given fit dictionary 'X'. 
@@ -47,10 +47,9 @@ def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseTimeSeriesImput if isinstance(numerical_columns, List) and len(numerical_columns) > 0: if self.imputation_strategy == 'constant_zero': imputer = Imputer(method='constant', random_state=self.random_state, value=0) - self.preprocessor['numerical'] = imputer else: imputer = Imputer(method=self.imputation_strategy, random_state=self.random_state) - self.preprocessor['numerical'] = imputer + self.preprocessor['numerical'] = imputer return self @@ -74,28 +73,45 @@ def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, imputation_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter='imputation_strategy', - value_range=("drift", "linear", "nearest", "constant_zero", "mean", "median", "bfill", "ffill"), + value_range=("drift", "linear", "nearest", "constant_zero", "bfill", "ffill"), default_value="drift", ), ) -> ConfigurationSpace: - if dataset_properties.get('features_have_missing_values', False): - cs = super().get_hyperparameter_search_space(dataset_properties, imputation_strategy) - else: - cs = ConfigurationSpace() + """Get the hyperparameter search space for the Time Series Imputator + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + imputation_strategy (HyperparameterSearchSpace: default = ...) + The strategy to use for imputation, its hyperparameters are defined by sktime + + Returns: + ConfigurationSpace + The space of possible configurations for a Time Series Imputor with the given + `dataset_properties` + """ + if dataset_properties is None: + raise ValueError("TimeSeriesFeatureImputer requires `dataset_properties` for generating" + " a search space.") + + cs = ConfigurationSpace() + if (dataset_properties.get('features_have_missing_values', True) + and isinstance(dataset_properties['numerical_columns'], List) + and len(dataset_properties['numerical_columns']) != 0 + ): + add_hyperparameter(cs, imputation_strategy, CategoricalHyperparameter) return cs -class TimeSeriesTargetImputer(BaseTimeSeriesImputer, autoPyTorchTimeSeriesTargetPreprocessingComponent): +class TimeSeriesTargetImputer(autoPyTorchTimeSeriesTargetPreprocessingComponent): def __init__(self, random_state: Optional[np.random.RandomState] = None, imputation_strategy: str = 'mean', ): super().__init__() self.random_state = random_state self.imputation_strategy = imputation_strategy - self.add_fit_requirements([ - FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)]) - def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseTimeSeriesImputer: + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseEstimator: """ Builds the preprocessor based on the given fit dictionary 'X'. 
@@ -112,10 +128,9 @@ def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseTimeSeriesImput # Forecasting tasks always have numerical outputs (TODO add support for categorical HPs) if self.imputation_strategy == 'constant_zero': imputer = Imputer(method='constant', random_state=self.random_state, value=0) - self.preprocessor['target_numerical'] = imputer else: imputer = Imputer(method=self.imputation_strategy, random_state=self.random_state) - self.preprocessor['target_numerical'] = imputer + self.preprocessor['target_numerical'] = imputer return self @@ -128,10 +143,10 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: (Dict[str, Any]): the updated 'X' dictionary """ - if self.preprocessor['target_numerical'] is None and len(X["dataset_properties"]["numerical_columns"]) != 0: + if self.preprocessor['target_numerical'] is None: raise ValueError("cant call transform on {} without fitting first." .format(self.__class__.__name__)) - X.update({'imputer': self.preprocessor}) + X.update({'target_imputer': self.preprocessor}) return X @staticmethod @@ -156,8 +171,11 @@ def get_hyperparameter_search_space( Returns: """ - if dataset_properties.get('features_have_missing_values', False): - cs = super().get_hyperparameter_search_space(dataset_properties, imputation_strategy) - else: - cs = ConfigurationSpace() - return cs + if dataset_properties is None: + raise ValueError("TimeSeriesTargetImputer requires `dataset_properties` for generating" + " a search space.") + + cs = ConfigurationSpace() + if dataset_properties.get('targets_have_missing_values', True): + add_hyperparameter(cs, imputation_strategy, CategoricalHyperparameter) + return cs \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/base_time_series_imputer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/base_time_series_imputer.py deleted file mode 100644 index 883e6af56..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/base_time_series_imputer.py +++ /dev/null @@ -1,41 +0,0 @@ -from typing import Any, Dict, Optional - -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter - -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter - - -class BaseTimeSeriesImputer: - def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> "BaseTimeSeriesImputer": - raise NotImplementedError - - def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: - raise NotImplementedError - - @staticmethod - def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - imputation_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter='imputation_strategy', - value_range=("drift", "linear", "nearest", "constant_zero", "mean", "median", "bfill", "ffill"), - default_value="drift", - ), - ) -> ConfigurationSpace: - """Get the hyperparameter search space for the Time Series Imputator - - Args: - dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) - Properties that describe the dataset - imputation_strategy (HyperparameterSearchSpace: default = ...) 
- The strategy to use for imputation, its hyperparameters are defined by sktime - - Returns: - ConfigurationSpace - The space of possible configurations for a Time Series Imputor with the given - `dataset_properties` - """ - cs = ConfigurationSpace() - add_hyperparameter(cs, imputation_strategy, CategoricalHyperparameter) - return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py index aad545a3b..bfc9aab2e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py @@ -24,7 +24,7 @@ def __init__(self, scaling_mode: str = 'standard'): super().__init__() self.add_fit_requirements([ - FitRequirement('numerical_features', (List,), user_defined=True, dataset_property=True), + FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True) ]) self.random_state = random_state diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index 91e6d2215..8e5193b1e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -10,7 +10,7 @@ # https://github.com/tslearn-team/tslearn/blob/a3cf3bf/tslearn/preprocessing/preprocessing.py class TimeSeriesScaler(BaseEstimator): def __init__(self, mode: str, - dataset_is_small_preprocess: bool = False, + dataset_is_small_preprocess: bool = True, static_features: Tuple[Union[str, int]] = ()): self.mode = mode self.dataset_is_small_preprocess = dataset_is_small_preprocess @@ -24,7 +24,7 @@ def fit(self, X: pd.DataFrame, y: Any = None) -> "TimeSeriesScaler": self.static_features = static_features return self - def transform(self, X: pd.DataFrame) -> Tuple[np.ndarray, ...]: + def transform(self, X: Union[pd.DataFrame, np.ndarray]) -> Tuple[np.ndarray, ...]: """ X = sklearn.utils.check_array( X, @@ -44,15 +44,18 @@ def transform(self, X: pd.DataFrame) -> Tuple[np.ndarray, ...]: # for static features, if we do normalization w.r.t. each group, then they will become the same values, # thus we treat them differently: normalize with the entire dataset - self.scale[self.static_features] = self.loc[self.static_features].std() - self.loc[self.static_features] = self.loc[self.static_features].mean() - else: - self.loc = X.mean() - self.scale = X.std() + self.scale[self.static_features] = X[self.static_features].std().fillna(0.0) + self.loc[self.static_features] = X[self.static_features].mean() + + # ensure that if all the values are the same in a group, we could still normalize them correctly + self.scale[self.scale == 0] = 1. - # ensure that if all the values are the same in a group, we could still normalize them correctly - self.scale.mask(self.scale == 0.0, self.loc) - self.scale[self.scale == 0] = 1. + else: + # in this case X is a np array + self.loc = X.mean(axis=0, keepdims=True) + self.scale = np.nan_to_num(X.std(axis=0, ddof=1, keepdims=True)) + self.scale = np.where(self.scale == 0, self.loc, self.scale) + self.scale[self.scale == 0] = 1. 
return (X - self.loc) / self.scale @@ -65,24 +68,32 @@ def transform(self, X: pd.DataFrame) -> Tuple[np.ndarray, ...]: min_[self.static_features] = min_[self.static_features].min() max_[self.static_features] = max_[self.static_features].max() + diff_ = max_ - min_ + self.loc = min_ + self.scale = diff_ + self.scale.mask(self.scale == 0.0, self.loc) + self.scale[self.scale == 0.0] = 1.0 + else: - min_ = X.min() - max_ = X.max() - - diff_ = max_ - min_ - self.loc = min_ - self.scale = diff_ - self.scale.mask(self.scale == 0.0, self.loc) - self.scale[self.scale == 0.0] = 1.0 + min_ = X.min(axis=0, keepdims=True) + max_ = X.max(axis=0, keepdims=True) + + diff_ = max_ - min_ + self.loc = min_ + self.scale = diff_ + self.scale = np.where(self.scale == 0., self.loc, self.scale) + self.scale[self.scale == 0.0] = 1.0 + return (X - self.loc) / self.scale elif self.mode == "max_abs": - X_abs = X.transform("abs") if self.dataset_is_small_preprocess: - max_abs_ = X_abs.groupby(X_abs.index).transform("max") + X_abs = X.transform("abs") + max_abs_ = X_abs.groupby(X_abs.index).agg("max") max_abs_[self.static_features] = max_abs_[self.static_features].max() else: - max_abs_ = X_abs.max() + X_abs = np.abs(X) + max_abs_ = X_abs.max(0, keepdims=True) max_abs_[max_abs_ == 0.0] = 1.0 self.loc = None @@ -91,15 +102,19 @@ def transform(self, X: pd.DataFrame) -> Tuple[np.ndarray, ...]: return X / self.scale elif self.mode == 'mean_abs': - X_abs = X.transform("abs") if self.dataset_is_small_preprocess: + X_abs = X.transform("abs") X_abs = X_abs.groupby(X_abs.index) mean_abs_ = X_abs.agg("mean") mean_abs_[self.static_features] = mean_abs_[self.static_features].mean() + self.scale = mean_abs_.mask(mean_abs_ == 0.0, X_abs.agg("max")) else: - mean_abs_ = X_abs.mean() + X_abs = np.abs(X) + mean_abs_ = X_abs.mean(0, keepdims=True) + self.scale = np.where(mean_abs_ == 0.0, np.max(X_abs), mean_abs_) + + self.scale[self.scale == 0] = 1 self.loc = None - self.scale = mean_abs_.mask(mean_abs_ == 0.0, X_abs.agg("max")) return X / self.scale diff --git a/test/conftest.py b/test/conftest.py index b58f56765..7e2249d39 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -640,7 +640,7 @@ def get_forecasting_data(request): uni_variant = True elif request == 'uni_variant_w_missing': uni_variant = True - targets_with_missing_value = True + with_missing_values = True elif request == 'multi_variant_wo_missing': with_missing_values = False elif request == 'multi_variant_w_missing': diff --git a/test/test_pipeline/components/preprocessing/forecasting/__init__.py b/test/test_pipeline/components/preprocessing/forecasting/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_pipeline/components/preprocessing/forecasting/base.py b/test/test_pipeline/components/preprocessing/forecasting/base.py new file mode 100644 index 000000000..5928acc85 --- /dev/null +++ b/test/test_pipeline/components/preprocessing/forecasting/base.py @@ -0,0 +1,42 @@ +from typing import Any, Dict, List, Optional, Tuple + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( + TimeSeriesTransformer, TimeSeriesTargetTransformer +) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding import ( + TimeSeriesEncoderChoice +) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.TimeSeriesImputer import ( + TimeSeriesFeatureImputer, + 
TimeSeriesTargetImputer, +) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler +from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline + + +class ForecastingPipeline(TimeSeriesForecastingPipeline): + def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], + ) -> List[Tuple[str, autoPyTorchChoice]]: + """ + Defines what steps a pipeline should follow. + The step itself has choices given via autoPyTorchChoice. + + Returns: + List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised + by the pipeline. + """ + steps: List[Tuple[str, autoPyTorchChoice]] = [] + + default_dataset_properties = {'target_type': 'time_series_forecasting'} + if dataset_properties is not None: + default_dataset_properties.update(dataset_properties) + + steps.extend([("imputer", TimeSeriesFeatureImputer(random_state=self.random_state)), + ("scaler", BaseScaler(random_state=self.random_state)), + ('encoding', TimeSeriesEncoderChoice(default_dataset_properties, + random_state=self.random_state)), + ("time_series_transformer", TimeSeriesTransformer(random_state=self.random_state)), + ("target_imputer", TimeSeriesTargetImputer(random_state=self.random_state)), + ]) + return steps diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_encoder_choice.py b/test/test_pipeline/components/preprocessing/forecasting/test_encoder_choice.py new file mode 100644 index 000000000..424b8898e --- /dev/null +++ b/test/test_pipeline/components/preprocessing/forecasting/test_encoder_choice.py @@ -0,0 +1,25 @@ +import copy +import unittest + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding import ( + TimeSeriesEncoderChoice +) + + +class TestEncoderChoice(unittest.TestCase): + def test_get_set_config_space(self): + """Make sure that we can setup a valid choice in the encoder + choice""" + dataset_properties = {'numerical_columns': list(range(4)), 'categorical_columns': [5]} + encoder_choice = TimeSeriesEncoderChoice(dataset_properties) + cs = encoder_choice.get_hyperparameter_search_space() + + # Make sure that all hyperparameters are part of the search space + self.assertListEqual( + sorted(cs.get_hyperparameter('__choice__').choices), + sorted(list(encoder_choice.get_components().keys())) + ) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_encoders.py b/test/test_pipeline/components/preprocessing/forecasting/test_encoders.py new file mode 100644 index 000000000..e071c08e2 --- /dev/null +++ b/test/test_pipeline/components/preprocessing/forecasting/test_encoders.py @@ -0,0 +1,94 @@ +import unittest + +import pandas as pd +import numpy as np +from numpy.testing import assert_array_equal + +from sklearn.base import BaseEstimator +from sklearn.compose import make_column_transformer + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.NoEncoder import ( + TimeSeriesNoEncoder +) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.OneHotEncoder import ( + TimeSeriesOneHotEncoder +) + + +class TestEncoders(unittest.TestCase): + def setUp(self) -> None: + data = np.array([[1, 'male', 1], + [1, 'female', 2], + [1, 'unknown', 2], + [2, 'male', 2], + [2, 'female', 2]]) + feature_names = ("feature_n1", "feature_c", "feature_n2") + + self.data = pd.DataFrame(data, columns=feature_names) + + 
categorical_columns = [1] + numerical_columns = [0, 2] + self.train_indices = np.array([0, 1, 2]) + self.test_indices = np.array([3, 4]) + + self.dataset_properties = { + 'categorical_columns': categorical_columns, + 'numerical_columns': numerical_columns, + 'categories': [['female', 'male', 'unknown']], + 'feature_names': feature_names, + 'feature_shapes': {fea: 1 for fea in feature_names} + } + + def test_one_hot_encoder_no_unknown(self): + X = { + 'X_train': self.data.iloc[self.train_indices], + 'dataset_properties': self.dataset_properties + } + encoder_component = TimeSeriesOneHotEncoder() + encoder_component.fit(X) + X = encoder_component.transform(X) + encoder = X['encoder']['categorical'] + + # check if the fit dictionary X is modified as expected + self.assertIsInstance(X['encoder'], dict) + self.assertIsInstance(encoder, BaseEstimator) + self.assertIsNone(X['encoder']['numerical']) + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((encoder, X['dataset_properties']['categorical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(self.data.iloc[self.test_indices]) + # check if the transform is correct + + assert_array_equal(transformed.tolist(), [[0.0, 1.0, 0.0, '2', '2'], [1.0, 0.0, 0.0, '2', '2']]) + + dataset_properties = X['dataset_properties'] + + idx_cat = 0 + for i, fea_name in enumerate(dataset_properties['feature_names']): + if i in dataset_properties['categorical_columns']: + self.assertEqual( dataset_properties['feature_shapes'][fea_name], + len(dataset_properties['categories'][idx_cat])) + idx_cat += 1 + else: + assert dataset_properties['feature_shapes'][fea_name] == 1 + + def test_none_encoder(self): + X = { + 'X_train': self.data.iloc[self.train_indices], + 'dataset_properties': self.dataset_properties + } + + encoder_component = TimeSeriesNoEncoder() + encoder_component.fit(X) + X = encoder_component.transform(X) + + # check if the fit dictionary X is modified as expected + self.assertIsInstance(X['encoder'], dict) + self.assertIsNone(X['encoder']['categorical']) + self.assertIsNone(X['encoder']['numerical']) + + dataset_properties = X['dataset_properties'] + for i, fea_name in enumerate(dataset_properties['feature_names']): + self.assertEqual( dataset_properties['feature_shapes'][fea_name], 1) \ No newline at end of file diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py b/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py new file mode 100644 index 000000000..9abee07ab --- /dev/null +++ b/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py @@ -0,0 +1,274 @@ +import unittest + +import numpy as np +import pandas as pd +from numpy.testing import assert_array_equal + +import pytest + +from sklearn.base import BaseEstimator, clone +from sklearn.compose import make_column_transformer +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.TimeSeriesImputer import ( + TimeSeriesFeatureImputer, + TimeSeriesTargetImputer, +) + + +class TestTimeSeriesFeatureImputer(unittest.TestCase): + def setUp(self) -> None: + data = np.array([[1.0, np.nan, 3], + [np.nan, 8, 9], + [4.0, 5, np.nan], + [np.nan, 2, 3], + [7.0, np.nan, 9], + [4.0, np.nan, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] + train_indices = np.array([0, 1, 2]) + self.test_indices = np.array([3, 4, 5]) + dataset_properties = { + 
'categorical_columns': categorical_columns, + 'numerical_columns': numerical_columns, + } + self.X = { + 'X_train': data[train_indices], + 'dataset_properties': dataset_properties + } + self.data = data + self.dataset_properties = dataset_properties + + def test_get_config_space(self): + dataset_properties = dict(categorical_columns=[0, 1], + numerical_columns=[1, 2], + features_have_missing_values=True) + config = TimeSeriesFeatureImputer.get_hyperparameter_search_space(dataset_properties).sample_configuration() + estimator = TimeSeriesFeatureImputer(**config) + estimator_clone = clone(estimator) + estimator_clone_params = estimator_clone.get_params() + + # Make sure all keys are copied properly + for k, v in estimator.get_params().items(): + self.assertIn(k, estimator_clone_params) + + # Make sure the params getter of estimator are honored + klass = estimator.__class__ + new_object_params = estimator.get_params(deep=False) + for name, param in new_object_params.items(): + new_object_params[name] = clone(param, safe=False) + new_object = klass(**new_object_params) + params_set = new_object.get_params(deep=False) + + for name in new_object_params: + param1 = new_object_params[name] + param2 = params_set[name] + self.assertEqual(param1, param2) + + dataset_properties['features_have_missing_values'] = False + cs = TimeSeriesFeatureImputer.get_hyperparameter_search_space(dataset_properties) + self.assertEqual(len(cs.get_hyperparameters()), 0) + + with self.assertRaises(ValueError): + TimeSeriesFeatureImputer.get_hyperparameter_search_space() + + def test_drift_imputation(self): + imputer_component = TimeSeriesFeatureImputer(imputation_strategy='drift') + data = pd.DataFrame(self.data) + + imputer_component = imputer_component.fit(self.X) + X = imputer_component.transform(self.X) + categorical_imputer = X['imputer']['categorical'] + numerical_imputer = X['imputer']['numerical'] + + # check if the fit dictionary X is modified as expected + self.assertIsInstance(X['imputer'], dict) + self.assertIsNone(categorical_imputer) + self.assertIsInstance(numerical_imputer, BaseEstimator) + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(data.iloc[self.test_indices]) + + self.assertTrue(np.allclose(transformed, np.array([[7.5, 2., 3.], + [7., 2., 9.], + [4, 2., 10.]]))) + + def test_linear_imputation(self): + imputer_component = TimeSeriesFeatureImputer(imputation_strategy='linear') + + imputer_component = imputer_component.fit(self.X) + X = imputer_component.transform(self.X) + numerical_imputer = X['imputer']['numerical'] + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(self.data[self.test_indices]) + + self.assertTrue(np.allclose(transformed, np.array([[7., 2., 3.], + [7., 2., 9.], + [4., 2., 9.]]))) + + def test_nearest_imputation(self): + data = np.array([[1.0, np.nan, 7], + [np.nan, 9, 10], + [10.0, 7, 7], + [9.0, np.nan, 11], + [9.0, 9, np.nan], + [np.nan, 5, 6], + [12.0, np.nan, 8], + [9.0, 7.0, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] + 
train_indices = np.array([0, 1, 2, 3, 4]) + test_indices = np.array([5, 6, 7]) + dataset_properties = { + 'categorical_columns': categorical_columns, + 'numerical_columns': numerical_columns, + } + X = { + 'X_train': data[train_indices], + 'dataset_properties': dataset_properties + } + imputer_component = TimeSeriesFeatureImputer(imputation_strategy='nearest') + + imputer_component = imputer_component.fit(X) + X = imputer_component.transform(X) + numerical_imputer = X['imputer']['numerical'] + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(data[test_indices]) + + assert_array_equal(transformed, np.array([[12., 5., 6.], + [12., 5, 8.], + [9., 7., 8]])) + + def test_constant_imputation(self): + imputer_component = TimeSeriesFeatureImputer(imputation_strategy='constant_zero') + + imputer_component = imputer_component.fit(self.X) + X = imputer_component.transform(self.X) + numerical_imputer = X['imputer']['numerical'] + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(self.data[self.test_indices]) + assert_array_equal(transformed, np.array([[0, 2, 3], + [7, 0, 9], + [4, 0, 0]])) + + def test_bfill_imputation(self): + imputer_component = TimeSeriesFeatureImputer(imputation_strategy='bfill') + + imputer_component = imputer_component.fit(self.X) + X = imputer_component.transform(self.X) + numerical_imputer = X['imputer']['numerical'] + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(self.data[self.test_indices]) + assert_array_equal(transformed, np.array([[7., 2, 3], + [7, 2., 9], + [4, 2., 9.]])) + + def test_ffill_imputation(self): + imputer_component = TimeSeriesFeatureImputer(imputation_strategy='ffill') + + imputer_component = imputer_component.fit(self.X) + X = imputer_component.transform(self.X) + numerical_imputer = X['imputer']['numerical'] + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(self.data[self.test_indices]) + assert_array_equal(transformed, np.array([[7, 2, 3], + [7, 2, 9], + [4, 2, 9]])) + + +class TestTimeSeriesTargetImputer(unittest.TestCase): + def test_get_config_space(self): + dataset_properties = dict(categorical_columns=[0, 1], + numerical_columns=[1, 2]) + config = TimeSeriesTargetImputer.get_hyperparameter_search_space(dataset_properties).sample_configuration() + estimator = TimeSeriesFeatureImputer(**config) + estimator_clone = clone(estimator) + estimator_clone_params = estimator_clone.get_params() + + # Make sure all keys are copied properly + for k, v in estimator.get_params().items(): + self.assertIn(k, 
estimator_clone_params) + + # Make sure the params getter of estimator are honored + klass = estimator.__class__ + new_object_params = estimator.get_params(deep=False) + for name, param in new_object_params.items(): + new_object_params[name] = clone(param, safe=False) + new_object = klass(**new_object_params) + params_set = new_object.get_params(deep=False) + + for name in new_object_params: + param1 = new_object_params[name] + param2 = params_set[name] + self.assertEqual(param1, param2) + + dataset_properties = dict(targets_have_missing_values=False) + cs = TimeSeriesTargetImputer.get_hyperparameter_search_space(dataset_properties) + self.assertEqual(len(cs.get_hyperparameters()), 0) + + with pytest.raises(ValueError): + TimeSeriesTargetImputer.get_hyperparameter_search_space() + + def test_ffill_imputation(self): + y = np.array([1.0, np.nan, 8, 9, 4.0, 5, np.nan]).reshape([-1, 1]) + numerical_columns = [0, 1, 2] + categorical_columns = [] + dataset_properties = { + 'categorical_columns': categorical_columns, + 'numerical_columns': numerical_columns, + } + self.X = { + 'y_train': y, + 'dataset_properties': dataset_properties + } + self.dataset_properties = dataset_properties + + imputer_component = TimeSeriesTargetImputer(imputation_strategy='ffill') + + imputer_component = imputer_component.fit(self.X) + + imputer_component = imputer_component.fit(self.X) + X = imputer_component.transform(self.X) + numerical_imputer = X['target_imputer']['target_numerical'] + + # check if the fit dictionary X is modified as expected + self.assertIsInstance(X['target_imputer'], dict) + self.assertIsInstance(numerical_imputer, BaseEstimator) + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((numerical_imputer, [0]), + remainder='passthrough') + column_transformer = column_transformer.fit(X['y_train']) + transformed = column_transformer.transform(y) + assert_array_equal(transformed, np.array([[1.], [1.], [8.], [9.], [4.], [5.], [5.]])) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py b/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py new file mode 100644 index 000000000..06430f236 --- /dev/null +++ b/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py @@ -0,0 +1,197 @@ +import unittest + +import numpy as np +import pandas as pd +from numpy.testing import assert_allclose + +from sklearn.base import BaseEstimator +from sklearn.compose import make_column_transformer + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import TimeSeriesScaler + + +class TestScaling(unittest.TestCase): + def setUp(self) -> None: + data_seq_1 = np.array([[1, 2, 3], + [0, 2, 3], + [2, 2, 3], + ]) + + data_seq_2 = np.array([[0, 1, 1], + [0, 1, 2], + [0, 1, 4], + [0, 1, 6] + ]) + + columns = ['f1', 's', 'f2'] + self.raw_data = [data_seq_1, data_seq_2] + self.data = pd.DataFrame(np.concatenate([data_seq_1, data_seq_2]), columns=columns, index=[0] * 3 + [1] * 4) + self.static_features = ('s',) + categorical_columns = list() + numerical_columns = [0, 1, 2] + + self.dataset_properties = {'categorical_columns': categorical_columns, + 'numerical_columns': numerical_columns, + 'static_features': self.static_features, + 'is_small_preprocess': True} + + def test_base_and_standard_scaler(self): + 
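+        """Fit a BaseScaler with scaling_mode='standard' and check that it registers a numerical
+        TimeSeriesScaler in the fit dictionary, and that the small-preprocess (grouped DataFrame)
+        and per-sequence (ndarray) code paths produce consistent standardised values."""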
scaler_component = BaseScaler(scaling_mode='standard') + X = { + 'X_train': self.data, + 'dataset_properties': self.dataset_properties + } + + scaler_component = scaler_component.fit(dict(dataset_properties=self.dataset_properties)) + X = scaler_component.transform(X) + + scaler: TimeSeriesScaler = X['scaler']['numerical'] + + # check if the fit dictionary X is modified as expected + self.assertIsInstance(X['scaler'], dict) + self.assertIsInstance(scaler, BaseEstimator) + self.assertIsNone(X['scaler']['categorical']) + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(self.data) + + self.assertTrue(np.allclose(transformed, np.asarray([[0., 1.06904497, 0.], + [-1., 1.06904497, 0.], + [1., 1.06904497, 0.], + [0., -0.80178373, -1.01472214], + [0., -0.80178373, -0.56373452], + [0., -0.80178373, 0.33824071], + [0., -0.80178373, 1.24021595]]))) + + transformer = column_transformer.named_transformers_['timeseriesscaler'] + self.assertTrue(np.allclose(transformer.loc.values, np.asarray([[1.0, 1.428571, 3.00], + [0.0, 1.428571, 3.25]]))) + + self.assertTrue(np.allclose(transformer.scale.values, np.asarray([[1.0, 0.534522, 1.000000], + [1.0, 0.534522, 2.217356]]))) + + # second column is static features, those it need to be the mean and std value across all sequences + scaler.dataset_is_small_preprocess = False + scaler = scaler.fit(self.data) + transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) + self.assertIsInstance(transformed_test, np.ndarray) + # should have the same value as the second part of transformed except for the static values + self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed[:, [0, -1]])) + self.assertTrue(np.all(transformed_test[:, 1] == 0.)) + + self.assertTrue(np.allclose(scaler.loc, np.asarray([[0., 1., 3.25]]))) + self.assertTrue(np.allclose(scaler.scale, np.asarray([[1., 1., 2.21735578]]))) + + def test_min_max(self): + scaler = TimeSeriesScaler(mode='min_max', + static_features=self.static_features + ) + + scaler = scaler.fit(self.data) + transformed_data = scaler.transform(self.data).values + self.assertTrue(np.allclose(transformed_data, np.asarray([[0.5, 1., 0.], + [0., 1., 0.], + [1., 1., 0.], + [0., 0., 0.], + [0., 0., 0.2], + [0., 0., 0.6], + [0., 0., 1.]]))) + self.assertTrue(np.allclose(scaler.loc.values, np.asarray([[0, 1, 3], + [0, 1, 1]]))) + + self.assertTrue(np.allclose(scaler.scale.values, np.asarray([[2, 1, 1], + [1, 1, 5]]))) + + scaler.dataset_is_small_preprocess = False + transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) + + self.assertTrue(np.allclose(transformed_data[:, [0, -1]], transformed_test[:, [0, -1]])) + self.assertTrue(np.all(transformed_test[:, 1] == 0.)) + self.assertTrue(np.allclose(scaler.loc, np.asarray([[0., 1., 1.]]))) + self.assertTrue(np.allclose(scaler.scale, np.asarray([[1., 1., 5.]]))) + + def test_max_abs_scaler(self): + scaler = TimeSeriesScaler(mode='max_abs', + static_features=self.static_features + ) + + scaler = scaler.fit(self.data) + transformed_data = scaler.transform(self.data).values + + self.assertTrue(np.allclose(transformed_data, np.asarray([[0.5, 1., 1.], + [0., 1., 1.], + [1., 1., 1.], + [0., 0.5, 0.16666667], + [0., 0.5, 0.33333333], + [0., 0.5, 0.66666667], + [0., 0.5, 
1.]]))) + self.assertIsNone(scaler.loc) + + self.assertTrue(np.allclose(scaler.scale.values, np.asarray([[2, 2, 3], + [1, 2, 6]]))) + + scaler.dataset_is_small_preprocess = False + transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) + + self.assertTrue(np.allclose(transformed_data[:, [0, -1]], transformed_test[:, [0, -1]])) + self.assertTrue(np.all(transformed_test[:, 1] == 1.)) + self.assertIsNone(scaler.loc) + self.assertTrue(np.allclose(scaler.scale, np.asarray([[1., 1., 6.]]))) + + def test_mean_abs_scaler(self): + scaler = TimeSeriesScaler(mode='mean_abs', + static_features=self.static_features + ) + + scaler = scaler.fit(self.data) + transformed_data = scaler.transform(self.data).values + + self.assertTrue(np.allclose(transformed_data, np.asarray([[1., 1.33333333, 1.], + [0., 1.33333333, 1.], + [2., 1.33333333, 1.], + [0., 0.66666667, 0.30769231], + [0., 0.66666667, 0.61538462], + [0., 0.66666667, 1.23076923], + [0., 0.66666667, 1.84615385]]))) + self.assertIsNone(scaler.loc) + + self.assertTrue(np.allclose(scaler.scale.values, np.asarray([[1., 1.5, 3.], + [1., 1.5, 3.25]]))) + scaler.dataset_is_small_preprocess = False + transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) + + self.assertTrue(np.allclose(transformed_data[:, [0, -1]], transformed_test[:, [0, -1]])) + self.assertTrue(np.all(transformed_test[:, 1] == 1.)) + self.assertIsNone(scaler.loc) + self.assertTrue(np.allclose(scaler.scale, np.asarray([[6., 1., 3.25]]))) + + def test_no_scaler(self): + scaler = TimeSeriesScaler(mode='mean_abs', + static_features=self.static_features + ) + + scaler = scaler.fit(self.data) + transformed_data = scaler.transform(self.data).values + + self.assertTrue(np.allclose(transformed_data, np.asarray([[1., 1.33333333, 1.], + [0., 1.33333333, 1.], + [2., 1.33333333, 1.], + [0., 0.66666667, 0.30769231], + [0., 0.66666667, 0.61538462], + [0., 0.66666667, 1.23076923], + [0., 0.66666667, 1.84615385]]))) + self.assertIsNone(scaler.loc) + + self.assertTrue(np.allclose(scaler.scale.values, np.asarray([[1., 1.5, 3.], + [1., 1.5, 3.25]]))) + scaler.dataset_is_small_preprocess = False + transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) + + self.assertTrue(np.allclose(transformed_data[:, [0, -1]], transformed_test[:, [0, -1]])) + self.assertTrue(np.all(transformed_test[:, 1] == 1.)) + self.assertIsNone(scaler.loc) + self.assertTrue(np.allclose(scaler.scale, np.asarray([[6., 1., 3.25]]))) diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py b/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py new file mode 100644 index 000000000..17a7e2cf6 --- /dev/null +++ b/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py @@ -0,0 +1,45 @@ +from test.test_pipeline.components.preprocessing.forecasting.base import TimeSeriesTransformer + +import numpy as np + +import pytest + +from scipy.sparse import csr_matrix + +from sklearn.compose import ColumnTransformer + +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( + TimeSeriesTransformer +) + + +@pytest.mark.parametrize("get_fit_dictionary_forecasting", ['uni_variant_wo_missing', + 'uni_variant_w_missing', + 'multi_variant_wo_missing', + 'multi_variant_w_missing'], indirect=True) +class TimeSeriesForecstingTransformer: + def test_tabular_preprocess(self, 
get_fit_dictionary_forecasting): + pipeline = TimeSeriesTransformer(dataset_properties=get_fit_dictionary_forecasting['dataset_properties']) + pipeline = pipeline.fit(get_fit_dictionary_forecasting) + X = pipeline.transform(get_fit_dictionary_forecasting) + column_transformer = X['tabular_transformer'] + + # check if transformer was added to fit dictionary + assert 'tabular_transformer' in X.keys() + # check if transformer is of expected type + # In this case we expect the tabular transformer not the actual column transformer + # as the later is not callable and runs into error in the compose transform + assert isinstance(column_transformer, TimeSeriesTransformer) + + data = column_transformer.preprocessor.fit_transform(X['X_train']) + assert isinstance(data, np.ndarray) + + # Make sure no columns are unintentionally dropped after preprocessing + if len(get_fit_dictionary_forecasting['dataset_properties']["numerical_columns"]) == 0: + categorical_pipeline = column_transformer.preprocessor.named_transformers_['categorical_pipeline'] + categorical_data = categorical_pipeline.transform(X['X_train']) + assert data.shape[1] == categorical_data.shape[1] + elif len(get_fit_dictionary_forecasting['dataset_properties']["categorical_columns"]) == 0: + numerical_pipeline = column_transformer.preprocessor.named_transformers_['numerical_pipeline'] + numerical_data = numerical_pipeline.transform(X['X_train']) + assert data.shape[1] == numerical_data.shape[1] From 835055d7962ad7d4e804e2b61b2080a79325d2c2 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 11 May 2022 20:00:38 +0200 Subject: [PATCH 255/347] maint --- .../time_series_forecasting_data_loader.py | 15 +++++++++------ .../training/data_loader/time_series_util.py | 1 - .../components/training/metrics/metrics.py | 4 ++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index baddedb9c..a1187f19f 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -3,7 +3,7 @@ from functools import partial from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, CategoricalHyperparameter, Constant +from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, CategoricalHyperparameter from ConfigSpace.conditions import EqualsCondition import numpy as np @@ -17,7 +17,8 @@ from autoPyTorch.datasets.time_series_dataset import ( TimeSeriesForecastingDataset, TimeSeriesSequence, - extract_feature_index) + extract_feature_index +) from autoPyTorch.utils.common import ( FitRequirement, HyperparameterSearchSpace, @@ -77,7 +78,6 @@ def __init__(self, # the time sequence should look like: [X, y, X, y, y] [test_data](values in tail is marked with X) # self.subseq_length = self.sample_interval * (self.window_size - 1) + 1 self.sample_strategy = sample_strategy - self.subseq_length = self.window_size self.num_batches_per_epoch = num_batches_per_epoch if num_batches_per_epoch is not None else np.inf self.padding_collector = None @@ -386,7 +386,7 @@ def get_test_data_loader(self) -> torch.utils.data.DataLoader: return self.test_data_loader @staticmethod - def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, + def 
get_hyperparameter_search_space(dataset_properties: Optional[Dict] = {}, batch_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="batch_size", value_range=(32, 320), @@ -448,6 +448,9 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, add_hyperparameter(cs, num_batch_per_epoch, UniformIntegerHyperparameter) add_hyperparameter(cs, sample_strategy, CategoricalHyperparameter) + if dataset_properties is None: + dataset_properties = {} + seq_length_max = dataset_properties.get('seq_length_max', np.inf) if seq_length_max <= window_size.value_range[1]: @@ -455,9 +458,9 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None, warnings.warn('The base window_size is larger than the maximal sequence length in the dataset,' 'we simply set it as a constant value with maximal sequence length') window_size = HyperparameterSearchSpace(hyperparameter=window_size.hyperparameter, - value_range=(seq_length_max,), + value_range=(1, seq_length_max), default_value=seq_length_max) - window_size = get_hyperparameter(window_size, Constant) + window_size = get_hyperparameter(window_size, UniformIntegerHyperparameter) else: window_size_value_range = window_size.value_range window_size = HyperparameterSearchSpace(hyperparameter='window_size', diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py index 5d4469223..808d07676 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py @@ -10,7 +10,6 @@ from autoPyTorch.datasets.base_dataset import TransformSubset from autoPyTorch.datasets.time_series_dataset import TimeSeriesSequence -from torch.nn.utils.rnn import pad_sequence class TestSequenceDataset(TransformSubset): diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index d8ade9f28..683a6d56a 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -70,7 +70,7 @@ def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> n past_target = np.nan_to_num(past_target) max_past_target_abs = np.max(np.abs(past_target)) if max_past_target_abs == 0.: - return np.asarray(1.) + return np.asarray([1.]) if sp >= len(past_target): # in this case, we simply consider the mean value of the entire sequence # TODO consider if there is a better way of handling this @@ -79,7 +79,7 @@ def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> n np.zeros_like(past_target), multioutput="raw_values") except ValueError: - return np.asarray(1.) 
+ return np.asarray([1.]) else: mase_denominator = forecasting_metrics.mean_absolute_error(past_target[sp:], From ef9e44ecd16e97a890bce4bc81af6fab35d321d1 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 11 May 2022 22:59:13 +0200 Subject: [PATCH 256/347] finish test for preprocessing --- .../TimeSeriesTransformer.py | 12 ++-- .../base_time_series_preprocessing.py | 30 -------- .../time_series_preprocessing/utils.py | 2 +- .../pipeline/time_series_forecasting.py | 16 +++-- test/conftest.py | 17 +++-- .../preprocessing/forecasting/base.py | 25 ++++--- .../preprocessing/forecasting/test_scaling.py | 24 +++---- .../test_time_series_transformer.py | 68 ++++++++++++------- 8 files changed, 105 insertions(+), 89 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py index 1639e7cd5..b58889dee 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py @@ -18,11 +18,11 @@ from autoPyTorch.utils.common import FitRequirement -class TimeSeriesTransformer(autoPyTorchTimeSeriesPreprocessingComponent): +class TimeSeriesFeatureTransformer(autoPyTorchTimeSeriesPreprocessingComponent): def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): super().__init__() self.random_state = random_state - self.preprocessor: Optional[Pipeline] = None + self.preprocessor: Optional[ColumnTransformer] = None self.add_fit_requirements([ FitRequirement('numerical_features', (List,), user_defined=True, dataset_property=True), FitRequirement('categorical_features', (List,), user_defined=True, dataset_property=True)]) @@ -78,11 +78,10 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: X (Dict[str, Any]): updated fit dictionary """ - X.update({'time_series_transformer': self}) + X.update({'time_series_feature_transformer': self}) return X def __call__(self, X: pd.DataFrame) -> pd.DataFrame: - if self.preprocessor is None: raise ValueError("cant call {} without fitting the column transformer first." 
.format(self.__class__.__name__)) @@ -104,6 +103,11 @@ def get_column_transformer(self) -> ColumnTransformer: class TimeSeriesTargetTransformer(autoPyTorchTimeSeriesTargetPreprocessingComponent): + def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): + super().__init__() + self.random_state = random_state + self.preprocessor: Optional[ColumnTransformer] = None + def fit(self, X: Dict[str, Any], y: Any = None) -> "TimeSeriesTransformer": """ Creates a column transformer for the chosen tabular diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py index 94f37c27b..f00cb95b5 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py @@ -18,21 +18,6 @@ def __init__(self) -> None: self.preprocessor: Union[Dict[str, Optional[BaseEstimator]], BaseEstimator] = dict( numerical=None, categorical=None) - def get_preprocessor_dict(self) -> Dict[str, BaseEstimator]: - """ - Returns early_preprocessor dictionary containing the sklearn numerical - and categorical early_preprocessor with "numerical" and "categorical" - keys. May contain None for a key if early_preprocessor does not - handle the datatype defined by key - - Returns: - Dict[str, BaseEstimator]: early_preprocessor dictionary - """ - if (self.preprocessor['numerical'] and self.preprocessor['categorical']) is None: - raise AttributeError("{} can't return early_preprocessor dict without fitting first" - .format(self.__class__.__name__)) - return self.preprocessor - def __str__(self) -> str: """ Allow a nice understanding of what components where used """ string = self.__class__.__name__ @@ -55,21 +40,6 @@ def __init__(self) -> None: self.preprocessor: Union[Dict[str, Optional[BaseEstimator]], BaseEstimator] = dict( numerical=None, categorical=None) - def get_preprocessor_dict(self) -> Dict[str, BaseEstimator]: - """ - Returns early_preprocessor dictionary containing the sklearn numerical - and categorical early_preprocessor with "numerical" and "categorical" - keys. 
May contain None for a key if early_preprocessor does not - handle the datatype defined by key - - Returns: - Dict[str, BaseEstimator]: early_preprocessor dictionary - """ - if (self.preprocessor['target_numerical'] and self.preprocessor['target_categorical']) is None: - raise AttributeError("{} can't return early_preprocessor dict without fitting first" - .format(self.__class__.__name__)) - return self.preprocessor - def __str__(self) -> str: """ Allow a nice understanding of what components where used """ string = self.__class__.__name__ diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py index e7e15ad8e..f376ceef2 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py @@ -42,7 +42,7 @@ def get_time_series_target_preprocessers(X: Dict[str, Any]) -> Dict[str, List[Ba Returns: (Dict[str, List[BaseEstimator]]): dictionary with list of numerical and categorical preprocessors """ - preprocessor = dict(numerical=list(), categorical=list()) # type: Dict[str, List[BaseEstimator]] + preprocessor = dict(target_numerical=list(), target_categorical=list()) # type: Dict[str, List[BaseEstimator]] for key, value in X.items(): if isinstance(value, dict): # as each preprocessor is child of BaseEstimator diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index c4a9b37c4..af9bbe081 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -19,7 +19,7 @@ from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( - TimeSeriesTransformer + TimeSeriesFeatureTransformer ) from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding import ( TimeSeriesEncoderChoice @@ -41,12 +41,17 @@ from autoPyTorch.pipeline.components.setup.network_initializer import ( NetworkInitializerChoice ) -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import \ +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import ( TargetScalerChoice +) +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetNoScaler import ( + TargetNoScaler +) from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice from autoPyTorch.pipeline.components.setup.forecasting_training_loss import ForecastingLossChoices -from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import \ +from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import ( TimeSeriesForecastingDataLoader +) from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer import ForecastingTrainerChoice from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -118,7 +123,8 @@ def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None) def fit(self, X: Dict[str, Any], y: Optional[np.ndarray] = None, **fit_params: Any) -> Pipeline: super().fit(X, y, **fit_params) - self.target_scaler = X['target_scaler'] + self.target_scaler = X.get('target_scaler', 
TargetNoScaler(self.random_state).fit(X)) + return self def _get_hyperparameter_search_space(self, dataset_properties: Dict[str, Any], @@ -331,7 +337,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L ("scaler", BaseScaler(random_state=self.random_state)), ('encoding', TimeSeriesEncoderChoice(default_dataset_properties, random_state=self.random_state)), - ("time_series_transformer", TimeSeriesTransformer(random_state=self.random_state)), + ("time_series_transformer", TimeSeriesFeatureTransformer(random_state=self.random_state)), ("preprocessing", TimeSeriesEarlyPreprocessing(random_state=self.random_state)), ]) diff --git a/test/conftest.py b/test/conftest.py index 7e2249d39..4949d85c9 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -645,8 +645,7 @@ def get_forecasting_data(request): with_missing_values = False elif request == 'multi_variant_w_missing': with_missing_values = True - else: - raise NotImplementedError + generator = check_random_state(0) n_seq = 10 base_length = 50 @@ -658,9 +657,19 @@ def get_forecasting_data(request): # for categorical features, the following character indicate how the feature is stored: # s: stored as string; n: stored as if type_X == 'pd': - feature_columns = ['n1', 'cs2_10', 'n3', 'cn4_5', 'n5'] + if 'only_cat' in request: + feature_columns = ['cs2_10', 'cn4_5'] + elif 'only_num' in request: + feature_columns = ['n1', 'n3', 'n5'] + else: + feature_columns = ['n1', 'cs2_10', 'n3', 'cn4_5', 'n5'] else: - feature_columns = ['n1', 'cn2_5', 'n3', 'cn4_5', 'n5'] + if 'only_cat' in request: + feature_columns = ['cn2_5', 'cn4_5'] + elif 'only_num' in request: + feature_columns = ['n1', 'n3', 'n5'] + else: + feature_columns = ['n1', 'cn2_5', 'n3', 'cn4_5', 'n5'] def generate_forecasting_features(feature_type, length): feature_type_content = list(feature_type) diff --git a/test/test_pipeline/components/preprocessing/forecasting/base.py b/test/test_pipeline/components/preprocessing/forecasting/base.py index 5928acc85..78e115959 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/base.py +++ b/test/test_pipeline/components/preprocessing/forecasting/base.py @@ -1,8 +1,9 @@ -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( - TimeSeriesTransformer, TimeSeriesTargetTransformer + TimeSeriesFeatureTransformer, TimeSeriesTargetTransformer ) from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding import ( TimeSeriesEncoderChoice @@ -26,17 +27,23 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised by the pipeline. 
""" - steps: List[Tuple[str, autoPyTorchChoice]] = [] + steps: List[Tuple[str, Union[autoPyTorchChoice, autoPyTorchComponent]]] = [] default_dataset_properties = {'target_type': 'time_series_forecasting'} if dataset_properties is not None: default_dataset_properties.update(dataset_properties) + if not default_dataset_properties['uni_variant']: - steps.extend([("imputer", TimeSeriesFeatureImputer(random_state=self.random_state)), - ("scaler", BaseScaler(random_state=self.random_state)), - ('encoding', TimeSeriesEncoderChoice(default_dataset_properties, - random_state=self.random_state)), - ("time_series_transformer", TimeSeriesTransformer(random_state=self.random_state)), - ("target_imputer", TimeSeriesTargetImputer(random_state=self.random_state)), + steps.extend([("imputer", TimeSeriesFeatureImputer(random_state=self.random_state)), + ("scaler", BaseScaler(random_state=self.random_state)), + ('encoding', TimeSeriesEncoderChoice(default_dataset_properties, + random_state=self.random_state)), + ("time_series_transformer", TimeSeriesFeatureTransformer(random_state=self.random_state)), + ]) + + steps.extend([("target_imputer", TimeSeriesTargetImputer(random_state=self.random_state)), + ("time_series_target_transformer", TimeSeriesTargetTransformer(random_state=self.random_state)), ]) + return steps + diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py b/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py index 06430f236..0c2ff1c0b 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py +++ b/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py @@ -170,28 +170,28 @@ def test_mean_abs_scaler(self): self.assertTrue(np.allclose(scaler.scale, np.asarray([[6., 1., 3.25]]))) def test_no_scaler(self): - scaler = TimeSeriesScaler(mode='mean_abs', + scaler = TimeSeriesScaler(mode='none', static_features=self.static_features ) scaler = scaler.fit(self.data) transformed_data = scaler.transform(self.data).values - self.assertTrue(np.allclose(transformed_data, np.asarray([[1., 1.33333333, 1.], - [0., 1.33333333, 1.], - [2., 1.33333333, 1.], - [0., 0.66666667, 0.30769231], - [0., 0.66666667, 0.61538462], - [0., 0.66666667, 1.23076923], - [0., 0.66666667, 1.84615385]]))) + self.assertTrue(np.allclose(transformed_data, self.data.values)) self.assertIsNone(scaler.loc) + self.assertIsNone(scaler.scale) - self.assertTrue(np.allclose(scaler.scale.values, np.asarray([[1., 1.5, 3.], - [1., 1.5, 3.25]]))) scaler.dataset_is_small_preprocess = False transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) self.assertTrue(np.allclose(transformed_data[:, [0, -1]], transformed_test[:, [0, -1]])) - self.assertTrue(np.all(transformed_test[:, 1] == 1.)) + self.assertTrue(np.allclose(transformed_test, np.concatenate(self.raw_data))) + self.assertIsNone(scaler.loc) - self.assertTrue(np.allclose(scaler.scale, np.asarray([[6., 1., 3.25]]))) + self.assertIsNone(scaler.scale) + + with self.assertRaises(ValueError): + scaler = TimeSeriesScaler(mode='random', + static_features=self.static_features + ) + _ = scaler.transform(self.data) diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py b/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py index 17a7e2cf6..3a6464124 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py +++ 
b/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py @@ -1,7 +1,6 @@ -from test.test_pipeline.components.preprocessing.forecasting.base import TimeSeriesTransformer +from test.test_pipeline.components.preprocessing.forecasting.base import ForecastingPipeline import numpy as np - import pytest from scipy.sparse import csr_matrix @@ -9,37 +8,58 @@ from sklearn.compose import ColumnTransformer from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( - TimeSeriesTransformer + TimeSeriesFeatureTransformer, + TimeSeriesTargetTransformer ) @pytest.mark.parametrize("get_fit_dictionary_forecasting", ['uni_variant_wo_missing', 'uni_variant_w_missing', 'multi_variant_wo_missing', - 'multi_variant_w_missing'], indirect=True) -class TimeSeriesForecstingTransformer: - def test_tabular_preprocess(self, get_fit_dictionary_forecasting): - pipeline = TimeSeriesTransformer(dataset_properties=get_fit_dictionary_forecasting['dataset_properties']) - pipeline = pipeline.fit(get_fit_dictionary_forecasting) - X = pipeline.transform(get_fit_dictionary_forecasting) - column_transformer = X['tabular_transformer'] - - # check if transformer was added to fit dictionary - assert 'tabular_transformer' in X.keys() - # check if transformer is of expected type - # In this case we expect the tabular transformer not the actual column transformer - # as the later is not callable and runs into error in the compose transform - assert isinstance(column_transformer, TimeSeriesTransformer) - - data = column_transformer.preprocessor.fit_transform(X['X_train']) - assert isinstance(data, np.ndarray) + 'multi_variant_w_missing', + 'multi_variant_w_missing_only_cat', + 'multi_variant_w_missing_only_num', + ], indirect=True) +def test_time_series_preprocess(get_fit_dictionary_forecasting): + pipeline = ForecastingPipeline(dataset_properties=get_fit_dictionary_forecasting['dataset_properties']) + pipeline = pipeline.fit(get_fit_dictionary_forecasting) + X = pipeline.transform(get_fit_dictionary_forecasting) + + assert 'time_series_target_transformer' in X.keys() + target_transformer = X['time_series_target_transformer'] + + # check if transformer is of expected type + # In this case we expect the tabular transformer not the actual column transformer + # as the later is not callable and runs into error in the compose transform + assert isinstance(target_transformer, TimeSeriesTargetTransformer) + + targets = target_transformer.preprocessor.fit_transform(X['y_train']) + assert isinstance(targets, np.ndarray) + + targets_2 = target_transformer(X['y_train']) + assert np.allclose(targets, targets_2) + + assert isinstance(target_transformer.get_target_transformer(), ColumnTransformer) + + if not X['dataset_properties']['uni_variant']: + assert 'time_series_feature_transformer' in X.keys() + time_series_feature_transformer = X['time_series_feature_transformer'] + assert isinstance(time_series_feature_transformer, TimeSeriesFeatureTransformer) + + features = time_series_feature_transformer.preprocessor.fit_transform(X['X_train']) + assert isinstance(features, np.ndarray) + + features_2 = time_series_feature_transformer(X['X_train']) + assert np.allclose(features, features_2) + + assert isinstance(time_series_feature_transformer.get_column_transformer(), ColumnTransformer) # Make sure no columns are unintentionally dropped after preprocessing if len(get_fit_dictionary_forecasting['dataset_properties']["numerical_columns"]) == 0: - categorical_pipeline = 
column_transformer.preprocessor.named_transformers_['categorical_pipeline'] + categorical_pipeline = time_series_feature_transformer.preprocessor.named_transformers_['categorical_pipeline'] categorical_data = categorical_pipeline.transform(X['X_train']) - assert data.shape[1] == categorical_data.shape[1] + assert features.shape[1] == categorical_data.shape[1] elif len(get_fit_dictionary_forecasting['dataset_properties']["categorical_columns"]) == 0: - numerical_pipeline = column_transformer.preprocessor.named_transformers_['numerical_pipeline'] + numerical_pipeline = time_series_feature_transformer.preprocessor.named_transformers_['numerical_pipeline'] numerical_data = numerical_pipeline.transform(X['X_train']) - assert data.shape[1] == numerical_data.shape[1] + assert features.shape[1] == numerical_data.shape[1] From 21b39580835c96151a4ab66a3f834144abffd4f8 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 12 May 2022 23:08:19 +0200 Subject: [PATCH 257/347] test for data loader --- autoPyTorch/datasets/time_series_dataset.py | 9 +- .../base_network_embedding.py | 4 +- .../time_series_forecasting_data_loader.py | 135 ++++-- .../training/data_loader/time_series_util.py | 8 +- .../training/test_time_series_data_loader.py | 411 ++++++++++++++++++ 5 files changed, 513 insertions(+), 54 deletions(-) create mode 100644 test/test_pipeline/components/training/test_time_series_data_loader.py diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 5a290039e..7105d5c12 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -1,5 +1,4 @@ import os -import pdb from typing import Any, Dict, List, Optional, Tuple, Union, cast from numbers import Real import uuid @@ -103,7 +102,7 @@ def __init__(self, Y_test: Optional[np.ndarray] = None, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, - n_prediction_steps: int = 0, + n_prediction_steps: int = 1, sp: int = 1, known_future_features_index: Optional[List[int]] = None, compute_mase_coefficient_value: bool = True, @@ -120,6 +119,8 @@ def __init__(self, """ self.n_prediction_steps = n_prediction_steps + if X is not None and X.ndim == 1: + X = X[:, np.newaxis] self.X = X self.Y = Y @@ -131,6 +132,9 @@ def __init__(self, self.X_val = None self.Y_val = None + if X_test is not None and X_test.ndim == 1: + X_test = X_test[:, np.newaxis] + self.X_test = X_test self.Y_tet = Y_test @@ -1136,7 +1140,6 @@ def create_holdout_val_split( splits = [[() for _ in range(len(self.datasets))] for _ in range(2)] idx_start = 0 for idx_seq, dataset in enumerate(self.datasets): - split = self.holdout_validators[holdout_val_type.name](self.random_state, val_share, indices=np.arange(len(dataset)) + idx_start, diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 22d453805..546b3fb9f 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -55,8 +55,8 @@ def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: if 'tabular_transformer' in X: numerical_column_transformer = X['tabular_transformer'].preprocessor. 
\
                named_transformers_['numerical_pipeline']
-        elif 'time_series_transformer' in X:
-            numerical_column_transformer = X['time_series_transformer'].preprocessor. \
+        elif 'time_series_feature_transformer' in X:
+            numerical_column_transformer = X['time_series_feature_transformer'].preprocessor. \
                 named_transformers_['numerical_pipeline']
         else:
             raise ValueError("Either a tabular or time_series transformer must be contained!")
diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py
index a1187f19f..cef9dc138 100644
--- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py
+++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py
@@ -52,7 +52,7 @@ def __init__(self,
                  window_size: int = 1,
                  num_batches_per_epoch: Optional[int] = 50,
                  n_prediction_steps: int = 1,
-                 sample_strategy='seq_uniform',
+                 sample_strategy='SeqUniform',
                  transform_time_features=False,
                  random_state: Optional[np.random.RandomState] = None) -> None:
@@ -68,11 +68,12 @@ def __init__(self,
         super().__init__(batch_size=batch_size, random_state=random_state)
         self.backcast = backcast
         self.backcast_period = backcast_period
-        if not backcast:
-            self.window_size: int = window_size
-        else:
-            self.window_size: int = backcast_period * n_prediction_steps
+
+        self.n_prediction_steps = n_prediction_steps
+        self.window_size = window_size
+
+        self.window_size = self.adjust_window_size(1)
+        self.sample_interval = 1

         # length of the tail, for instance if a sequence_length = 2, sample_interval =2, n_prediction = 2,
         # the time sequence should look like: [X, y, X, y, y] [test_data](values in tail is marked with X)
@@ -88,13 +89,78 @@ def __init__(self,
         self.freq = "1Y"
         self.time_feature_transform = []
         self.dataset_columns = []
+        self.sampler_train = None

         self.add_fit_requirements(
             [FitRequirement("known_future_features", (Tuple,), user_defined=True, dataset_property=True),
              FitRequirement("feature_shapes", (Dict,), user_defined=True, dataset_property=True),
-             FitRequirement("feature_names", (Tuple, ), user_defined=True, dataset_property=True),
+             FitRequirement("feature_names", (Tuple,), user_defined=True, dataset_property=True),
              FitRequirement("sequence_lengths_train", (List,), user_defined=True, dataset_property=True),
-             FitRequirement("freq", (str, ), user_defined=True, dataset_property=True)])
+             FitRequirement("freq", (str,), user_defined=True, dataset_property=True),
+             FitRequirement("n_prediction_steps", (int,), user_defined=True, dataset_property=True)])
+
+    def adjust_window_size(self, sample_interval: int = 1) -> int:
+        """
+        Adjust the sliding window size according to the given sample_interval and, if backcast is
+        enabled, to the backcast period and the number of prediction steps.
+        Args:
+            sample_interval (int): resolution of the window size
+
+        Returns:
+            window_size (int): window size
+
+        """
+        window_size = self.window_size
+        if self.backcast:
+            window_size = self.backcast_period * self.n_prediction_steps
+
+        if sample_interval > 1:
+            # for lower resolution, window_size should be smaller
+            window_size = (self.window_size - 1) // sample_interval + 1
+        return window_size
+
+    def compute_expected_num_instances_per_seq(self,
+                                               num_instances_dataset: int,
+                                               seq_train_length: np.ndarray,
+                                               min_start: int = 0,
+                                               fraction_seq: float = 1.0,
+                                               fraction_samples_per_seq: float = 1.0,
+                                               ) -> np.ndarray:
+        """
+        Compute the number of expected sample instances within each sequence.
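+
+        Under 'SeqUniform' every training series contributes the same expected number of samples
+        per epoch, whereas under 'LengthUniform' the expectation is proportional to the usable
+        length (sequence length minus min_start) of each series. For example, with
+        num_batches_per_epoch=2 and batch_size=4 (8 samples per epoch) and two series of usable
+        lengths 10 and 30, 'SeqUniform' expects 4 samples from each series while 'LengthUniform'
+        expects 2 and 6 (before masking inactive series and applying fraction_samples_per_seq).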
+ Args: + num_instances_dataset (int): number of all possible instances inside a dataset + seq_train_length (np.ndarray): length of each sequence + min_start (int): minimal number of start + fraction_seq (float): fraction of the sequence that will be sampled during training. + fraction_samples_per_seq (float): fraction of number of samples inside each series + + Returns: + num_instances_per_seqs (np.ndarray): expected number of instances to be sampled inside each sequence + """ + seq_train_length = np.asarray(seq_train_length) + num_instances_epoch = self.num_batches_per_epoch * self.batch_size + # create masks for masking + seq_idx_inactivate = np.random.choice(seq_train_length.size, + int(np.floor(seq_train_length.size * (1 - fraction_seq))), + replace=False) + if len(seq_idx_inactivate) == seq_train_length.size: + # we don't want to make all the sequence inactivate + seq_idx_inactivate = self.random_state.choice(seq_idx_inactivate, len(seq_idx_inactivate) - 1, + replace=False) + + if self.sample_strategy == 'LengthUniform': + available_seq_length = seq_train_length - min_start + available_seq_length = np.where(available_seq_length <= 0, 0, available_seq_length) + num_instances_per_seqs = num_instances_epoch / np.sum(available_seq_length) * available_seq_length + elif self.sample_strategy == 'SeqUniform': + num_seq_train = len(seq_train_length) + num_instances_per_seqs = np.repeat(num_instances_epoch / num_seq_train, num_seq_train) + else: + raise NotImplementedError(f'Unsupported sample strategy: {self.sample_strategy}') + + num_instances_per_seqs[seq_idx_inactivate] = 0 + num_instances_per_seqs *= fraction_samples_per_seq + return num_instances_per_seqs def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: """ @@ -111,22 +177,20 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: # Incorporate the transform to the dataset datamanager: TimeSeriesForecastingDataset = X['backend'].load_datamanager() - - self.n_prediction_steps = datamanager.n_prediction_steps - if self.backcast: - self.window_size = self.backcast_period * self.n_prediction_steps + dataset_properties = X['dataset_properties'] # this value corresponds to budget type resolution sample_interval = X.get('sample_interval', 1) padding_value = X.get('required_padding_value', 0.0) - if sample_interval > 1: - # for lower resolution, window_size should be smaller - self.window_size = (self.window_size - 1) // sample_interval + 1 + self.n_prediction_steps = dataset_properties['n_prediction_steps'] + + self.window_size = self.adjust_window_size(sample_interval) max_lagged_value = max(X['dataset_properties'].get('lagged_value', [np.inf])) max_lagged_value += self.window_size + self.n_prediction_steps + # we want the feature names from the raw dataset self.dataset_columns = datamanager.feature_names known_future_features_index = extract_feature_index( @@ -158,7 +222,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: train=False, ) - datamanager.transform_time_features = self.transform_time_features if X['dataset_properties']["is_small_preprocess"]: # This parameter indicates that the data has been pre-processed for speed # Overwrite the datamanager with the pre-processes data @@ -169,6 +232,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: else: self.dataset_small_preprocess = False + datamanager.transform_time_features = self.transform_time_features + self._is_uni_variant = X['dataset_properties']['uni_variant'] 
self.freq = X['dataset_properties']['freq'] @@ -180,11 +245,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: train_split, test_split = datamanager.splits[X['split_id']] num_instances_dataset = np.size(train_split) - num_instances_train = self.num_batches_per_epoch * self.batch_size # get the length of each sequence of training data (after split), as we know that validation sets are always # place on the tail of the series, the discontinuity only happens if a new series is concated. - # for instance, if we have a train indices is experssed as [0, 1, 2 ,3, 7 ,8 ]. + # for instance, if we have a train indices is expressed as [0, 1, 2 ,3, 7 ,8 ]. # A new sequence must start from the index 7. We could then split each unique values to represent the length # of each split @@ -193,38 +257,17 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: dataset_seq_length_train_all = X['dataset_properties']['sequence_lengths_train'] if np.sum(dataset_seq_length_train_all) == len(train_split): - # this works if we want to fit the entire datasets + # this applies if we want to fit the entire datasets seq_train_length = np.array(dataset_seq_length_train_all) else: _, seq_train_length = np.unique(train_split - np.arange(len(train_split)), return_counts=True) - # create masks for masking - seq_idx_inactivate = np.where(self.random_state.rand(seq_train_length.size) > fraction_seq)[0] - if len(seq_idx_inactivate) == seq_train_length.size: - seq_idx_inactivate = self.random_state.choice(seq_idx_inactivate, len(seq_idx_inactivate) - 1, - replace=False) - # this budget will reduce the number of samples inside each sequence, e.g., the samples becomes more sparse - """ - num_instances_per_seqs = np.ceil( - np.ceil(num_instances_train / (num_instances_dataset - min_start) * seq_train_length) * - fraction_samples_per_seq - ) - """ - if self.sample_strategy == 'LengthUniform': - available_seq_length = seq_train_length - min_start - available_seq_length = np.where(available_seq_length <= 1, 1, available_seq_length) - num_instances_per_seqs = num_instances_train / num_instances_dataset * available_seq_length - elif self.sample_strategy == 'SeqUniform': - num_seq_train = len(seq_train_length) - num_instances_per_seqs = np.repeat(num_instances_train / num_seq_train, num_seq_train) - else: - raise NotImplementedError(f'Unsupported sample strategy: {self.sample_strategy}') - - num_instances_per_seqs[seq_idx_inactivate] = 0 - num_instances_per_seqs *= fraction_samples_per_seq - - # num_instances_per_seqs = num_instances_per_seqs.astype(seq_train_length.dtype) - # at least one element of each sequence should be selected + num_instances_per_seqs = self.compute_expected_num_instances_per_seq(num_instances_dataset, + seq_train_length, + min_start, + fraction_seq, + fraction_samples_per_seq, + ) # TODO consider the case where num_instances_train is greater than num_instances_dataset, # In which case we simply iterate through all the datasets @@ -314,6 +357,7 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd sequence_lengths = [0] * num_sequences for seq_idx, x_seq in enumerate(X): sequence_lengths[seq_idx] = len(x_seq.X) + x_all = pd.DataFrame(np.concatenate([x_seq.X for x_seq in X]), columns=self.dataset_columns) series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) x_all.index = series_number @@ -344,6 +388,7 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd 
x_seq.cache_time_features() x_seq.freq = self.freq + x_seq.is_test_set = True if not self.dataset_small_preprocess: x_seq.update_transform(self.test_transform, train=False) else: diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py index 808d07676..cae364464 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py @@ -1,4 +1,4 @@ -from typing import Optional, Sequence, List, Iterator, Sized +from typing import Optional, Sequence, List, Iterator, Sized, Union import numpy as np @@ -25,7 +25,7 @@ def __getitem__(self, idx: int) -> np.ndarray: def pad_sequence_with_minimal_length(sequences: List[torch.Tensor], - seq_minimal_length: int, + seq_minimal_length: int = 1, seq_max_length: int = np.inf, batch_first=True, padding_value=0.0) -> torch.Tensor: @@ -127,7 +127,7 @@ class TimeSeriesSampler(SubsetRandomSampler): def __init__(self, indices: Sequence[int], seq_lengths: Sequence[int], - num_instances_per_seqs: Optional[List[float]] = None, + num_instances_per_seqs: Optional[Union[List[float], np.ndarray]] = None, min_start: int = 0, generator: Optional[torch.Generator] = None) -> None: """ @@ -154,7 +154,7 @@ def __init__(self, generator: Optional[torch.Generator] pytorch generator to control the randomness """ - super(TimeSeriesSampler, self).__init__(indices, generator) + super().__init__(indices, generator) if num_instances_per_seqs is None: self.iter_all_seqs = True else: diff --git a/test/test_pipeline/components/training/test_time_series_data_loader.py b/test/test_pipeline/components/training/test_time_series_data_loader.py new file mode 100644 index 000000000..6c5d5cc34 --- /dev/null +++ b/test/test_pipeline/components/training/test_time_series_data_loader.py @@ -0,0 +1,411 @@ +from typing import List +import copy +import unittest +from unittest import mock +import numpy as np +import unittest.mock + +import pandas as pd + +from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes, HoldOutFuncs +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence +import torch +import torchvision +from autoPyTorch.pipeline.components.training.data_loader.time_series_util import ( + TestSequenceDataset, + pad_sequence_with_minimal_length, + PadSequenceCollector, + TimeSeriesSampler +) + +from autoPyTorch.utils.common import HyperparameterSearchSpace + +from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import ( + TimeSeriesForecastingDataLoader +) + + +class TestTimeSeriesForecastingDataSets(unittest.TestCase): + def setUp(self) -> None: + rng = np.random.RandomState(0) + feature_names = ['f1'] + feature_shapes = {'f1': 1} + known_future_features = ('f1',) + freq = '1Y' + n_prediction_steps = 3 + + sequence_lengths_train = [10, 20, 30, 40, 50, 60, 70, 80, 90, 1000] + + backend = unittest.mock.Mock() + n_repeats = 2 + + with mock.patch('autoPyTorch.datasets.time_series_dataset.TimeSeriesForecastingDataset') as MockDataSet: + mockdataset = MockDataSet.return_value + mockdataset.holdout_validators = HoldOutFuncs.get_holdout_validators( + HoldoutValTypes.time_series_hold_out_validation + ) + datasets = [] + mockdataset.sequence_lengths_train = sequence_lengths_train + for seq_len in sequence_lengths_train: + mock_ser = mock.MagicMock() + mock_ser.__len__.return_value = seq_len + 
datasets.append(mock_ser) + mockdataset.datasets = datasets + mockdataset.n_prediction_steps = n_prediction_steps + + split = TimeSeriesForecastingDataset.create_holdout_val_split(mockdataset, + HoldoutValTypes.time_series_hold_out_validation, + 0.1, + n_repeats=n_repeats) + X = [] + y = [] + X_test = [] + """ + for i, seq_len in enumerate(sequence_lengths_train): + X.append( + pd.DataFrame(rng.random([seq_len, len(feature_names)]), columns=feature_names, index=[i] * seq_len)) + y.append(pd.DataFrame(rng.random([seq_len, 1]), columns=feature_names, index=[i] * seq_len)) + X_test.append(pd.DataFrame(rng.random([n_prediction_steps, 1]), + columns=feature_names, index=[i] * n_prediction_steps) + ) + + self.datamanager = TimeSeriesForecastingDataset(X, y, X_test=X_test, + n_prediction_steps=n_prediction_steps, + known_future_features=known_future_features, + freq="1Q") + """ + with mock.patch('autoPyTorch.datasets.time_series_dataset.TimeSeriesForecastingDataset') as MockDataSet: + dataset = MockDataSet.return_value + + dataset.__len__.return_value = sum(sequence_lengths_train) + datamanager = unittest.mock.MagicMock() + datamanager.get_dataset.return_value = dataset + datamanager.feature_names = ['f1'] + datamanager.splits.__getitem__.return_value = split + + dataset_properties = dict(feature_names=feature_names, + feature_shapes=feature_shapes, + known_future_features=known_future_features, + freq=freq, + is_small_preprocess=True, + uni_variant=False, + time_feature_transform=True, + sequence_lengths_train=sequence_lengths_train, + n_prediction_steps=n_prediction_steps, + n_repeats=n_repeats) + + self.n_prediction_steps = n_prediction_steps + + backend.load_datamanager.return_value = datamanager + self.fit_dictionary = { + 'dataset_properties': dataset_properties, + 'lagged_value': [1, 2, 3], + 'X_train': pd.DataFrame([0.] * sum(sequence_lengths_train)), + 'y_train': pd.DataFrame([0.] 
* sum(sequence_lengths_train)), + 'train_indices': split[0], + 'test_indices': split[1], + 'working_dir': '/tmp', + 'backend': backend, + 'split_id': 0, + } + + def test_get_set_config_space(self): + """ + Makes sure that the configuration space of the base data loader + is properly working""" + loader = TimeSeriesForecastingDataLoader() + + dataset_properties = {'seq_length_max': 70} + cs = loader.get_hyperparameter_search_space(dataset_properties) + self.assertEqual(cs.get_hyperparameter('window_size').upper, 50) + + dataset_properties = {'seq_length_max': 25} + cs = loader.get_hyperparameter_search_space(dataset_properties) + self.assertEqual(cs.get_hyperparameter('window_size').upper, 25) + self.assertEqual(cs.get_hyperparameter('window_size').default_value, 25) + + dataset_properties = {'seq_length_max': 20} + cs = loader.get_hyperparameter_search_space(dataset_properties) + self.assertEqual(cs.get_hyperparameter('window_size').upper, 20) + self.assertEqual(cs.get_hyperparameter('window_size').lower, 1) + + dataset_properties = {'seq_length_max': 10} + cs = loader.get_hyperparameter_search_space(dataset_properties) + self.assertEqual(cs.get_hyperparameter('window_size').upper, 10) + self.assertEqual(cs.get_hyperparameter('window_size').lower, 1) + + cs = loader.get_hyperparameter_search_space(dataset_properties, + window_size=HyperparameterSearchSpace(hyperparameter='window_size', + value_range=(2, 5), + default_value=3)) + + self.assertEqual(cs.get_hyperparameter('window_size').upper, 5) + self.assertEqual(cs.get_hyperparameter('window_size').lower, 2) + + for _ in range(5): + sample = cs.sample_configuration() + self.assertTrue( + ('backcast_period' in sample) ^ ('window_size' in sample) + ) + + def test_base_fit(self): + """ Makes sure that fit and transform work as intended """ + fit_dictionary = copy.copy(self.fit_dictionary) + + # Mock child classes requirements + loader = TimeSeriesForecastingDataLoader() + loader.build_transform = unittest.mock.Mock() + loader._check_transform_requirements = unittest.mock.Mock() + + loader.fit(fit_dictionary) + + # Fit means that we created the data loaders + self.assertIsInstance(loader.train_data_loader, torch.utils.data.DataLoader) + self.assertIsInstance(loader.val_data_loader, torch.utils.data.DataLoader) + + # Transforms adds this fit dictionaries + transformed_fit_dictionary = loader.transform(fit_dictionary) + self.assertIn('train_data_loader', transformed_fit_dictionary) + self.assertIn('val_data_loader', transformed_fit_dictionary) + + self.assertEqual(transformed_fit_dictionary['train_data_loader'], + loader.train_data_loader) + self.assertEqual(transformed_fit_dictionary['val_data_loader'], + loader.val_data_loader) + self.assertEqual(transformed_fit_dictionary['window_size'], loader.window_size) + + def test_build_transform_small_preprocess_true(self): + """ + Makes sure a proper composition is created + """ + loader = TimeSeriesForecastingDataLoader() + fit_dictionary = copy.deepcopy(self.fit_dictionary) + fit_dictionary['dataset_properties']['is_small_preprocess'] = True + for thing in ['imputer', 'scaler', 'encoder']: + fit_dictionary[thing] = [unittest.mock.Mock()] + + compose = loader.build_transform(fit_dictionary, mode='train') + + self.assertIsInstance(compose, torchvision.transforms.Compose) + + # No preprocessing needed here as it was done before + self.assertEqual(len(compose.transforms), 1) + + def test_build_transform_small_preprocess_false(self): + """ + Makes sure a proper composition is created + """ + loader = 
TimeSeriesForecastingDataLoader() + fit_dictionary = copy.deepcopy(self.fit_dictionary) + fit_dictionary['dataset_properties']['is_small_preprocess'] = False + fit_dictionary['preprocess_transforms'] = [unittest.mock.Mock()] + + compose = loader.build_transform(fit_dictionary, mode='train') + + self.assertIsInstance(compose, torchvision.transforms.Compose) + + # We expect the expand_transformer and Mock + self.assertEqual(len(compose.transforms), 2) + + def test_adjust_window_size(self): + window_size = 2 + n_prediction_steps = 5 + backcast_period = 3 + time_series_dataloader = TimeSeriesForecastingDataLoader(batch_size=1, + window_size=window_size, + n_prediction_steps=n_prediction_steps) + self.assertEqual(time_series_dataloader.window_size, window_size) + + time_series_dataloader = TimeSeriesForecastingDataLoader(batch_size=1, + backcast=True, + backcast_period=backcast_period, + window_size=window_size, + n_prediction_steps=n_prediction_steps) + self.assertEqual(time_series_dataloader.window_size, backcast_period * n_prediction_steps) + + sample_interval = 3 + self.assertEqual(time_series_dataloader.adjust_window_size(sample_interval), + (backcast_period * n_prediction_steps) // sample_interval) + + @mock.patch("autoPyTorch.pipeline.components.training.data_loader.time_series_util.TimeSeriesSampler.__init__", + spec=True) + def test_compute_expected_num_instances_per_seq(self, sampler_mock_init): + sampler_mock_init.return_value = None + batch_size = 5 + window_size = 5 + num_batches_per_epoch = 4 + time_series_dataloader = TimeSeriesForecastingDataLoader(batch_size=batch_size, + window_size=window_size, + num_batches_per_epoch=num_batches_per_epoch) + fit_dictionary = copy.copy(self.fit_dictionary) + time_series_dataloader = time_series_dataloader.fit(fit_dictionary) + + self.assertEqual(time_series_dataloader.window_size, window_size) + self.assertEqual(time_series_dataloader.known_future_features_index, (0,)) + + sampler = time_series_dataloader.sampler_train + self.assertIsInstance(sampler, TimeSeriesSampler) + train_split = fit_dictionary['train_indices'] + self.assertEqual(len(train_split), len(sampler_mock_init.call_args[1]['indices'])) + + train_seq_length = fit_dictionary['dataset_properties']['sequence_lengths_train'] + + seq_lengths = [] + for train_seq_len in train_seq_length: + n_train_seq = len( + HoldOutFuncs.time_series_hold_out_validation( + None, None, + np.arange(train_seq_len), + n_prediction_steps=fit_dictionary['dataset_properties']['n_prediction_steps'], + n_repeats=fit_dictionary['dataset_properties']['n_repeats'])[0]) + if n_train_seq > 0: + seq_lengths.append(n_train_seq) + self.assertTrue(np.all(seq_lengths == sampler_mock_init.call_args[1]['seq_lengths'])) + + num_instances_per_seqs_full = sampler_mock_init.call_args[1]['num_instances_per_seqs'] + unique_num_instances_per_seqs = np.unique(num_instances_per_seqs_full) + self.assertEqual(len(unique_num_instances_per_seqs), 1) + + self.assertAlmostEqual(unique_num_instances_per_seqs.item(), + num_batches_per_epoch * batch_size / len(seq_lengths)) + + self.assertEqual(sampler_mock_init.call_args[1]['min_start'], + fit_dictionary['dataset_properties']['n_prediction_steps']) + + num_instances_dataset = sum(train_seq_length) + seq_train_length = seq_lengths + min_start = fit_dictionary['dataset_properties']['n_prediction_steps'] + + fraction_seq = 0.3 + num_instances_per_seqs_frac_seq = time_series_dataloader.compute_expected_num_instances_per_seq( + num_instances_dataset, + seq_train_length, + min_start, 
fraction_seq) + instances_to_be_sampled = np.where(num_instances_per_seqs_frac_seq)[0] + self.assertEqual(len(instances_to_be_sampled), int(np.ceil(fraction_seq * len(seq_train_length)))) + self.assertAlmostEqual(np.unique(num_instances_per_seqs_frac_seq[instances_to_be_sampled]), + unique_num_instances_per_seqs) + + fraction_samples_per_seq = 0.3 + num_instances_per_seqs_frac_per_seq = time_series_dataloader.compute_expected_num_instances_per_seq( + num_instances_dataset, + seq_train_length, + min_start, + fraction_samples_per_seq=fraction_samples_per_seq) + self.assertTrue(np.allclose(num_instances_per_seqs_frac_per_seq, + fraction_samples_per_seq * num_instances_per_seqs_full)) + + time_series_dataloader.sample_strategy = 'LengthUniform' + + seq_lengths_reduced = np.asarray(seq_lengths) - min_start + seq_lengths_reduced = np.where(seq_lengths_reduced <= 0, 0, seq_lengths_reduced) + + num_instances_per_seqs_full = time_series_dataloader.compute_expected_num_instances_per_seq( + num_instances_dataset, + seq_train_length, + min_start) + + self.assertTrue( + np.allclose(num_instances_per_seqs_full, + batch_size * num_batches_per_epoch * seq_lengths_reduced / np.sum(seq_lengths_reduced)) + ) + + fraction_seq = 0.3 + num_instances_per_seqs_frac_seq = time_series_dataloader.compute_expected_num_instances_per_seq( + num_instances_dataset, + seq_train_length, + min_start, fraction_seq) + instances_to_be_sampled = np.where(num_instances_per_seqs_frac_seq)[0] + + self.assertTrue(np.allclose(np.unique(num_instances_per_seqs_frac_seq[instances_to_be_sampled]), + num_instances_per_seqs_full[instances_to_be_sampled])) + + fraction_samples_per_seq = 0.3 + num_instances_per_seqs_frac_per_seq = time_series_dataloader.compute_expected_num_instances_per_seq( + num_instances_dataset, + seq_train_length, + min_start, + fraction_samples_per_seq=fraction_samples_per_seq) + self.assertTrue(np.allclose(num_instances_per_seqs_frac_per_seq, + fraction_samples_per_seq * num_instances_per_seqs_full)) + + @mock.patch("autoPyTorch.pipeline.components.training.data_loader.time_series_util.TestSequenceDataset.__init__", + spec=True) + def test_get_loader(self, loader_init_mock): + loader_init_mock.return_value = None + batch_size = 5 + window_size = 5 + num_batches_per_epoch = 4 + time_series_dataloader = TimeSeriesForecastingDataLoader(batch_size=batch_size, + window_size=window_size, + num_batches_per_epoch=num_batches_per_epoch) + fit_dictionary = copy.copy(self.fit_dictionary) + time_series_dataloader.fit(fit_dictionary) + x_test = TimeSeriesSequence(X=np.array([1, 2, 3, 4, 5]), + Y=np.array([1, 2, 3, 4, 5]), + X_test=np.array([1, 2, 3])) + test_loader = time_series_dataloader.get_loader(X=x_test) + self.assertIsInstance(test_loader, torch.utils.data.DataLoader) + self.assertIsInstance(test_loader.dataset, TestSequenceDataset) + test_set = loader_init_mock.call_args[0][0] + self.assertIsInstance(test_set, List) + self.assertEqual(len(test_set), 1) + + x_test = [x_test, x_test] + _ = time_series_dataloader.get_loader(X=x_test) + test_set = loader_init_mock.call_args[0][0] + self.assertEqual(len(test_set), len(x_test)) + + for seq in test_set: + self.assertIsInstance(seq, TimeSeriesSequence) + self.assertTrue(seq.is_test_set) + self.assertEqual(seq.freq, time_series_dataloader.freq) + + +class TestTimeSeriesUtil(unittest.TestCase): + def test_test_seq_length(self): + x_test = TimeSeriesSequence(X=np.array([0, 1, 2, 3, 4]), + Y=np.array([1, 2, 3, 4, 5]), + X_test=np.array([1, 2, 3]), + n_prediction_steps=3, + 
is_test_set=True) + x_test = [x_test, x_test] + test_set = TestSequenceDataset(x_test) + self.assertEqual(len(test_set), len(x_test)) + self.assertTrue(np.allclose(test_set[0][0]['past_targets'].numpy(), x_test[0].Y)) + + def test_pad_sequence_with_minimal_length(self): + sequences = [torch.ones([10, 1]), + torch.ones([3, 1]), + torch.ones([17, 1])] + pad_seq_1 = pad_sequence_with_minimal_length(sequences, 5) + self.assertEqual(list(pad_seq_1.shape), [3, 17, 1]) + self.assertTrue(torch.all(pad_seq_1[0] == torch.tensor([0.]*7 + [1.] * 10).unsqueeze(-1))) + + pad_seq_2 = pad_sequence_with_minimal_length(sequences, 5, batch_first=False) + self.assertEqual(list(pad_seq_2.shape), [17, 3, 1]) + self.assertTrue(torch.all(pad_seq_2[:, 0] == torch.tensor([0.]*7 + [1.] * 10).unsqueeze(-1))) + + pad_seq_3 = pad_sequence_with_minimal_length(sequences, 5, padding_value=0.5) + self.assertTrue(torch.all(pad_seq_3[0] == torch.tensor([0.5]*7 + [1.] * 10).unsqueeze(-1))) + + pad_seq_4 = pad_sequence_with_minimal_length(sequences, 5, 10) + self.assertEqual(list(pad_seq_4.shape), [3, 10, 1]) + self.assertTrue(torch.all(pad_seq_4[0] == torch.ones(10).unsqueeze(-1))) + self.assertTrue(torch.all(pad_seq_4[1] == torch.tensor([0]*7 + [1.] * 3).unsqueeze(-1))) + self.assertTrue(torch.all(pad_seq_4[2] == torch.ones(10).unsqueeze(-1))) + + pad_seq_5 = pad_sequence_with_minimal_length(sequences, 20) + self.assertEqual(list(pad_seq_5.shape), [3, 20, 1]) + self.assertTrue(torch.all(pad_seq_5[0] == torch.tensor([0]*10 + [1.] * 10).unsqueeze(-1))) + self.assertTrue(torch.all(pad_seq_5[1] == torch.tensor([0]*17 + [1.] * 3).unsqueeze(-1))) + self.assertTrue(torch.all(pad_seq_5[2] == torch.tensor([0]*3 + [1.] * 17).unsqueeze(-1))) + + sequences = [torch.ones(3, dtype=torch.bool), + torch.ones(15, dtype=torch.bool)] + pad_seq_6 = pad_sequence_with_minimal_length(sequences, 5) + self.assertTrue(pad_seq_6.dtype == torch.bool) + self.assertTrue(torch.all(pad_seq_6[0] == torch.tensor([False] * 12 + [True] * 3, dtype=torch.bool))) + + + From 101ddbc4f74be7d4407dcfd930b279343b003e7d Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 13 May 2022 15:33:08 +0200 Subject: [PATCH 258/347] tests for dataloader --- .../time_series_forecasting_data_loader.py | 1 + .../training/data_loader/time_series_util.py | 18 +-- .../training/test_time_series_data_loader.py | 115 ++++++++++++++---- 3 files changed, 99 insertions(+), 35 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index cef9dc138..b6a46cc6a 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -289,6 +289,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: sampler=self.sampler_train, ) + # validation set is not so important here, we make the size of validation set to be 20% of training instances num_samples_val = int(np.sum(num_instances_per_seqs)) // 5 if num_samples_val > len(val_dataset): sampler_val = None diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py index cae364464..1a36460f0 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py +++ 
b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py @@ -49,6 +49,7 @@ def pad_sequence_with_minimal_length(sequences: List[torch.Tensor], out_tensor = sequences[0].new_full(out_dims, False) else: out_tensor = sequences[0].new_full(out_dims, padding_value) + for i, tensor in enumerate(sequences): length = min(tensor.size(0), seq_max_length) # use index notation to prevent duplicate references to the tensor @@ -110,12 +111,13 @@ def __call__(self, batch, sample_interval=1, seq_minimal_length=1, padding_value return batch elif isinstance(elem, collections.abc.Mapping): # only past targets and features needs to be transformed - return { - key: self([d[key] for d in batch]) if "past" not in key else self([d[key] for d in batch], - self.sample_interval, - self.window_size, - self.target_padding_value) for key + key: self([d[key] for d in batch]) if "past" not in key else self( + [d[key] for d in batch], + self.sample_interval, + self.window_size, + self.target_padding_value if "targets" in key else 0.0 + ) for key in elem} elif elem is None: @@ -174,7 +176,7 @@ def __init__(self, idx_start = idx_tracker num_interval = int(np.ceil(num_instances)) - if num_interval > idx_end - idx_start or num_interval == 0: + if num_interval > idx_end - idx_start: interval = np.linspace(idx_start, idx_end, 2, endpoint=True, dtype=np.int) # we consider num_expected_ins_decimal.append(num_instances) @@ -190,7 +192,7 @@ def __init__(self, idx_tracker += seq_length num_expected_ins_decimal = np.stack(num_expected_ins_decimal) - # seq_intervals_decimal_length = np.stack(seq_intervals_decimal_length) + self.seq_lengths = seq_lengths self.seq_lengths_sum = np.sum(seq_lengths) self.num_instances = int(np.round(np.sum(num_instances_per_seqs))) @@ -255,7 +257,7 @@ def __init__(self, data_source: Sized, num_samples: int, generator: Optional[tor def __iter__(self) -> Iterator[int]: if self.eval_all_sequences: - return super(SequentialSubSetSampler, self).__iter__() + yield from super(SequentialSubSetSampler, self).__iter__() else: yield from torch.randperm(len(self.data_source), generator=self.generator)[:self.num_samples] diff --git a/test/test_pipeline/components/training/test_time_series_data_loader.py b/test/test_pipeline/components/training/test_time_series_data_loader.py index 6c5d5cc34..2ec96d144 100644 --- a/test/test_pipeline/components/training/test_time_series_data_loader.py +++ b/test/test_pipeline/components/training/test_time_series_data_loader.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List import copy import unittest from unittest import mock @@ -15,7 +15,8 @@ TestSequenceDataset, pad_sequence_with_minimal_length, PadSequenceCollector, - TimeSeriesSampler + TimeSeriesSampler, + SequentialSubSetSampler ) from autoPyTorch.utils.common import HyperparameterSearchSpace @@ -27,7 +28,6 @@ class TestTimeSeriesForecastingDataSets(unittest.TestCase): def setUp(self) -> None: - rng = np.random.RandomState(0) feature_names = ['f1'] feature_shapes = {'f1': 1} known_future_features = ('f1',) @@ -57,23 +57,7 @@ def setUp(self) -> None: HoldoutValTypes.time_series_hold_out_validation, 0.1, n_repeats=n_repeats) - X = [] - y = [] - X_test = [] - """ - for i, seq_len in enumerate(sequence_lengths_train): - X.append( - pd.DataFrame(rng.random([seq_len, len(feature_names)]), columns=feature_names, index=[i] * seq_len)) - y.append(pd.DataFrame(rng.random([seq_len, 1]), columns=feature_names, index=[i] * seq_len)) - X_test.append(pd.DataFrame(rng.random([n_prediction_steps, 1]), - 
columns=feature_names, index=[i] * n_prediction_steps) - ) - - self.datamanager = TimeSeriesForecastingDataset(X, y, X_test=X_test, - n_prediction_steps=n_prediction_steps, - known_future_features=known_future_features, - freq="1Q") - """ + with mock.patch('autoPyTorch.datasets.time_series_dataset.TimeSeriesForecastingDataset') as MockDataSet: dataset = MockDataSet.return_value @@ -380,26 +364,26 @@ def test_pad_sequence_with_minimal_length(self): torch.ones([17, 1])] pad_seq_1 = pad_sequence_with_minimal_length(sequences, 5) self.assertEqual(list(pad_seq_1.shape), [3, 17, 1]) - self.assertTrue(torch.all(pad_seq_1[0] == torch.tensor([0.]*7 + [1.] * 10).unsqueeze(-1))) + self.assertTrue(torch.all(pad_seq_1[0] == torch.tensor([0.] * 7 + [1.] * 10).unsqueeze(-1))) pad_seq_2 = pad_sequence_with_minimal_length(sequences, 5, batch_first=False) self.assertEqual(list(pad_seq_2.shape), [17, 3, 1]) - self.assertTrue(torch.all(pad_seq_2[:, 0] == torch.tensor([0.]*7 + [1.] * 10).unsqueeze(-1))) + self.assertTrue(torch.all(pad_seq_2[:, 0] == torch.tensor([0.] * 7 + [1.] * 10).unsqueeze(-1))) pad_seq_3 = pad_sequence_with_minimal_length(sequences, 5, padding_value=0.5) - self.assertTrue(torch.all(pad_seq_3[0] == torch.tensor([0.5]*7 + [1.] * 10).unsqueeze(-1))) + self.assertTrue(torch.all(pad_seq_3[0] == torch.tensor([0.5] * 7 + [1.] * 10).unsqueeze(-1))) pad_seq_4 = pad_sequence_with_minimal_length(sequences, 5, 10) self.assertEqual(list(pad_seq_4.shape), [3, 10, 1]) self.assertTrue(torch.all(pad_seq_4[0] == torch.ones(10).unsqueeze(-1))) - self.assertTrue(torch.all(pad_seq_4[1] == torch.tensor([0]*7 + [1.] * 3).unsqueeze(-1))) + self.assertTrue(torch.all(pad_seq_4[1] == torch.tensor([0] * 7 + [1.] * 3).unsqueeze(-1))) self.assertTrue(torch.all(pad_seq_4[2] == torch.ones(10).unsqueeze(-1))) pad_seq_5 = pad_sequence_with_minimal_length(sequences, 20) self.assertEqual(list(pad_seq_5.shape), [3, 20, 1]) - self.assertTrue(torch.all(pad_seq_5[0] == torch.tensor([0]*10 + [1.] * 10).unsqueeze(-1))) - self.assertTrue(torch.all(pad_seq_5[1] == torch.tensor([0]*17 + [1.] * 3).unsqueeze(-1))) - self.assertTrue(torch.all(pad_seq_5[2] == torch.tensor([0]*3 + [1.] * 17).unsqueeze(-1))) + self.assertTrue(torch.all(pad_seq_5[0] == torch.tensor([0] * 10 + [1.] * 10).unsqueeze(-1))) + self.assertTrue(torch.all(pad_seq_5[1] == torch.tensor([0] * 17 + [1.] * 3).unsqueeze(-1))) + self.assertTrue(torch.all(pad_seq_5[2] == torch.tensor([0] * 3 + [1.] 
* 17).unsqueeze(-1))) sequences = [torch.ones(3, dtype=torch.bool), torch.ones(15, dtype=torch.bool)] @@ -407,5 +391,82 @@ def test_pad_sequence_with_minimal_length(self): self.assertTrue(pad_seq_6.dtype == torch.bool) self.assertTrue(torch.all(pad_seq_6[0] == torch.tensor([False] * 12 + [True] * 3, dtype=torch.bool))) + def test_pad_sequence_controller(self): + window_size = 3 + seq_max_length = 5 + target_padding_value = 0.5 + pad_seq_controller = PadSequenceCollector(window_size=window_size, + sample_interval=1, + target_padding_value=target_padding_value, + seq_max_length=seq_max_length) + n_prediction_steps = 2 + seq = TimeSeriesSequence(np.arange(10).astype(np.float), np.arange(10).astype(np.float), + n_prediction_steps=n_prediction_steps) + features_padded = pad_seq_controller([seq[0][0], seq[-1][0]]) + past_targets = features_padded['past_targets'] + past_features = features_padded['past_features'] + self.assertEqual(list(past_targets.shape), [2, seq_max_length]) + self.assertEqual(list(past_features.shape), [2, seq_max_length, 1]) + self.assertTrue(features_padded['past_observed_targets'].dtype == torch.bool) + self.assertTrue(features_padded['decoder_lengths'].dtype == torch.int64) + + self.assertTrue(torch.all(torch.ones(seq_max_length - 1) * target_padding_value == past_targets[0, :-1])) + self.assertTrue(torch.all(torch.zeros(seq_max_length - 1) == past_features[0, :-1])) + + targets_padded = pad_seq_controller([seq[0][1], seq[-1][1]]) + self.assertTrue(list(targets_padded['future_targets']), [2, n_prediction_steps]) + + features_padded = pad_seq_controller([seq[0][0], seq[0][0]]) + self.assertEqual(list(features_padded['past_targets'].shape), [2, window_size]) + + pad_seq_controller.sample_interval = 2 + features_padded = pad_seq_controller([seq[0][0], seq[-1][0]]) + self.assertEqual(list(features_padded['past_targets'].shape), [2, 3]) + + self.assertTrue(torch.all( + pad_seq_controller([{'x': 0}, {'x': 1}])['x'] == torch.Tensor([0, 1])) + ) + self.assertTrue(torch.all( + pad_seq_controller([{'x': np.array(0)}, {'x': np.array(1)}])['x'] == torch.Tensor([0, 1])) + ) + def test_time_series_sampler(self): + indices = np.arange(100) + seq_lengths = [5, 10, 15, 20, 50] + num_instances_per_seqs = [3.3, 1.3, 7.5, 10, 20.1] + + sampler = TimeSeriesSampler(indices, seq_lengths, num_instances_per_seqs, min_start=2) + self.assertEqual(sampler.num_instances, int(np.round(np.sum(num_instances_per_seqs)))) + # The first sequence does not contain enough data to allow 3.3 sequences, so it only has 1 interval + # For the others, Interval should be np.floor(n_inst) + 1 (resulting in np.floor(n_inst) intervals) + self.assertEqual(list(map(len, sampler.seq_intervals_int)), [1, 2, 8, 10, 21]) + self.assertTrue(torch.equal(sampler.seq_intervals_decimal, torch.tensor([[2, 5], + [7, 11], + [17, 18], + [32, 33], + [52, 54]]))) + self.assertTrue( + torch.allclose(sampler.num_expected_ins_decimal, + torch.Tensor( + [3.3000e+00, 3.0000e-01, 5.0000e-01, 1.0000e-08, 1.0000e-01]).type(torch.float64)) + ) + for i in range(5): + samples = torch.stack(list(sampler)).sort()[0].numpy() + for seq_intervals_int in sampler.seq_intervals_int: + if len(seq_intervals_int) > 1: + for i in range(len(seq_intervals_int) - 1): + self.assertTrue( + len(np.where((seq_intervals_int[i] < samples) & (samples < seq_intervals_int[i + 1]))) == 1 + ) + + def test_sequential_sub_set_sampler(self): + n_samples = 5 + n_indices = np.arange(100) + sampler = SequentialSubSetSampler(n_indices, n_samples) + self.assertEqual(len(sampler), 
n_samples) + self.assertEqual(len(list(sampler)), n_samples) + + sampler = SequentialSubSetSampler(n_indices, 150) + self.assertEqual(len(sampler), len(n_indices)) + self.assertEqual(len(list(sampler)), len(n_indices)) From 73180869e9eb0e94fe5a27a6365b74ef88959b4a Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 13 May 2022 15:38:16 +0200 Subject: [PATCH 259/347] maint --- .../setup/network/forecasting_network.py | 3 +-- autoPyTorch/pipeline/time_series_forecasting.py | 15 +-------------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 3f6008bc4..6b984615c 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -129,8 +129,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: return self - def predict(self, loader: torch.utils.data.DataLoader, - target_scaler: Optional[BaseTargetScaler] = None) -> torch.Tensor: + def predict(self, loader: torch.utils.data.DataLoader) -> torch.Tensor: """ Performs batched prediction given a loader object """ diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index af9bbe081..c368c2877 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -9,7 +9,6 @@ import pandas as pd from sklearn.base import RegressorMixin -from sklearn.pipeline import Pipeline import torch @@ -44,9 +43,6 @@ from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import ( TargetScalerChoice ) -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetNoScaler import ( - TargetNoScaler -) from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice from autoPyTorch.pipeline.components.setup.forecasting_training_loss import ForecastingLossChoices from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import ( @@ -94,8 +90,6 @@ def __init__(self, config, steps, dataset_properties, include, exclude, random_state, init_params, search_space_updates) - self.target_scaler = None - # Because a pipeline is passed to a worker, we need to honor the random seed # in this context. 
A tabular regression pipeline will implement a torch # model, so we comply with https://pytorch.org/docs/stable/notes/randomness.html @@ -112,7 +106,6 @@ def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None) Returns: np.ndarray: coefficient of determination R^2 of the prediction """ - # TODO adjust to sktime's losses from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics, calculate_score metrics = get_metrics(self.dataset_properties, ['r2']) y_pred = self.predict(X, batch_size=batch_size) @@ -120,12 +113,6 @@ def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None) metrics=metrics)['r2'] return r2 - def fit(self, X: Dict[str, Any], y: Optional[np.ndarray] = None, - **fit_params: Any) -> Pipeline: - super().fit(X, y, **fit_params) - self.target_scaler = X.get('target_scaler', TargetNoScaler(self.random_state).fit(X)) - return self - def _get_hyperparameter_search_space(self, dataset_properties: Dict[str, Any], include: Optional[Dict[str, Any]] = None, @@ -431,7 +418,7 @@ def predict(self, loader = self.named_steps['data_loader'].get_loader(X=X, batch_size=batch_size) try: - return self.named_steps['network'].predict(loader, self.target_scaler).flatten() + return self.named_steps['network'].predict(loader).flatten() except Exception as e: # https://github.com/pytorch/fairseq/blob/50a671f78d0c8de0392f924180db72ac9b41b801/fairseq/trainer.py#L283 if 'out of memory' in str(e): From cf2c982272268a64cb8a6bcf386d565a12fd2286 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 13 May 2022 19:42:28 +0200 Subject: [PATCH 260/347] test for target scaling 1 --- .../TargetStandardScaler.py | 2 +- .../base_target_scaler.py | 4 +- .../components/setup/forecasting/__init__.py | 0 .../test_forecasting_target_scaling.py | 136 ++++++++++++++++++ 4 files changed, 139 insertions(+), 3 deletions(-) create mode 100644 test/test_pipeline/components/setup/forecasting/__init__.py create mode 100644 test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py index f077bc730..0dfe15f10 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py @@ -13,4 +13,4 @@ def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[ return { 'shortname': 'TargetStandardScaler', 'name': 'TargetStandardScaler' - } \ No newline at end of file + } diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py index df191182c..27849f994 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py @@ -52,7 +52,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: def __call__(self, past_target: Union[np.ndarray, torch.tensor], past_observed_values: Optional[torch.BoolTensor] = None, - future_targets: Optional[Union[np.ndarray, torch.Tensor]]=None, + future_targets: Optional[Union[np.ndarray, torch.Tensor]] = None, ) -> Union[np.ndarray, torch.tensor]: if self.scaler is None: @@ -69,7 +69,7 @@ def __call__(self, @staticmethod def 
get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None ) -> ConfigurationSpace: cs = ConfigurationSpace() return cs diff --git a/test/test_pipeline/components/setup/forecasting/__init__.py b/test/test_pipeline/components/setup/forecasting/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py b/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py new file mode 100644 index 000000000..f3439f670 --- /dev/null +++ b/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py @@ -0,0 +1,136 @@ +import torch + +import copy +import unittest +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import TargetScalerChoice +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetNoScaler import TargetNoScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetMaxAbsScaler import TargetMaxAbsScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetMeanAbsScaler import TargetMeanAbsScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetMinMaxScaler import TargetMinMaxScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetStandardScaler import TargetStandardScaler + + +class TestTargetScalar(unittest.TestCase): + def test_get_set_config_space(self): + """Make sure that we can setup a valid choice in the encoder + choice""" + rescaler_choice = TargetScalerChoice({}) + cs = rescaler_choice.get_hyperparameter_search_space() + + # Make sure that all hyperparameters are part of the search space + self.assertListEqual( + sorted(cs.get_hyperparameter('__choice__').choices), + sorted(list(rescaler_choice.get_components().keys())) + ) + + # Make sure we can properly set some random configs + # Whereas just one iteration will make sure the algorithm works, + # doing five iterations increase the confidence. 
We will be able to + # catch component specific crashes + for i in range(5): + config = cs.sample_configuration() + config_dict = copy.deepcopy(config.get_dictionary()) + rescaler_choice.set_hyperparameters(config) + + self.assertEqual(rescaler_choice.choice.__class__, + rescaler_choice.get_components()[config_dict['__choice__']]) + + # Then check the choice configuration + selected_choice = config_dict.pop('__choice__', None) + for key, value in config_dict.items(): + # Remove the selected_choice string from the parameter + # so we can query in the object for it + key = key.replace(selected_choice + ':', '') + self.assertIn(key, vars(rescaler_choice.choice)) + self.assertEqual(value, rescaler_choice.choice.__dict__[key]) + + include = ['TargetMeanAbsScaler', 'TargetMaxAbsScaler'] + cs = rescaler_choice.get_hyperparameter_search_space(include=include) + self.assertTrue( + sorted(cs.get_hyperparameter('__choice__').choices), + sorted(include), + ) + + def test_target_no_scalar(self): + X = {'dataset_properties': {}} + scalar = TargetNoScaler() + scalar = scalar.fit(X) + X = scalar.transform(X) + self.assertIsInstance(X['target_scaler'], BaseTargetScaler) + + past_targets = torch.rand([5, 6, 7]) + future_targets = torch.rand(([5, 3, 7])) + + transformed_past_target, transformed_future_targets, loc, scale = scalar(past_targets, + future_targets=future_targets) + self.assertTrue(torch.equal(past_targets, transformed_past_target)) + self.assertTrue(torch.equal(future_targets, transformed_future_targets)) + self.assertIsNone(loc) + self.assertIsNone(scale) + + _, transformed_future_targets, _, _ = scalar(past_targets) + self.assertIsNone(transformed_future_targets) + + def test_target_max_abs_scalar(self): + X = {'dataset_properties': {}} + scalar = TargetMaxAbsScaler() + scalar = scalar.fit(X) + X = scalar.transform(X) + self.assertIsInstance(X['target_scaler'], BaseTargetScaler) + + past_targets = torch.vstack( + [ + torch.zeros(10), + torch.Tensor([0.] * 2 + [2.] * 8), + torch.ones(10) * 4 + ] + ).unsqueeze(-1) + past_observed_values = torch.vstack( + [ + torch.Tensor([False] * 3 + [True] * 7), + torch.Tensor([False] * 2 + [True] * 8), + torch.Tensor([True] * 10) + + ]).unsqueeze(-1).bool() + future_targets = torch.ones([3, 10, 1]) * 10 + + transformed_past_target, transformed_future_targets, loc, scale = scalar( + past_targets, past_observed_values=past_observed_values, future_targets=future_targets + ) + + self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1]))) + self.assertTrue(torch.equal(transformed_past_target[1], torch.Tensor([0.] * 2 + [1.] * 8).unsqueeze(-1))) + self.assertTrue(torch.equal(transformed_past_target[2], torch.ones([10, 1]))) + + self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10)) + self.assertTrue(torch.equal(transformed_future_targets[1], torch.ones([10, 1]) * 5)) + self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 2.5)) + + self.assertTrue( + torch.equal(scale, torch.Tensor([1., 2., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]])) + ) + + transformed_past_target, transformed_future_targets, loc, scale = scalar(past_targets, + future_targets=future_targets) + self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1]))) + self.assertTrue(torch.equal(transformed_past_target[1], torch.Tensor([0.] * 2 + [1.] 
* 8).unsqueeze(-1))) + self.assertTrue(torch.equal(transformed_past_target[2], torch.ones([10, 1]))) + + self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10)) + self.assertTrue(torch.equal(transformed_future_targets[1], torch.ones([10, 1]) * 5)) + self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 2.5)) + + self.assertTrue( + torch.equal(scale, torch.Tensor([1., 2., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]])) + ) + self.assertIsNone(loc) + + transformed_past_target_full, transformed_future_targets_full, loc_full, scale_full = scalar( + past_targets, past_observed_values=torch.ones([2, 10, 1], dtype=torch.bool), future_targets=future_targets + ) + self.assertTrue(torch.equal(transformed_past_target, transformed_past_target_full)) + self.assertTrue(torch.equal(transformed_future_targets_full, transformed_future_targets_full)) + self.assertTrue(torch.equal(scale, scale_full)) + + self.assertIsNone(loc_full) From 8b7ef6100e38408684bcb0692bf08cebd6712d13 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 15 May 2022 16:36:53 +0200 Subject: [PATCH 261/347] test for target scaer --- .../setup/forecasting_target_scaling/utils.py | 17 +- .../test_forecasting_target_scaling.py | 239 +++++++++++++++++- 2 files changed, 242 insertions(+), 14 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py index 5649c2817..bd59a9127 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py @@ -40,7 +40,7 @@ def transform(self, max_ = torch.max(past_targets, dim=1, keepdim=True)[0] diff_ = max_ - min_ - loc = min_ - 1e-10 + loc = min_ scale = torch.where(diff_ == 0, past_targets[:, [-1]], diff_) scale[scale == 0.0] = 1.0 if future_targets is not None: @@ -98,13 +98,15 @@ def transform(self, max_ = torch.max(max_masked_past_targets, dim=1, keepdim=True)[0] diff_ = max_ - min_ - loc = min_ - 1e-10 + loc = min_ scale = torch.where(diff_ == 0, past_targets[:, [-1]], diff_) scale[scale == 0.0] = 1.0 if future_targets is not None: future_targets = (future_targets - loc) / scale - return (past_targets - loc) / scale, future_targets, loc, scale + scaled_past_targets = torch.where(past_observed_values, (past_targets - loc) / scale, past_targets) + + return scaled_past_targets, future_targets, loc, scale elif self.mode == "max_abs": max_abs_ = torch.max(torch.abs(valid_past_targets), dim=1, keepdim=True)[0] @@ -112,7 +114,10 @@ def transform(self, scale = max_abs_ if future_targets is not None: future_targets = future_targets / scale - return past_targets / scale, future_targets, None, scale + + scaled_past_targets = torch.where(past_observed_values, past_targets / scale, past_targets) + + return scaled_past_targets, future_targets, None, scale elif self.mode == 'mean_abs': mean_abs = torch.sum(torch.abs(valid_past_targets), dim=1, keepdim=True) / valid_past_obs @@ -121,7 +126,9 @@ def transform(self, scale[scale == 0.0] = 1.0 if future_targets is not None: future_targets = future_targets / scale - return past_targets / scale, future_targets, None, scale + + scaled_past_targets = torch.where(past_observed_values, past_targets / scale, past_targets) + return scaled_past_targets, future_targets, None, scale elif self.mode == "none": return past_targets, future_targets, None, None diff --git 
a/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py b/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py index f3439f670..898891d00 100644 --- a/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py +++ b/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py @@ -62,8 +62,10 @@ def test_target_no_scalar(self): past_targets = torch.rand([5, 6, 7]) future_targets = torch.rand(([5, 3, 7])) - transformed_past_target, transformed_future_targets, loc, scale = scalar(past_targets, - future_targets=future_targets) + past_observed_values = torch.rand([5, 6, 7]) > 0.5 + + transformed_past_target, transformed_future_targets, loc, scale = scalar( + past_targets, past_observed_values=past_observed_values, future_targets=future_targets) self.assertTrue(torch.equal(past_targets, transformed_past_target)) self.assertTrue(torch.equal(future_targets, transformed_future_targets)) self.assertIsNone(loc) @@ -72,6 +74,223 @@ def test_target_no_scalar(self): _, transformed_future_targets, _, _ = scalar(past_targets) self.assertIsNone(transformed_future_targets) + def test_target_mean_abs_scalar(self): + X = {'dataset_properties': {}} + scalar = TargetMeanAbsScaler() + scalar = scalar.fit(X) + X = scalar.transform(X) + self.assertIsInstance(X['target_scaler'], BaseTargetScaler) + + past_targets = torch.vstack( + [ + torch.zeros(10), + torch.Tensor([0.] * 2 + [1.] * 5 + [2.] * 3), + torch.ones(10) * 4 + ] + ).unsqueeze(-1) + past_observed_values = torch.vstack( + [ + torch.Tensor([False] * 3 + [True] * 7), + torch.Tensor([False] * 2 + [True] * 8), + torch.Tensor([True] * 10) + + ]).unsqueeze(-1).bool() + future_targets = torch.ones([3, 10, 1]) * 10 + + transformed_past_target, transformed_future_targets, loc, scale = scalar( + past_targets, past_observed_values=past_observed_values, future_targets=future_targets + ) + + self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1]))) + self.assertTrue(torch.allclose(transformed_past_target[1], + torch.Tensor([0.] * 2 + [8. / 11.] * 5 + [16. / 11.] * 3).unsqueeze(-1))) + self.assertTrue(torch.equal(transformed_past_target[2], torch.ones([10, 1]))) + + self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10)) + self.assertTrue(torch.equal(transformed_future_targets[1], torch.ones([10, 1]) * 80. / 11.)) + self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 2.5)) + + self.assertTrue( + torch.equal(scale, torch.Tensor([1., 11. / 8., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]])) + ) + self.assertIsNone(loc) + + transformed_past_target, transformed_future_targets, loc, scale = scalar(past_targets, + future_targets=future_targets) + + self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1]))) + self.assertTrue(torch.allclose(transformed_past_target[1], + torch.Tensor([0.] * 2 + [10. / 11.] * 5 + [20. / 11.] * 3).unsqueeze(-1))) + self.assertTrue(torch.equal(transformed_past_target[2], torch.ones([10, 1]))) + + self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10)) + self.assertTrue(torch.equal(transformed_future_targets[1], torch.ones([10, 1]) * 100. 
/ 11)) + self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 2.5)) + + self.assertTrue( + torch.equal(scale, torch.Tensor([1., 1.1, 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]])) + ) + + transformed_past_target_full, transformed_future_targets_full, loc_full, scale_full = scalar( + past_targets, past_observed_values=torch.ones([2, 10, 1], dtype=torch.bool), future_targets=future_targets + ) + + self.assertTrue(torch.equal(transformed_past_target, transformed_past_target_full)) + self.assertTrue(torch.equal(transformed_future_targets_full, transformed_future_targets_full)) + self.assertTrue(torch.equal(scale, scale_full)) + + self.assertIsNone(loc_full) + + def test_target_standard_scalar(self): + X = {'dataset_properties': {}} + scalar = TargetStandardScaler() + scalar = scalar.fit(X) + X = scalar.transform(X) + self.assertIsInstance(X['target_scaler'], BaseTargetScaler) + + past_targets = torch.vstack( + [ + torch.zeros(10), + torch.Tensor([0.] * 2 + [1.] * 5 + [2.] * 3), + torch.ones(10) * 4 + ] + ).unsqueeze(-1) + past_observed_values = torch.vstack( + [ + torch.Tensor([False] * 3 + [True] * 7), + torch.Tensor([False] * 2 + [True] * 8), + torch.Tensor([True] * 10) + + ]).unsqueeze(-1).bool() + future_targets = torch.ones([3, 10, 1]) * 10 + + transformed_past_target, transformed_future_targets, loc, scale = scalar( + past_targets, past_observed_values=past_observed_values, future_targets=future_targets + ) + + self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1]))) + self.assertTrue(torch.allclose(transformed_past_target[1], + torch.Tensor([0.] * 2 + [-0.7246] * 5 + [1.2076] * 3).unsqueeze(-1), atol=1e-4)) + self.assertTrue(torch.equal(transformed_past_target[2], torch.zeros([10, 1]))) + + self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10)) + self.assertTrue(torch.allclose(transformed_future_targets[1], torch.ones([10, 1]) * 16.6651)) + self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 6.0000)) + + self.assertTrue( + torch.allclose(loc, + torch.Tensor([0., 11. 
/ 8., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]])) + ) + + self.assertTrue( + torch.allclose(scale, + torch.Tensor([1., 0.5175, 1.]).reshape([len(past_targets), 1, past_targets.shape[-1]]), + atol=1e-4) + ) + + transformed_past_target, transformed_future_targets, loc, scale = scalar(past_targets, + future_targets=future_targets) + + self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1]))) + + self.assertTrue(torch.allclose(transformed_past_target[1], + torch.Tensor([-1.4908] * 2 + [-0.1355] * 5 + [1.2197] * 3).unsqueeze(-1), + atol=1e-4) + ) + self.assertTrue(torch.equal(transformed_past_target[2], torch.zeros([10, 1]))) + + self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10)) + self.assertTrue(torch.allclose(transformed_future_targets[1], torch.ones([10, 1]) * 12.0618, atol=1e-4)) + self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 6.)) + + self.assertTrue( + torch.allclose(loc, + torch.Tensor([0., 1.1, 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]]), + atol=1e-4 + ) + ) + self.assertTrue( + torch.allclose(scale, + torch.Tensor([1., 0.7379, 1.]).reshape([len(past_targets), 1, past_targets.shape[-1]]), + atol=1e-4 + ) + ) + + transformed_past_target_full, transformed_future_targets_full, loc_full, scale_full = scalar( + past_targets, past_observed_values=torch.ones([2, 10, 1], dtype=torch.bool), future_targets=future_targets + ) + self.assertTrue(torch.equal(transformed_past_target, transformed_past_target_full)) + self.assertTrue(torch.equal(transformed_future_targets_full, transformed_future_targets_full)) + self.assertTrue(torch.equal(loc, loc_full)) + self.assertTrue(torch.equal(scale, scale_full)) + + def test_target_min_max_scalar(self): + X = {'dataset_properties': {}} + scalar = TargetMinMaxScaler() + scalar = scalar.fit(X) + X = scalar.transform(X) + self.assertIsInstance(X['target_scaler'], BaseTargetScaler) + + past_targets = torch.vstack( + [ + torch.zeros(10), + torch.Tensor([0.] * 2 + [1.] * 5 + [2.] * 3), + torch.ones(10) * 4 + ] + ).unsqueeze(-1) + past_observed_values = torch.vstack( + [ + torch.Tensor([False] * 3 + [True] * 7), + torch.Tensor([False] * 2 + [True] * 8), + torch.Tensor([True] * 10) + + ]).unsqueeze(-1).bool() + future_targets = torch.ones([3, 10, 1]) * 10 + + transformed_past_target, transformed_future_targets, loc, scale = scalar( + past_targets, past_observed_values=past_observed_values, future_targets=future_targets + ) + + self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1]))) + self.assertTrue(torch.allclose(transformed_past_target[1], + torch.Tensor([0.] * 2 + [0.] * 5 + [1.] 
* 3).unsqueeze(-1))) + self.assertTrue(torch.equal(transformed_past_target[2], torch.zeros([10, 1]))) + + self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10)) + self.assertTrue(torch.equal(transformed_future_targets[1], torch.ones([10, 1]) * 9)) + self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 1.5)) + + self.assertTrue( + torch.equal(loc, torch.Tensor([0., 1., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]])) + ) + self.assertTrue( + torch.equal(scale, torch.Tensor([1., 1., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]])) + ) + + transformed_past_target, transformed_future_targets, loc, scale = scalar(past_targets, + future_targets=future_targets) + self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1]))) + self.assertTrue(torch.equal(transformed_past_target[1], + torch.Tensor([0.] * 2 + [0.5] * 5 + [1.] * 3).unsqueeze(-1))) + self.assertTrue(torch.equal(transformed_past_target[2], torch.zeros([10, 1]))) + + self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10)) + self.assertTrue(torch.equal(transformed_future_targets[1], torch.ones([10, 1]) * 5)) + self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 1.5)) + self.assertTrue( + torch.equal(loc, torch.Tensor([0., 0., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]])) + ) + self.assertTrue( + torch.equal(scale, torch.Tensor([1., 2., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]])) + ) + + transformed_past_target_full, transformed_future_targets_full, loc_full, scale_full = scalar( + past_targets, past_observed_values=torch.ones([2, 10, 1], dtype=torch.bool), future_targets=future_targets + ) + self.assertTrue(torch.equal(transformed_past_target, transformed_past_target_full)) + self.assertTrue(torch.equal(transformed_future_targets_full, transformed_future_targets_full)) + self.assertTrue(torch.equal(scale, scale_full)) + def test_target_max_abs_scalar(self): X = {'dataset_properties': {}} scalar = TargetMaxAbsScaler() @@ -82,7 +301,7 @@ def test_target_max_abs_scalar(self): past_targets = torch.vstack( [ torch.zeros(10), - torch.Tensor([0.] * 2 + [2.] * 8), + torch.Tensor([0.] * 2 + [1.] * 5 + [2.] * 3), torch.ones(10) * 4 ] ).unsqueeze(-1) @@ -100,13 +319,15 @@ def test_target_max_abs_scalar(self): ) self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1]))) - self.assertTrue(torch.equal(transformed_past_target[1], torch.Tensor([0.] * 2 + [1.] * 8).unsqueeze(-1))) + self.assertTrue(torch.allclose(transformed_past_target[1], + torch.Tensor([0.] * 2 + [0.5] * 5 + [1.] 
* 3).unsqueeze(-1))) self.assertTrue(torch.equal(transformed_past_target[2], torch.ones([10, 1]))) self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10)) - self.assertTrue(torch.equal(transformed_future_targets[1], torch.ones([10, 1]) * 5)) + self.assertTrue(torch.equal(transformed_future_targets[1], torch.ones([10, 1]) * 5.)) self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 2.5)) + self.assertIsNone(loc) self.assertTrue( torch.equal(scale, torch.Tensor([1., 2., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]])) ) @@ -114,23 +335,23 @@ def test_target_max_abs_scalar(self): transformed_past_target, transformed_future_targets, loc, scale = scalar(past_targets, future_targets=future_targets) self.assertTrue(torch.equal(transformed_past_target[0], torch.zeros([10, 1]))) - self.assertTrue(torch.equal(transformed_past_target[1], torch.Tensor([0.] * 2 + [1.] * 8).unsqueeze(-1))) + self.assertTrue(torch.equal(transformed_past_target[1], + torch.Tensor([0.] * 2 + [0.5] * 5 + [1.] * 3).unsqueeze(-1))) self.assertTrue(torch.equal(transformed_past_target[2], torch.ones([10, 1]))) self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10)) self.assertTrue(torch.equal(transformed_future_targets[1], torch.ones([10, 1]) * 5)) self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 2.5)) + self.assertIsNone(loc) self.assertTrue( torch.equal(scale, torch.Tensor([1., 2., 4.]).reshape([len(past_targets), 1, past_targets.shape[-1]])) ) - self.assertIsNone(loc) transformed_past_target_full, transformed_future_targets_full, loc_full, scale_full = scalar( past_targets, past_observed_values=torch.ones([2, 10, 1], dtype=torch.bool), future_targets=future_targets ) self.assertTrue(torch.equal(transformed_past_target, transformed_past_target_full)) self.assertTrue(torch.equal(transformed_future_targets_full, transformed_future_targets_full)) - self.assertTrue(torch.equal(scale, scale_full)) - self.assertIsNone(loc_full) + self.assertTrue(torch.equal(scale, scale_full)) From 1025b937858300a4e643c9722f208e42fa5f038c Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 15 May 2022 18:06:05 +0200 Subject: [PATCH 262/347] test for training loss --- .../DistributionLoss.py | 9 +- .../forecasting_training_loss/QuantileLoss.py | 2 + .../forecasting_training_loss/__init__.py | 1 - .../test_forecasting_training_losses.py | 101 ++++++++++++++++++ 4 files changed, 108 insertions(+), 5 deletions(-) create mode 100644 test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py index 87f936c93..cbbd429f9 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py @@ -1,4 +1,4 @@ -from typing import Optional, Dict, Union, Any, NamedTuple +from typing import Optional, Dict, Union, Any import numpy as np from ConfigSpace import ConfigurationSpace @@ -10,10 +10,11 @@ DisForecastingStrategy ) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import \ - ForecastingLossComponents +from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import 
( + ForecastingLossComponents, +) from autoPyTorch.pipeline.components.training.losses import LogProbLoss -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter, FitRequirement +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter class DistributionLoss(ForecastingLossComponents): diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py index 72889e8bd..04cadaea3 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py @@ -24,6 +24,8 @@ def __init__(self, super().__init__() self.random_state = random_state self.quantiles = [0.5, lower_quantile, upper_quantile] + # To make it compatible with + # autoPyTorch.pipeline.components.training.trainer.forecasting_trainer.forecasting_base_trainer self.loss = partial(QuantileLoss, quantiles=self.quantiles) def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py index 2e98722ea..53c6788b8 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py @@ -196,4 +196,3 @@ def get_hyperparameter_search_space( def transform(self, X: np.ndarray) -> np.ndarray: assert self.choice is not None, "Cannot call transform before the object is initialized" return self.choice.transform(X) - diff --git a/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py b/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py new file mode 100644 index 000000000..b626299ae --- /dev/null +++ b/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py @@ -0,0 +1,101 @@ +import torch + +import copy +import unittest + +from autoPyTorch.constants import ( + TASK_TYPES_TO_STRING, + TIMESERIES_FORECASTING, +) +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( + ALL_DISTRIBUTIONS, + DisForecastingStrategy +) +from autoPyTorch.pipeline.components.training.losses import LogProbLoss, QuantileLoss +from autoPyTorch.pipeline.components.training.losses import L1Loss, MSELoss, MAPELoss, MASELoss + +from autoPyTorch.pipeline.components.setup.forecasting_training_loss import ForecastingLossChoices +from autoPyTorch.pipeline.components.setup.forecasting_training_loss.DistributionLoss import DistributionLoss +from autoPyTorch.pipeline.components.setup.forecasting_training_loss.QuantileLoss import NetworkQuantileLoss +from autoPyTorch.pipeline.components.setup.forecasting_training_loss.RegressionLoss import RegressionLoss + + +class TestForecastingTrainingLoss(unittest.TestCase): + def test_get_set_config_space(self): + """Make sure that we can setup a valid choice in the encoder + choice""" + dataset_properties = {'task_type': TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING]} + loss_choice = ForecastingLossChoices(dataset_properties) + cs = loss_choice.get_hyperparameter_search_space(dataset_properties) + + # Make sure that all hyperparameters are part of the search space + self.assertListEqual( + sorted(cs.get_hyperparameter('__choice__').choices), + 
sorted(list(loss_choice.get_components().keys())) + ) + + # Make sure we can properly set some random configs + # Whereas just one iteration will make sure the algorithm works, + # doing five iterations increase the confidence. We will be able to + # catch component specific crashes + for i in range(5): + config = cs.sample_configuration() + config_dict = copy.deepcopy(config.get_dictionary()) + loss_choice.set_hyperparameters(config) + + self.assertEqual(loss_choice.choice.__class__, + loss_choice.get_components()[config_dict['__choice__']]) + + include = ['DistributionLoss', 'QuantileLoss'] + cs = loss_choice.get_hyperparameter_search_space(dataset_properties=dataset_properties, include=include) + self.assertTrue( + sorted(cs.get_hyperparameter('__choice__').choices), + sorted(include), + ) + + def test_distribution_loss(self): + for dist_cls in ALL_DISTRIBUTIONS.keys(): + loss = DistributionLoss(dist_cls) + self.assertEqual(loss.dist_cls, dist_cls) + + dataset_properties = {'task_type': TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING]} + fit_dictionary = {'dataset_properties': dataset_properties} + loss = loss.fit(fit_dictionary) + fit_dictionary = loss.transform(fit_dictionary) + + self.assertEqual(fit_dictionary['loss'], LogProbLoss) + self.assertEqual(fit_dictionary['required_padding_value'], ALL_DISTRIBUTIONS[dist_cls].value_in_support) + self.assertIsInstance(fit_dictionary['dist_forecasting_strategy'], DisForecastingStrategy) + + def test_quantile_loss(self): + lower = 0.2 + upper = 0.8 + loss = NetworkQuantileLoss(lower_quantile=lower, upper_quantile=upper) + self.assertEqual(loss.quantiles, [0.5, lower, upper]) + + dataset_properties = {'task_type': TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING]} + fit_dictionary = {'dataset_properties': dataset_properties} + loss = loss.fit(fit_dictionary) + fit_dictionary = loss.transform(fit_dictionary) + train_loss = fit_dictionary['loss']() + + self.assertIsInstance(train_loss, QuantileLoss) + self.assertListEqual(train_loss.quantiles, loss.quantiles) + self.assertListEqual(fit_dictionary['quantile_values'], loss.quantiles) + + def test_regression_loss(self): + loss_dict = dict(l1=L1Loss, + mse=MSELoss, + mape=MAPELoss, + mase=MASELoss) + for loss_name, loss_type in loss_dict.items(): + loss = RegressionLoss(loss_name) + + dataset_properties = {'task_type': TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING]} + fit_dictionary = {'dataset_properties': dataset_properties} + loss = loss.fit(fit_dictionary) + fit_dictionary = loss.transform(fit_dictionary) + train_loss = fit_dictionary['loss'] + + self.assertEqual(train_loss, loss_type) + From 6f6863324e1aa0959831d431a02e19b1d08dca97 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 16 May 2022 15:06:49 +0200 Subject: [PATCH 263/347] maint --- .../TimeSeriesEarlyPreProcessing.py | 1 - .../setup/network/forecasting_architecture.py | 18 ++++++------- .../forecasting_backbone/__init__.py | 18 ++++--------- .../forecasting_backbone/cells.py | 6 ----- .../forecasting_backbone/components_util.py | 2 ++ .../base_forecasting_encoder.py | 14 ++++------- .../flat_encoder/MLPEncoder.py | 5 +++- .../flat_encoder/__init__.py | 15 ++++++----- .../seq_encoder/TCNEncoder.py | 6 +++-- .../seq_encoder/TransformerEncoder.py | 6 +++-- .../seq_encoder/__init__.py | 6 ++--- .../forecasting_head.py | 1 - .../forecasting_base_trainer.py | 25 +++++++++++++++---- 13 files changed, 62 insertions(+), 61 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py 
b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py index 9b27f950d..95377edef 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py @@ -15,7 +15,6 @@ class TimeSeriesEarlyPreprocessing(EarlyPreprocessing): - def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None: super(EarlyPreprocessing, self).__init__() self.random_state = random_state diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 40d84c6ad..e9e862b61 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -152,7 +152,7 @@ class AbstractForecastingNet(nn.Module): def __init__(self, network_structure: NetworkStructure, - network_embedding: nn.Module, # TODO consider embedding for past, future and static features + network_embedding: nn.Module, network_encoder: Dict[str, EncoderBlockInfo], network_decoder: Dict[str, DecoderBlockInfo], temporal_fusion: Optional[TemporalFusionLayer], @@ -197,9 +197,9 @@ def __init__(self, self.embedding = network_embedding if len(known_future_features) > 0: known_future_features_idx = [feature_names.index(kff) for kff in known_future_features] - self.embedding_future = self.embedding.get_partial_models(known_future_features_idx) + self.decoder_embedding = self.embedding.get_partial_models(known_future_features_idx) else: - self.embedding_future = _NoEmbedding() + self.decoder_embedding = _NoEmbedding() # modules that generate tensors while doing forward pass self.lazy_modules = [] if network_structure.variable_selection: @@ -406,7 +406,7 @@ def pre_processing(self, x_past = None if length_future > 0: if future_features is not None: - future_features = self.embedding_future(future_features.to(self.device)) + future_features = self.decoder_embedding(future_features.to(self.device)) x_future = {} if hasattr(self.variable_selector, 'placeholder_features'): for placehold in self.variable_selector.placeholder_features: @@ -447,7 +447,7 @@ def pre_processing(self, x_past = self.embedding(x_past.to(device=self.device)) if future_features is not None: - future_features = self.embedding_future(future_features.to(self.device)) + future_features = self.decoder_embedding(future_features.to(self.device)) return x_past, future_features, None, loc, scale, None, past_targets def forward(self, @@ -547,7 +547,7 @@ def decoder_select_variable(self, future_targets: torch.tensor, future_features: length_future = future_targets.shape[1] future_targets = future_targets.to(self.device) if future_features is not None: - future_features = self.embedding_future(future_features.to(self.device)) + future_features = self.decoder_embedding(future_features.to(self.device)) x_future = {} if hasattr(self.variable_selector, 'placeholder_features'): for placeholder in self.variable_selector.placeholder_features: @@ -605,7 +605,7 @@ def forward(self, else: x_future = future_targets if future_features is None else torch.cat([future_features, future_targets], dim=-1) - x_future = self.embedding_future(x_future.to(self.device)) + x_future = self.decoder_embedding(x_future.to(self.device)) encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) @@ -656,7 +656,7 
@@ def forward(self, dim=-1) x_future = x_future.to(self.device) - x_future = self.embedding_future(x_future) + x_future = self.decoder_embedding(x_future) decoder_output = self.decoder(x_future, encoder_output=encoder2decoder, @@ -744,7 +744,7 @@ def forward(self, x_future = x_future.to(self.device) - x_future = self.embedding_future(x_future) + x_future = self.decoder_embedding(x_future) decoder_output = self.decoder(x_future, encoder_output=encoder2decoder, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index dbc04f25a..35afe0b13 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -1,32 +1,25 @@ -import os from collections import OrderedDict -from typing import Dict, Optional, List, Any, Union, Tuple -from sklearn.pipeline import Pipeline +from typing import Dict, Optional, List, Any, Union import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace, Configuration -from ConfigSpace.conditions import EqualsCondition, OrConjunction from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, autoPyTorchComponent, - find_components, ) from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import ( BaseForecastingEncoder, ) -from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder \ import FlatForecastingEncoderChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder import \ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder import\ SeqForecastingEncoderChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import \ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import ( decoders, decoder_addons, add_decoder -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter +) from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdate @@ -152,8 +145,7 @@ def get_available_components( # target_type = dataset_properties['target_type'] # Apply some automatic filtering here for # backbones based on the dataset! 
- # TODO: Think if there is any case where a backbone - # is not recommended for a certain dataset + # TODO: Think if there is any case where a backbone is not recommended for a certain dataset components_dict[name] = entry diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 5e1bee7be..d38996bb8 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -637,7 +637,6 @@ def forward(self, if incremental_update: # in this case, we only have Transformer, thus x_all needs to be None value! # TODO make this argument clearer! - #x_all = torch.cat([self.cached_intermediate_state[i], x], dim=1) fx = decoder_i(x, encoder_output=encoder_output[i], pos_idx=pos_idx) else: fx = decoder_i(x, encoder_output=encoder_output[i], pos_idx=pos_idx) @@ -648,10 +647,5 @@ def forward(self, if self.decoder_has_hidden_states[i]: self.cached_intermediate_state[i] = hx #TODO consider if there are other case that could make use of cached intermediate states - # else: - # if incremental_update: - # self.cached_intermediate_state[i] = x_all - # else: - # self.cached_intermediate_state[i] = x x = fx return x diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py index b6597180e..b58cfa87a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py @@ -1,9 +1,11 @@ import math + from sklearn.base import BaseEstimator from typing import Any, Dict, NamedTuple, Optional, Tuple import torch + from torch import nn diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index 7701988e2..43d77a68a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -4,20 +4,17 @@ import pandas as pd from scipy.sparse import csr_matrix -import torch import torchvision -from ConfigSpace import ConfigurationSpace from autoPyTorch.utils.common import FitRequirement from torch import nn from abc import abstractmethod -from typing import Any, Dict, Iterable, Optional, Tuple, List, Union, NamedTuple +from typing import Any, Dict, Iterable, Optional, Tuple, List from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( - EncoderProperties, EncoderBlockInfo, EncoderNetwork + EncoderProperties, EncoderBlockInfo ) from 
autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure @@ -52,14 +49,14 @@ def _required_fit_arguments(self) -> List[FitRequirement]: FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False), FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), - FitRequirement('time_feature_transform', (Iterable,), user_defined=False, dataset_property=True) + FitRequirement('time_feature_transform', (Iterable,), user_defined=False, dataset_property=True), + FitRequirement('network_embedding', (nn.Module, ), user_defined=False, dataset_property=False) ] def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.check_requirements(X, y) X_train = X.get('X_train', None) - y_train = X['y_train'] input_shape = X["dataset_properties"]['input_shape'] output_shape = X["dataset_properties"]['output_shape'] @@ -71,7 +68,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: else: # get input shape by transforming first two elements of the training set transforms = torchvision.transforms.Compose(X['preprocess_transforms']) - X_train = X_train[:1, np.newaxis, ...] + X_train = X_train.values[:1, np.newaxis, ...] X_train = transforms(X_train) input_shape = np.concatenate(X_train).shape[1:] @@ -97,7 +94,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: else: input_shape = X['encoder_output_shape'] - self.encoder = self.build_encoder( input_shape=input_shape, ) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py index 85bf72698..f2575463c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py @@ -8,7 +8,10 @@ from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( - BaseForecastingEncoder, EncoderNetwork, EncoderProperties + BaseForecastingEncoder, EncoderProperties +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( + EncoderNetwork ) from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py index 22f5e70a1..bd67cb843 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py @@ -1,6 +1,6 @@ -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import \ +from 
autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import ( AbstractForecastingEncoderChoice - +) import os from collections import OrderedDict @@ -45,6 +45,8 @@ def get_available_components( to remove from the configuration space dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics of the dataset to guide the pipeline choices of components + components(Optional[Dict[str, autoPyTorchComponent]) + components Returns: Dict[str, autoPyTorchComponent]: A filtered dict of learning @@ -94,14 +96,12 @@ def get_available_components( # target_type = dataset_properties['target_type'] # Apply some automatic filtering here for # backbones based on the dataset! - # TODO: Think if there is any case where a backbone - # is not recommended for a certain dataset + # TODO: Think if there is any case where a backbone is not recommended for a certain dataset components_dict[name] = entry return components_dict - def get_components(self) -> Dict[str, autoPyTorchComponent]: """Returns the available backbone components @@ -121,10 +121,9 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None ) -> Dict[str, Union[str, bool]]: return { - 'shortname': 'SeqEncoder', - 'name': 'SeqEncoder', + 'shortname': 'FlatEncoder', + 'name': 'FlatEncoder', 'handles_tabular': False, 'handles_image': False, 'handles_time_series': True, } - diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py index 6cc29b71c..6c907f138 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py @@ -11,8 +11,10 @@ import torch from torch import nn from torch.nn.utils import weight_norm -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( - BaseForecastingEncoder, EncoderNetwork +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + base_forecasting_encoder import BaseForecastingEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( + EncoderNetwork ) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index bbcfbff08..b11a6c935 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -12,8 +12,10 @@ from torch import nn from autoPyTorch.pipeline.components.base_component import BaseEstimator -from 
autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( - BaseForecastingEncoder, EncoderNetwork, EncoderProperties +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + base_forecasting_encoder import BaseForecastingEncoder, EncoderProperties +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( + EncoderNetwork ) from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index 5ea33f371..d119ae788 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -1,19 +1,17 @@ import os from collections import OrderedDict from typing import Dict, Optional, List, Any, Union -import numpy as np from sklearn.pipeline import Pipeline from ConfigSpace.hyperparameters import ( Constant, CategoricalHyperparameter, - UniformIntegerHyperparameter, UniformFloatHyperparameter, OrdinalHyperparameter, ) from ConfigSpace.configuration_space import ConfigurationSpace, Configuration from ConfigSpace.conditions import ( - EqualsCondition, OrConjunction, GreaterThanCondition, NotEqualsCondition, AndConjunction + EqualsCondition, OrConjunction, GreaterThanCondition ) from ConfigSpace.forbidden import ForbiddenInClause, ForbiddenEqualsClause, ForbiddenAndConjunction @@ -25,7 +23,7 @@ find_components, ) from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter +from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import \ AbstractForecastingEncoderChoice diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 81d3985a1..1ee592805 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -56,7 +56,6 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), FitRequirement('auto_regressive', (bool,), user_defined=False, dataset_property=False), FitRequirement('n_decoder_output_features', (int,), user_defined=False, dataset_property=False), - FitRequirement('network_encoder', (Dict,), user_defined=False, dataset_property=False), FitRequirement('network_decoder', (Dict,), user_defined=False, dataset_property=False), FitRequirement('n_prediction_heads', (int,), user_defined=False, dataset_property=False), FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), diff 
--git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 956064531..f31d0aa15 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -133,7 +133,6 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: Dict[str, to torch.Tensor: The predictions of the network float: the loss incurred in the prediction """ - past_target = data['past_targets'].float() past_observed_targets = data['past_observed_targets'] past_features = data["past_features"] @@ -146,6 +145,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: Dict[str, to future_observed_targets = future_targets["future_observed_targets"] future_targets_values = future_targets["future_targets"] + past_target = self.cast_targets(data['past_targets']) future_targets_values = self.cast_targets(future_targets_values) if isinstance(self.criterion, MASELoss): @@ -166,8 +166,8 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: Dict[str, to loss_func_backcast = self.criterion_preparation(**criterion_kwargs_past) loss_func_forecast = self.criterion_preparation(**criterion_kwargs_future) - loss_backcast = loss_func_backcast(self.criterion, backcast) * past_observed_targets - loss_forecast = loss_func_forecast(self.criterion, forecast) * future_observed_targets + loss_backcast = loss_func_backcast(self.criterion, backcast) * past_observed_targets.to(self.device) + loss_forecast = loss_func_forecast(self.criterion, forecast) * future_observed_targets.to(self.device) loss = loss_forecast.mean() + loss_backcast.mean() * self.backcast_loss_ratio @@ -197,7 +197,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: Dict[str, to loss_func = self.criterion_preparation(**criterion_kwargs) - loss = torch.mean(loss_func(self.criterion, outputs) * future_observed_targets) + loss = torch.mean(loss_func(self.criterion, outputs) * future_observed_targets.to(self.device)) loss.backward() self.optimizer.step() @@ -250,7 +250,7 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, batch_size = past_target.shape[0] - future_observed_targets = future_targets["future_observed_targets"] + future_observed_targets = future_targets["future_observed_targets"].to(self.device) future_targets_values = future_targets["future_targets"] future_targets_values = self.cast_targets(future_targets_values) @@ -310,3 +310,18 @@ def compute_metrics(self, outputs_data: List[torch.Tensor], targets_data: List[t targets_data = torch.cat(targets_data, dim=0).numpy() return calculate_score(targets_data, outputs_data, self.task_type, self.metrics, **self.metrics_kwargs) + + def cast_targets(self, targets: torch.Tensor) -> torch.Tensor: + """ + This function is quite similar to the base class implementation, except that we do not move targets to + sef.device + + """ + if self.task_type in (REGRESSION_TASKS + FORECASTING_TASKS): + targets = targets.float() + # make sure that targets will have same shape as outputs (really important for mse loss for example) + if targets.ndim == 1: + targets = targets.unsqueeze(1) + else: + targets = targets.long() + return targets From 570408dcb4f79c3f3ac7674e269d2a4a86adad5c Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 16 May 2022 19:35:45 +0200 Subject: 
[PATCH 264/347] test for network backbone --- .../forecasting_backbone/__init__.py | 8 +- .../base_forecasting_decoder.py | 2 - .../base_forecasting_encoder.py | 26 ++- .../forecasting_head.py | 1 + .../forecasting_networks/__init__.py | 0 .../test_base_components.py | 174 ++++++++++++++++++ .../test_flat_backbones.py | 15 ++ 7 files changed, 213 insertions(+), 13 deletions(-) create mode 100644 test/test_pipeline/components/setup/forecasting/forecasting_networks/__init__.py create mode 100644 test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py create mode 100644 test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index 35afe0b13..e368f825e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -1,4 +1,5 @@ from collections import OrderedDict +import numpy as np from typing import Dict, Optional, List, Any, Union import ConfigSpace.hyperparameters as CSH @@ -26,14 +27,15 @@ class ForecastingNetworkChoice(autoPyTorchChoice): """ A network is composed of an encoder and decoder. In most of the case, the choice of decoder is heavily dependent on - the choice of encoder. Thus here "choice" indicates the choice of encoder, then decoder will be determined by + the choice of encoder. Therefore, here "choice" indicates the choice of encoder, then decoder will be determined by the encoder. """ def __init__(self, - **kwargs, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + random_state: Optional[np.random.RandomState] = None ): - super().__init__(**kwargs) + super().__init__(dataset_properties, random_state) self.include_components = None self.exclude_components = None diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 725bdcb43..5f9424a6e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -38,10 +38,8 @@ def __init__(self, @property def _required_fit_requirements(self) -> List[FitRequirement]: return [ - FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), FitRequirement('known_future_features', (Tuple,), user_defined=False, dataset_property=True), FitRequirement('feature_shapes', (Dict,), user_defined=False, dataset_property=True), - FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), FitRequirement('network_encoder', (OrderedDict,), user_defined=False, dataset_property=False), FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False), FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index 43d77a68a..6995d771b 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -42,27 +42,28 @@ def __init__(self, def _required_fit_arguments(self) -> List[FitRequirement]: return [ FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), - FitRequirement('y_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, - dataset_property=False), FitRequirement('uni_variant', (bool,), user_defined=False, dataset_property=True), FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False), FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), FitRequirement('time_feature_transform', (Iterable,), user_defined=False, dataset_property=True), - FitRequirement('network_embedding', (nn.Module, ), user_defined=False, dataset_property=False) + FitRequirement('network_embedding', (nn.Module, ), user_defined=False, dataset_property=False), + FitRequirement('window_size', (int,), user_defined=False, dataset_property=False) ] def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.check_requirements(X, y) - X_train = X.get('X_train', None) - - input_shape = X["dataset_properties"]['input_shape'] + input_shape = (*X["dataset_properties"]['input_shape'][:-1], 0) output_shape = X["dataset_properties"]['output_shape'] if self.block_number == 1: if not X["dataset_properties"]["uni_variant"]: + X_train = X.get('X_train', None) + if X_train is None: + raise ValueError('Non uni_variant dataset must contain X_train!') + if X["dataset_properties"]["is_small_preprocess"]: input_shape = X_train.shape[1:] else: @@ -72,10 +73,12 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: X_train = transforms(X_train) input_shape = np.concatenate(X_train).shape[1:] + if X['transform_time_features']: n_time_feature_transform = len(X['dataset_properties']['time_feature_transform']) else: n_time_feature_transform = 0 + input_shape = (*input_shape[:-1], input_shape[-1] + n_time_feature_transform) if 'network_embedding' in X.keys(): @@ -92,7 +95,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: input_shape = (X['window_size'], in_features) else: - input_shape = X['encoder_output_shape'] + if 'network_encoder' not in X or f'block_{self.block_number -1}' not in X['network_encoder']: + raise ValueError('Lower block layers must be fitted and transformed first!') + network_block_info = X['network_encoder'][f'block_{self.block_number -1}'] + input_shape = network_block_info.encoder_output_shape self.encoder = self.build_encoder( input_shape=input_shape, @@ -100,8 +106,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.input_shape = input_shape + has_hidden_states = self.encoder_properties().has_hidden_states self.encoder_output_shape = get_output_shape(self.encoder, input_shape, has_hidden_states) + if self.n_encoder_output_feature() != self.encoder_output_shape[-1]: + raise ValueError('n_encoder_output_feature must equal to the 
output dimension') return self @staticmethod @@ -110,6 +119,7 @@ def allowed_decoders(): @abstractmethod def n_encoder_output_feature(self) -> int: + # We need this to compute the output of the variable selection network raise NotImplementedError def n_hidden_states(self) -> int: @@ -141,7 +151,7 @@ def build_encoder(self, Returns: nn.Module: backbone module """ - raise NotImplementedError() + pass @staticmethod def encoder_properties() -> EncoderProperties: diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 1ee592805..55cb13fdb 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -59,6 +59,7 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement('network_decoder', (Dict,), user_defined=False, dataset_property=False), FitRequirement('n_prediction_heads', (int,), user_defined=False, dataset_property=False), FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), + FitRequirement('net_output_type', (str, ), user_defined=False, dataset_property=False) ] def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/__init__.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py new file mode 100644 index 000000000..4db9c42fa --- /dev/null +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py @@ -0,0 +1,174 @@ +import copy +import unittest +from unittest import mock + +import pandas as pd +import numpy as np +import torch +from autoPyTorch.constants import ( + TASK_TYPES_TO_STRING, + TIMESERIES_FORECASTING, +) + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone import ForecastingNetworkChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. \ + base_forecasting_encoder import BaseForecastingEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ + base_forecasting_decoder import BaseForecastingDecoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. 
\ + MLPDecoder import ForecastingMLPDecoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( + DecoderBlockInfo, DecoderProperties +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( + EncoderProperties, EncoderBlockInfo, EncoderNetwork +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead + + +class DummyEmbedding(torch.nn.Module): + def forward(self, x): + if x.shape[-1] > 10: + return x[..., :-10] + return x + +class DummyEncoderNetwork(EncoderNetwork): + def forward(self, x, output_seq=False): + if output_seq: + return torch.ones(x.shape[:-1]) + return torch.ones((*x.shape[:-1], 10)) + + +class DummyForecastingEncoder(BaseForecastingEncoder): + def n_encoder_output_feature(self): + return 10 + + def build_encoder(self, input_shape): + return DummyEncoderNetwork() + + +class DummyTranformers(): + def __call__(self, x): + return x[..., :(x.shape[-1] // 2)] + + +class TestForecastingNetworkBases(unittest.TestCase): + def setUp(self) -> None: + embedding = DummyEmbedding() + + transformation = [DummyTranformers()] + + input_shape = (100, 50) + output_shape = (100, 1) + time_feature_transform = [1, 2] + + feature_shapes = {'f1': 10, 'f2': 10, 'f3': 10, 'f4': 10, 'f5': 10} + known_future_features = ('f1', 'f2', 'f5') + + self.encoder = DummyForecastingEncoder() + + with mock.patch('autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.' + 'forecasting_decoder.base_forecasting_decoder.BaseForecastingDecoder') as MockDecoder: + mockdecoder = MockDecoder.return_value + mockdecoder._build_decoder.return_values = (None, 10) + self.decoder = mockdecoder + + self.dataset_properties = dict(input_shape=input_shape, + output_shape=output_shape, + transform_time_features=True, + time_feature_transform=time_feature_transform, + feature_shapes=feature_shapes, + known_future_features=known_future_features, + ) + + self.fit_dictionary = dict(X_train=pd.DataFrame(np.random.randn(*input_shape)), + y_train=pd.DataFrame(np.random.randn(*output_shape)), + network_embedding=embedding, + preprocess_transforms=transformation, + window_size=3 + ) + + def test_encoder_choices(self): + dataset_properties = {'task_type': TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING]} + encoder_choices = ForecastingNetworkChoice(dataset_properties) + cs = encoder_choices.get_hyperparameter_search_space(dataset_properties) + self.assertListEqual(list(cs.get_hyperparameter('__choice__').choices), ['flat_encoder', 'seq_encoder']) + + cs_only_flat = encoder_choices.get_hyperparameter_search_space(dataset_properties, include=['flat_encoder']) + for hp_name in cs_only_flat.get_hyperparameter_names(): + self.assertFalse(hp_name.startswith('seq_encoder')) + + cs_only_flat = encoder_choices.get_hyperparameter_search_space(dataset_properties, include=['flat_encoder']) + for hp_name in cs_only_flat.get_hyperparameter_names(): + self.assertFalse(hp_name.startswith('seq_encoder')) + + cs_only_rnn = encoder_choices.get_hyperparameter_search_space(dataset_properties, + include=['seq_encoder:RNNEncoder']) + + self.assertListEqual(list(cs_only_rnn.get_hyperparameter('__choice__').choices), ['seq_encoder']) + 
self.assertListEqual(list(cs_only_rnn.get_hyperparameter('seq_encoder:block_1:__choice__').choices), ['RNNEncoder']) + + cs_no_rnn = encoder_choices.get_hyperparameter_search_space(dataset_properties, + exclude=['seq_encoder:RNNEncoder']) + for hp_name in cs_no_rnn.get_hyperparameter_names(): + self.assertFalse('RNNEncoder' in hp_name) + + def test_base_encoder(self): + window_size = self.fit_dictionary['window_size'] + for uni_variant in (True, False): + for variable_selection in (True, False): + for transform_time_features in (True, False): + for is_small_preprocess in (True, False): + network_structure = NetworkStructure(variable_selection=variable_selection) + + dataset_properties = copy.copy(self.dataset_properties) + fit_dictionary = copy.copy(self.fit_dictionary) + + dataset_properties['is_small_preprocess'] = is_small_preprocess + dataset_properties['uni_variant'] = uni_variant + + fit_dictionary['dataset_properties'] = self.dataset_properties + fit_dictionary['network_structure'] = network_structure + fit_dictionary['transform_time_features'] = transform_time_features + fit_dictionary['dataset_properties'] = dataset_properties + + encoder_block_1 = copy.deepcopy(self.encoder) + + encoder_block_2 = copy.deepcopy(self.encoder) + encoder_block_2.block_number = 2 + + encoder_block_1 = encoder_block_1.fit(fit_dictionary) + fit_dictionary = encoder_block_1.transform(fit_dictionary) + network_encoder = fit_dictionary['network_encoder'] + self.assertIsInstance(network_encoder['block_1'], EncoderBlockInfo) + self.assertEqual(network_encoder['block_1'].encoder_output_shape, (window_size, 10)) + + if variable_selection: + self.assertEqual(network_encoder['block_1'].encoder_input_shape, (window_size, 10)) + else: + if uni_variant: + n_input_features = 0 + else: + if is_small_preprocess: + n_input_features = 40 + else: + n_input_features = 15 + + if transform_time_features: + n_input_features += len(dataset_properties['time_feature_transform']) + + n_input_features += dataset_properties['output_shape'][-1] + self.assertEqual(network_encoder['block_1'].encoder_input_shape, (window_size, + n_input_features)) + + encoder_block_2 = encoder_block_2.fit(fit_dictionary) + fit_dictionary = encoder_block_2.transform(fit_dictionary) + + network_encoder = fit_dictionary['network_encoder'] + self.assertIsInstance(network_encoder['block_2'], EncoderBlockInfo) + self.assertEqual(network_encoder['block_2'].encoder_output_shape, (window_size, 10)) + self.assertEqual(network_encoder['block_2'].encoder_input_shape, (window_size, + 10)) + + diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py new file mode 100644 index 000000000..208776158 --- /dev/null +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py @@ -0,0 +1,15 @@ +import unittest + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder \ + import FlatForecastingEncoderChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder.\ + MLPEncoder import MLPEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder.\ + NBEATSEncoder import NBEATSEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.MLPDecoder import ( + 
ForecastingMLPDecoder +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.NBEATSDecoder \ + import NBEATSDecoder + + From 7d420073bb3ade975f05e1e5df1ee0645267c178 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 17 May 2022 12:22:08 +0200 Subject: [PATCH 265/347] test for backbone base --- autoPyTorch/datasets/time_series_dataset.py | 7 +- .../forecasting_backbone/cells.py | 4 +- .../forecasting_decoder/MLPDecoder.py | 17 +- .../base_forecasting_decoder.py | 11 +- .../base_forecasting_encoder.py | 1 - .../forecasting_head.py | 1 - .../test_base_components.py | 209 ++++++++++++++++-- 7 files changed, 213 insertions(+), 37 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 7105d5c12..3bbf26612 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -512,9 +512,11 @@ def __init__(self, # process known future features if known_future_features is None: - self.future_feature_shapes: Tuple[int, int] = (self.seq_length_min, 0) + future_feature_shapes: Tuple[int, int] = (self.seq_length_min, 0) else: - self.future_feature_shapes: Tuple[int, int] = (self.seq_length_min, len(known_future_features)) + future_feature_shapes: Tuple[int, int] = (self.seq_length_min, len(known_future_features)) + self.encoder_can_be_auto_regressive = (self.input_shape[-1] == future_feature_shapes[-1]) + if len(self.train_tensors) == 2 and self.train_tensors[1] is not None: self.output_type: str = type_of_target(self.train_tensors[1][0].fillna(method="pad")) @@ -947,6 +949,7 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> 'time_feature_transform': self.time_feature_transform, 'uni_variant': self.is_uni_variant, 'targets_have_missing_values': self.train_tensors[1].isnull().values.any(), + 'encoder_can_be_auto_regressive': self.encoder_can_be_auto_regressive, 'features_have_missing_values': False if self.train_tensors[0] is None else self.train_tensors[0].isnull().values.any()}) return dataset_properties diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index d38996bb8..c3079b41a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -482,7 +482,7 @@ def __init__(self, # Transformer -> Transformer self.encoder_output_type[i] = EncoderOutputForm.Sequence else: - # Deep AR + # Deep AR, MLP as decoder self.encoder_output_type[i] = EncoderOutputForm.SequenceLast if encoder_info[block_id].encoder_properties.has_hidden_states: self.encoder_has_hidden_states[i] = True @@ -552,6 +552,8 @@ def forward(self, encoder2decoder.append(encoder_i.get_last_seq_value(fx).squeeze(1)) else: encoder2decoder.append(fx.squeeze(1)) + else: + raise NotImplementedError if cache_intermediate_state: if self.encoder_has_hidden_states[i]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index 3bc98d148..a1862c147 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -39,11 +39,16 @@ def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor else: # auto regressive model does not have local layers return self.local_layers(x) + if len(encoder_output.shape) == 3: + encoder_output = encoder_output.squeeze(1) + if self.local_layers is None: x = torch.concat([encoder_output, x_future.flatten(-2)], dim=-1) return self.global_layers(x) + x = self.global_layers(encoder_output) x = self.local_layers(x) + return torch.concat([x, x_future], dim=-1) @@ -55,7 +60,6 @@ def _build_decoder(self, dataset_properties: Dict) -> Tuple[nn.Module, int]: global_layers = [] in_features = encoder_output_shape[-1] - num_decoder_output_features = in_features has_local_layer = 'units_local_layer' in self.config if not has_local_layer and not self.auto_regressive: in_features += int(np.prod(future_variable_input)) @@ -110,7 +114,7 @@ def get_hyperparameter_search_space( default_value=1), units_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="units_layer", value_range=(16, 512), - default_value=64, + default_value=32, log=True), activation: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="activation", value_range=tuple(_activations.keys()), @@ -124,8 +128,8 @@ def get_hyperparameter_search_space( value_range=(True, False), default_value=True), units_local_layer: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="units_local_layer", - value_range=(16, 128), - default_value=32, + value_range=(8, 128), + default_value=16, log=True), ) -> ConfigurationSpace: """ @@ -159,9 +163,8 @@ def get_hyperparameter_search_space( cs (ConfigurationSpace): ConfigurationSpace """ if dataset_properties is not None: - num_in_features = dataset_properties.get('input_shape', (0,)) - future_feature_shapes = dataset_properties.get('future_feature_shapes', (0,)) - if num_in_features[-1] != future_feature_shapes[-1]: + encoder_can_be_auto_regressive = dataset_properties.get('encoder_can_be_auto_regressive', False) + if not encoder_can_be_auto_regressive: # deepAR model cannot be applied auto_regressive = HyperparameterSearchSpace(hyperparameter=auto_regressive.hyperparameter, value_range=[False], diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 5f9424a6e..0eb8d8670 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -41,6 +41,7 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement('known_future_features', (Tuple,), user_defined=False, dataset_property=True), FitRequirement('feature_shapes', (Dict,), user_defined=False, dataset_property=True), FitRequirement('network_encoder', (OrderedDict,), user_defined=False, dataset_property=False), + FitRequirement('n_prediction_steps', (int,), user_defined=False, dataset_property=True), FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False), FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), 
FitRequirement('time_feature_transform', (Iterable,), user_defined=False, dataset_property=True) @@ -74,7 +75,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: if auto_regressive: self.n_prediction_heads = 1 else: - self.n_prediction_heads = output_shape[0] + self.n_prediction_heads = X['dataset_properties']['n_prediction_steps'] network_structure = X['network_structure'] variable_selection = network_structure.variable_selection @@ -90,9 +91,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: else: n_time_feature_transform = 0 - if self.block_number == network_structure.num_blocks: - self.is_last_decoder = True - if variable_selection: future_in_features = X['network_encoder']['block_1'].encoder_output_shape[-1] else: @@ -106,6 +104,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: else: future_variable_input = (self.n_prediction_heads, X['n_decoder_output_features']) + if self.block_number == network_structure.num_blocks: + self.is_last_decoder = True + # TODO consider decoder auto regressive and fill in decoder part self.decoder, self.n_decoder_output_features = self.build_decoder( @@ -168,7 +169,7 @@ def build_decoder(self, """ decoder, n_decoder_features = self._build_decoder(encoder_output_shape, future_variable_input, n_prediction_heads, dataset_properties) - return decoder, n_decoder_features + return decoder, int(n_decoder_features) @abstractmethod def _build_decoder(self, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index 6995d771b..56d9a9083 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -106,7 +106,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.input_shape = input_shape - has_hidden_states = self.encoder_properties().has_hidden_states self.encoder_output_shape = get_output_shape(self.encoder, input_shape, has_hidden_states) if self.n_encoder_output_feature() != self.encoder_output_shape[-1]: diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 55cb13fdb..707e584da 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -53,7 +53,6 @@ def __init__(self, def _required_fit_requirements(self) -> List[FitRequirement]: return [ FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), - FitRequirement('task_type', (str,), user_defined=True, dataset_property=True), FitRequirement('auto_regressive', (bool,), user_defined=False, dataset_property=False), FitRequirement('n_decoder_output_features', (int,), user_defined=False, dataset_property=False), FitRequirement('network_decoder', (Dict,), user_defined=False, dataset_property=False), diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py 
b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py index 4db9c42fa..25e7e852f 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py @@ -1,7 +1,7 @@ import copy import unittest -from unittest import mock +from ConfigSpace import Configuration import pandas as pd import numpy as np import torch @@ -13,18 +13,19 @@ from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone import ForecastingNetworkChoice from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. \ base_forecasting_encoder import BaseForecastingEncoder -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ - base_forecasting_decoder import BaseForecastingDecoder from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ MLPDecoder import ForecastingMLPDecoder from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( - DecoderBlockInfo, DecoderProperties + DecoderBlockInfo ) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( - EncoderProperties, EncoderBlockInfo, EncoderNetwork + EncoderBlockInfo, EncoderNetwork ) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( + ALL_DISTRIBUTIONS, DisForecastingStrategy +) class DummyEmbedding(torch.nn.Module): @@ -33,11 +34,12 @@ def forward(self, x): return x[..., :-10] return x + class DummyEncoderNetwork(EncoderNetwork): def forward(self, x, output_seq=False): if output_seq: - return torch.ones(x.shape[:-1]) - return torch.ones((*x.shape[:-1], 10)) + return torch.ones((*x.shape[:-1], 10)) + return torch.ones((*x.shape[:-2], 1, 10)) class DummyForecastingEncoder(BaseForecastingEncoder): @@ -64,31 +66,50 @@ def setUp(self) -> None: time_feature_transform = [1, 2] feature_shapes = {'f1': 10, 'f2': 10, 'f3': 10, 'f4': 10, 'f5': 10} - known_future_features = ('f1', 'f2', 'f5') + known_future_features = ('f1', 'f2', 'f3', 'f4', 'f5') self.encoder = DummyForecastingEncoder() - with mock.patch('autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.' 
- 'forecasting_decoder.base_forecasting_decoder.BaseForecastingDecoder') as MockDecoder: - mockdecoder = MockDecoder.return_value - mockdecoder._build_decoder.return_values = (None, 10) - self.decoder = mockdecoder - self.dataset_properties = dict(input_shape=input_shape, output_shape=output_shape, transform_time_features=True, time_feature_transform=time_feature_transform, feature_shapes=feature_shapes, known_future_features=known_future_features, + n_prediction_steps=3, + encoder_can_be_auto_regressive=True ) + mlp_cs = ForecastingMLPDecoder.get_hyperparameter_search_space(self.dataset_properties, + can_be_auto_regressive=True) + mlp_cfg_non_ar_w_local = mlp_cs.get_default_configuration() + mlp_cfg_non_ar_wo_local = copy.copy(mlp_cfg_non_ar_w_local.get_dictionary()) + + mlp_cfg_non_ar_wo_local['has_local_layer'] = False + mlp_cfg_non_ar_wo_local.pop('units_local_layer') + + mlp_cfg_ar = copy.copy(mlp_cfg_non_ar_wo_local) + mlp_cfg_ar.pop('has_local_layer') + mlp_cfg_ar['auto_regressive'] = True + + mlp_cfg_non_ar_wo_local = Configuration(mlp_cs, values=mlp_cfg_non_ar_wo_local) + mlp_cfg_ar = Configuration(mlp_cs, values=mlp_cfg_ar) + + self.decoder_ar = ForecastingMLPDecoder(**mlp_cfg_ar) + self.decoder_w_local = ForecastingMLPDecoder(**mlp_cfg_non_ar_w_local) + self.decoder_wo_local = ForecastingMLPDecoder(**mlp_cfg_non_ar_wo_local) + self.fit_dictionary = dict(X_train=pd.DataFrame(np.random.randn(*input_shape)), y_train=pd.DataFrame(np.random.randn(*output_shape)), network_embedding=embedding, preprocess_transforms=transformation, - window_size=3 + window_size=5 ) + self.decoders = {"non_ar_w_local": self.decoder_w_local, + "non_ar_wo_local": self.decoder_wo_local, + "ar": self.decoder_ar} + def test_encoder_choices(self): dataset_properties = {'task_type': TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING]} encoder_choices = ForecastingNetworkChoice(dataset_properties) @@ -107,7 +128,8 @@ def test_encoder_choices(self): include=['seq_encoder:RNNEncoder']) self.assertListEqual(list(cs_only_rnn.get_hyperparameter('__choice__').choices), ['seq_encoder']) - self.assertListEqual(list(cs_only_rnn.get_hyperparameter('seq_encoder:block_1:__choice__').choices), ['RNNEncoder']) + self.assertListEqual(list(cs_only_rnn.get_hyperparameter('seq_encoder:block_1:__choice__').choices), + ['RNNEncoder']) cs_no_rnn = encoder_choices.get_hyperparameter_search_space(dataset_properties, exclude=['seq_encoder:RNNEncoder']) @@ -142,7 +164,7 @@ def test_base_encoder(self): fit_dictionary = encoder_block_1.transform(fit_dictionary) network_encoder = fit_dictionary['network_encoder'] self.assertIsInstance(network_encoder['block_1'], EncoderBlockInfo) - self.assertEqual(network_encoder['block_1'].encoder_output_shape, (window_size, 10)) + self.assertEqual(network_encoder['block_1'].encoder_output_shape, (1, 10)) if variable_selection: self.assertEqual(network_encoder['block_1'].encoder_input_shape, (window_size, 10)) @@ -167,8 +189,155 @@ def test_base_encoder(self): network_encoder = fit_dictionary['network_encoder'] self.assertIsInstance(network_encoder['block_2'], EncoderBlockInfo) - self.assertEqual(network_encoder['block_2'].encoder_output_shape, (window_size, 10)) - self.assertEqual(network_encoder['block_2'].encoder_input_shape, (window_size, - 10)) + self.assertEqual(network_encoder['block_2'].encoder_output_shape, (1, 10)) + self.assertEqual(network_encoder['block_2'].encoder_input_shape, (1, 10)) + + def test_base_decoder(self): + n_prediction_steps = self.dataset_properties['n_prediction_steps'] + for 
variable_selection in (True, False): + network_structure = NetworkStructure(variable_selection=variable_selection, num_blocks=2) + dataset_properties = copy.copy(self.dataset_properties) + fit_dictionary = copy.copy(self.fit_dictionary) + + dataset_properties['is_small_preprocess'] = False + dataset_properties['uni_variant'] = False + + fit_dictionary['dataset_properties'] = self.dataset_properties + fit_dictionary['network_structure'] = network_structure + fit_dictionary['transform_time_features'] = True + fit_dictionary['dataset_properties'] = dataset_properties + encoder_block_1 = copy.deepcopy(self.encoder) + encoder_block_2 = copy.deepcopy(self.encoder) + encoder_block_2.block_number = 2 + + encoder_block_1 = encoder_block_1.fit(fit_dictionary) + fit_dictionary = encoder_block_1.transform(fit_dictionary) + encoder_block_2 = encoder_block_2.fit(fit_dictionary) + fit_dictionary = encoder_block_2.transform(fit_dictionary) + + decoder1 = copy.deepcopy(self.decoder_w_local) + decoder1 = decoder1.fit(fit_dictionary) + self.assertEqual(decoder1.n_prediction_heads, n_prediction_steps) + fit_dictionary = decoder1.transform(fit_dictionary) + + network_decoder = fit_dictionary['network_decoder'] + self.assertIsInstance(network_decoder['block_1'], DecoderBlockInfo) + if variable_selection: + self.assertEqual(network_decoder['block_1'].decoder_input_shape, + (n_prediction_steps, 10)) # Pure variable selection + self.assertEqual(network_decoder['block_1'].decoder_output_shape, + (n_prediction_steps, 26)) # 10 (input features) + 16 (n_output_dims) + else: + self.assertEqual(network_decoder['block_1'].decoder_input_shape, + (n_prediction_steps, 52)) # 50 (input features) + 2 (time_transforms) + self.assertEqual(network_decoder['block_1'].decoder_output_shape, + (n_prediction_steps, 68)) # 52 (input features) + 16 (n_out_dims) + + for name, decoder in self.decoders.items(): + fit_dictionary_ = copy.deepcopy(fit_dictionary) + decoder2 = copy.deepcopy(decoder) + decoder2.block_number = 2 + decoder2 = decoder2.fit(fit_dictionary_) + fit_dictionary_ = decoder2.transform(fit_dictionary_) + self.assertTrue(decoder2.is_last_decoder) + if name == 'ar': + self.assertEqual(fit_dictionary_['n_prediction_heads'], 1) + else: + self.assertEqual(fit_dictionary_['n_prediction_heads'], n_prediction_steps) + n_prediction_heads = fit_dictionary_['n_prediction_heads'] + + network_decoder = fit_dictionary_['network_decoder']['block_2'] + self.assertIsInstance(network_decoder, DecoderBlockInfo) + if variable_selection: + self.assertEqual(network_decoder.decoder_input_shape, (n_prediction_heads, 26)) + + if name == 'non_ar_w_local': + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 42)) # 26+16 + elif name == 'non_ar_wo_local': + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 32)) # num_global + elif name == 'ar': + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 32)) # 32 + else: + self.assertEqual(network_decoder.decoder_input_shape, (n_prediction_heads, 68)) + + if name == 'non_ar_w_local': + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 84)) # 26+16 + elif name == 'non_ar_wo_local': + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 32)) # num_global + elif name == 'ar': + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 32)) # 32 + + def test_forecasting_heads(self): + variable_selection = False + n_prediction_steps = 
self.dataset_properties["n_prediction_steps"] + + network_structure = NetworkStructure(variable_selection=variable_selection, num_blocks=1) + + dataset_properties = copy.copy(self.dataset_properties) + fit_dictionary = copy.copy(self.fit_dictionary) + + dataset_properties['is_small_preprocess'] = False + dataset_properties['uni_variant'] = False + input_tensor = torch.randn([10, fit_dictionary['window_size'], 3 + fit_dictionary['X_train'].shape[-1]]) + input_tensor_future = torch.randn([10, n_prediction_steps, 2 + fit_dictionary['X_train'].shape[-1]]) + + fit_dictionary['dataset_properties'] = self.dataset_properties + fit_dictionary['network_structure'] = network_structure + fit_dictionary['transform_time_features'] = True + fit_dictionary['dataset_properties'] = dataset_properties + encoder = copy.deepcopy(self.encoder) + encoder = encoder.fit(fit_dictionary) + fit_dictionary = encoder.transform(fit_dictionary) + + quantiles = [0.5, 0.1, 0.9] + for name, decoder in self.decoders.items(): + fit_dictionary_ = copy.deepcopy(fit_dictionary) + decoder = decoder.fit(fit_dictionary_) + fit_dictionary_ = decoder.transform(fit_dictionary_) + for net_output_type in ['regression', 'distribution', 'quantile']: + + def eval_heads_output(fit_dict): + head = ForecastingHead() + head = head.fit(fit_dict) + fit_dictionary_copy = head.transform(fit_dict) + + encoder = fit_dictionary_copy['network_encoder']['block_1'].encoder + decoder = fit_dictionary_copy['network_decoder']['block_1'].decoder + + head = fit_dictionary_copy['network_head'] + output = head(decoder(input_tensor_future, encoder(input_tensor, output_seq=False))) + if name != "ar": + if net_output_type == 'regression': + self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) + elif net_output_type == 'distribution': + self.assertListEqual(list(output.sample().shape), [10, n_prediction_steps, 1]) + elif net_output_type == 'quantile': + self.assertEqual(len(output), len(quantiles)) + for output_quantile in output: + self.assertListEqual(list(output_quantile.shape), [10, n_prediction_steps, 1]) + else: + if net_output_type == 'regression': + self.assertListEqual(list(output.shape), [10, 1, 1]) + elif net_output_type == 'distribution': + self.assertListEqual(list(output.sample().shape), [10, 1, 1]) + elif net_output_type == 'quantile': + self.assertEqual(len(output), len(quantiles)) + for output_quantile in output: + self.assertListEqual(list(output_quantile.shape), [10, 1, 1]) + + fit_dictionary_copy = copy.deepcopy(fit_dictionary_) + fit_dictionary_copy['net_output_type'] = net_output_type + + if net_output_type == 'distribution': + for dist in ALL_DISTRIBUTIONS.keys(): + fit_dictionary_copy['dist_forecasting_strategy'] = DisForecastingStrategy(dist_cls=dist) + eval_heads_output(fit_dictionary_copy) + elif net_output_type == 'quantile': + fit_dictionary_copy['quantile_values'] = quantiles + eval_heads_output(fit_dictionary_copy) + else: + eval_heads_output(fit_dictionary_copy) + + From 203307539c83a6faab77b08cda4b622602e022e6 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 17 May 2022 15:43:49 +0200 Subject: [PATCH 266/347] test for flat encoder --- .../forecasting_decoder/MLPDecoder.py | 1 - .../forecasting_encoder/__init__.py | 43 +-- .../base_forecasting_encoder.py | 1 - .../flat_encoder/__init__.py | 74 ----- .../forecasting_head.py | 6 +- .../test_base_components.py | 87 +++--- .../test_flat_backbones.py | 266 +++++++++++++++++- 7 files changed, 338 insertions(+), 140 deletions(-) diff --git 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index a1862c147..fdf5c3192 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -84,7 +84,6 @@ def _build_decoder(self, local_layers=nn.Sequential(*local_layers) if local_layers is not None else None, auto_regressive=self.auto_regressive), num_decoder_output_features - @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None ) -> Dict[str, Union[str, bool]]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index f83298cec..dce479069 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -17,7 +17,7 @@ ) from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.\ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone. \ forecasting_encoder.base_forecasting_encoder import BaseForecastingEncoder from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( ForecastingNetworkStructure @@ -38,6 +38,7 @@ class AbstractForecastingEncoderChoice(autoPyTorchChoice): the choice of encoder. Thus here "choice" indicates the choice of encoder, then decoder will be determined by the encoder. 
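    As a rough, self-contained sketch of that coupling (using a hypothetical stub
    component rather than any real encoder in this package):

        class _StubEncoder:
            @staticmethod
            def allowed_decoders():
                # each encoder advertises the decoders it can be paired with
                return ('MLPDecoder',)

        available_encoders = {'StubEncoder': _StubEncoder}
        # the decoder sub-space is then attached conditionally on the encoder choice,
        # so sampling '__choice__' implicitly fixes which decoder is built
        encoder2decoder = {name: cls.allowed_decoders()
                           for name, cls in available_encoders.items()}
        assert encoder2decoder['StubEncoder'] == ('MLPDecoder',)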
""" + def __init__(self, **kwargs, @@ -71,11 +72,11 @@ def additional_components(self): return [self.get_decoder_components] def get_available_components( - self, - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - include: List[str] = None, - exclude: List[str] = None, - components: Optional[Dict[str, autoPyTorchComponent]] = None + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + include: List[str] = None, + exclude: List[str] = None, + components: Optional[Dict[str, autoPyTorchComponent]] = None ) -> Dict[str, Type[autoPyTorchComponent]]: """Filters out components based on user provided include/exclude directives, as well as the dataset properties @@ -144,11 +145,11 @@ def get_available_components( return components_dict def get_hyperparameter_search_space( - self, - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, ) -> ConfigurationSpace: """Returns the configuration space of the current chosen components @@ -198,8 +199,8 @@ def get_hyperparameter_search_space( available_encoders, choice_hyperparameter.value_range)) hp_encoder = CSH.CategoricalHyperparameter('__choice__', - choice_hyperparameter.value_range, - default_value=choice_hyperparameter.default_value) + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) else: hp_encoder = CSH.CategoricalHyperparameter( '__choice__', @@ -212,8 +213,9 @@ def get_hyperparameter_search_space( encoder2decoder = {} for encoder_name in hp_encoder.choices: updates = self._get_search_space_updates(prefix=encoder_name) - config_space = available_encoders[encoder_name].get_hyperparameter_search_space(dataset_properties, # type: ignore - **updates) + config_space = available_encoders[encoder_name].get_hyperparameter_search_space(dataset_properties, + # type: ignore + **updates) parent_hyperparameter = {'parent': hp_encoder, 'value': encoder_name} cs.add_configuration_space( encoder_name, @@ -239,7 +241,8 @@ def get_hyperparameter_search_space( if not decoder2encoder[decoder_name]: continue updates = self._get_search_space_updates(prefix=decoder_name) - config_space = available_decoders[decoder_name].get_hyperparameter_search_space(dataset_properties, # type: ignore + config_space = available_decoders[decoder_name].get_hyperparameter_search_space(dataset_properties, + # type: ignore **updates) compatible_encoders = decoder2encoder[decoder_name] encoders_with_multi_decoder = [] @@ -254,9 +257,9 @@ def get_hyperparameter_search_space( cs.add_configuration_space( decoder_name, config_space, - #parent_hyperparameter=parent_hyperparameter + # parent_hyperparameter=parent_hyperparameter ) - hps = cs.get_hyperparameters() # type: List[CSH.Hyperparameter] + hps = cs.get_hyperparameters() # type: List[CSH.Hyperparameter] conditions_to_add = [] for hp in hps: # TODO consider if this will raise any unexpected behavior @@ -360,7 +363,7 @@ def set_hyperparameters(self, def _defaults_network(self): return ['MLPEncoder'] - def fit(self, X: Dict[str, Any], y: Any) -> autoPyTorchComponent: + def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchComponent: """Handy method to check if a component is fitted Args: @@ -381,5 +384,3 @@ def 
transform(self, X: Dict) -> Dict: @property def _defaults_network(self): return ['MLPEncoder'] - - diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index 56d9a9083..bfd7aedb6 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -73,7 +73,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: X_train = transforms(X_train) input_shape = np.concatenate(X_train).shape[1:] - if X['transform_time_features']: n_time_feature_transform = len(X['dataset_properties']['time_feature_transform']) else: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py index bd67cb843..2d3026fd5 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py @@ -28,80 +28,6 @@ def add_encoder(encoder: BaseForecastingEncoder) -> None: class FlatForecastingEncoderChoice(AbstractForecastingEncoderChoice): - def get_available_components( - self, - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - include: List[str] = None, - exclude: List[str] = None, - components: Optional[Dict[str, autoPyTorchComponent]] = None - ) -> Dict[str, Type[autoPyTorchComponent]]: - """Filters out components based on user provided - include/exclude directives, as well as the dataset properties - - Args: - include (Optional[Dict[str, Any]]): what hyper-parameter configurations - to honor when creating the configuration space - exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations - to remove from the configuration space - dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics - of the dataset to guide the pipeline choices of components - components(Optional[Dict[str, autoPyTorchComponent]) - components - - Returns: - Dict[str, autoPyTorchComponent]: A filtered dict of learning - rate backbones - - """ - if dataset_properties is None: - dataset_properties = {} - - if include is not None and exclude is not None: - raise ValueError( - "The argument include and exclude cannot be used together.") - - if components is None: - available_comp = self.get_components() - else: - available_comp = components - - if include is not None: - for incl in include: - if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) - - components_dict = OrderedDict() - for name in available_comp: - if include is not None and name not in include: - continue - elif exclude is not None and name in exclude: - continue - - entry = available_comp[name] - - # Exclude itself to avoid infinite loop - if entry == NetworkBackboneChoice or hasattr(entry, 'get_components'): - continue - - task_type = str(dataset_properties['task_type']) - properties = entry.get_properties() - if 'tabular' in task_type and not bool(properties['handles_tabular']): 
- continue - elif 'image' in task_type and not bool(properties['handles_image']): - continue - elif 'time_series' in task_type and not bool(properties['handles_time_series']): - continue - - # target_type = dataset_properties['target_type'] - # Apply some automatic filtering here for - # backbones based on the dataset! - # TODO: Think if there is any case where a backbone is not recommended for a certain dataset - - components_dict[name] = entry - - return components_dict - def get_components(self) -> Dict[str, autoPyTorchComponent]: """Returns the available backbone components diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 707e584da..17584f9bf 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -58,7 +58,9 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement('network_decoder', (Dict,), user_defined=False, dataset_property=False), FitRequirement('n_prediction_heads', (int,), user_defined=False, dataset_property=False), FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), - FitRequirement('net_output_type', (str, ), user_defined=False, dataset_property=False) + FitRequirement('net_output_type', (str, ), user_defined=False, dataset_property=False), + FitRequirement('n_prediction_steps', (int,), user_defined=False, dataset_property=True) + ] def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: @@ -81,7 +83,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: # if the decoder is a stacked block, we directly build head inside the decoder if net_output_type != 'regression': raise ValueError("decoder with multi block structure only allow regression loss!") - self.output_shape = output_shape + self.output_shape = (X['dataset_properties']['n_prediction_steps'], output_shape[-1]) return self num_quantiles = 0 diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py index 25e7e852f..c9f20e86e 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py @@ -55,31 +55,48 @@ def __call__(self, x): return x[..., :(x.shape[-1] // 2)] -class TestForecastingNetworkBases(unittest.TestCase): - def setUp(self) -> None: - embedding = DummyEmbedding() - - transformation = [DummyTranformers()] +def generate_fit_dict_and_dataset_property(): + embedding = DummyEmbedding() + + transformation = [DummyTranformers()] + n_prediction_steps = 3 + input_shape = (100, 50) + output_shape = (n_prediction_steps, 1) + time_feature_transform = [1, 2] + + feature_shapes = {'f1': 10, 'f2': 10, 'f3': 10, 'f4': 10, 'f5': 10} + known_future_features = ('f1', 'f2', 'f3', 'f4', 'f5') + + dataset_properties = dict(input_shape=input_shape, + output_shape=output_shape, + transform_time_features=True, + time_feature_transform=time_feature_transform, + feature_shapes=feature_shapes, + known_future_features=known_future_features, + n_prediction_steps=n_prediction_steps, + encoder_can_be_auto_regressive=True, + is_small_preprocess=True, + 
task_type=TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING], + uni_variant=False, + ) + + fit_dictionary = dict(X_train=pd.DataFrame(np.random.randn(*input_shape)), + y_train=pd.DataFrame(np.random.randn(*output_shape)), + network_embedding=embedding, + preprocess_transforms=transformation, + transform_time_features=True, + window_size=5 + ) + + return dataset_properties, fit_dictionary - input_shape = (100, 50) - output_shape = (100, 1) - time_feature_transform = [1, 2] - feature_shapes = {'f1': 10, 'f2': 10, 'f3': 10, 'f4': 10, 'f5': 10} - known_future_features = ('f1', 'f2', 'f3', 'f4', 'f5') +class TestForecastingNetworkBases(unittest.TestCase): + def setUp(self) -> None: + self.dataset_properties, self.fit_dictionary = generate_fit_dict_and_dataset_property() self.encoder = DummyForecastingEncoder() - self.dataset_properties = dict(input_shape=input_shape, - output_shape=output_shape, - transform_time_features=True, - time_feature_transform=time_feature_transform, - feature_shapes=feature_shapes, - known_future_features=known_future_features, - n_prediction_steps=3, - encoder_can_be_auto_regressive=True - ) - mlp_cs = ForecastingMLPDecoder.get_hyperparameter_search_space(self.dataset_properties, can_be_auto_regressive=True) mlp_cfg_non_ar_w_local = mlp_cs.get_default_configuration() @@ -99,13 +116,6 @@ def setUp(self) -> None: self.decoder_w_local = ForecastingMLPDecoder(**mlp_cfg_non_ar_w_local) self.decoder_wo_local = ForecastingMLPDecoder(**mlp_cfg_non_ar_wo_local) - self.fit_dictionary = dict(X_train=pd.DataFrame(np.random.randn(*input_shape)), - y_train=pd.DataFrame(np.random.randn(*output_shape)), - network_embedding=embedding, - preprocess_transforms=transformation, - window_size=5 - ) - self.decoders = {"non_ar_w_local": self.decoder_w_local, "non_ar_wo_local": self.decoder_wo_local, "ar": self.decoder_ar} @@ -136,6 +146,11 @@ def test_encoder_choices(self): for hp_name in cs_no_rnn.get_hyperparameter_names(): self.assertFalse('RNNEncoder' in hp_name) + sample = cs.sample_configuration() + + encoder_choices = encoder_choices.set_hyperparameters(sample) + self.assertIsInstance(encoder_choices.choice.choice, BaseForecastingEncoder) + def test_base_encoder(self): window_size = self.fit_dictionary['window_size'] for uni_variant in (True, False): @@ -199,13 +214,9 @@ def test_base_decoder(self): dataset_properties = copy.copy(self.dataset_properties) fit_dictionary = copy.copy(self.fit_dictionary) - dataset_properties['is_small_preprocess'] = False - dataset_properties['uni_variant'] = False - - fit_dictionary['dataset_properties'] = self.dataset_properties fit_dictionary['network_structure'] = network_structure - fit_dictionary['transform_time_features'] = True fit_dictionary['dataset_properties'] = dataset_properties + encoder_block_1 = copy.deepcopy(self.encoder) encoder_block_2 = copy.deepcopy(self.encoder) encoder_block_2.block_number = 2 @@ -276,11 +287,12 @@ def test_forecasting_heads(self): dataset_properties = copy.copy(self.dataset_properties) fit_dictionary = copy.copy(self.fit_dictionary) - dataset_properties['is_small_preprocess'] = False - dataset_properties['uni_variant'] = False - input_tensor = torch.randn([10, fit_dictionary['window_size'], 3 + fit_dictionary['X_train'].shape[-1]]) + input_tensor = torch.randn([10, 20, 3 + fit_dictionary['X_train'].shape[-1]]) input_tensor_future = torch.randn([10, n_prediction_steps, 2 + fit_dictionary['X_train'].shape[-1]]) + network_embedding = self.fit_dictionary['network_embedding'] + input_tensor = network_embedding(input_tensor) 
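    The tensors built above exercise the [batch, sequence, features] layout these
    forecasting networks expect. A minimal, standalone sketch of the shape arithmetic
    behind the assertions that follow (illustrative values only, mirroring
    input_shape=(100, 50) and n_prediction_steps=3 from the test fixtures):

        import torch

        batch, window, n_features, n_prediction_steps = 10, 20, 50, 3
        # past inputs carry 3 extra columns, future inputs 2, as in the test above
        x_past = torch.randn(batch, window, 3 + n_features)                 # [10, 20, 53]
        x_future = torch.randn(batch, n_prediction_steps, 2 + n_features)   # [10, 3, 52]
        # DummyEncoderNetwork with output_seq=False collapses the time axis to length 1
        encoder_out = torch.ones(batch, 1, 10)                               # [10, 1, 10]
        # a non-auto-regressive head is expected to emit one value per prediction step
        expected_output_shape = (batch, n_prediction_steps, 1)               # [10, 3, 1]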
+ fit_dictionary['dataset_properties'] = self.dataset_properties fit_dictionary['network_structure'] = network_structure fit_dictionary['transform_time_features'] = True @@ -294,6 +306,7 @@ def test_forecasting_heads(self): fit_dictionary_ = copy.deepcopy(fit_dictionary) decoder = decoder.fit(fit_dictionary_) fit_dictionary_ = decoder.transform(fit_dictionary_) + for net_output_type in ['regression', 'distribution', 'quantile']: def eval_heads_output(fit_dict): @@ -337,7 +350,3 @@ def eval_heads_output(fit_dict): eval_heads_output(fit_dictionary_copy) else: eval_heads_output(fit_dictionary_copy) - - - - diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py index 208776158..0ae24891a 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py @@ -1,15 +1,277 @@ +import copy import unittest +from test.test_pipeline.components.setup.forecasting.forecasting_networks.test_base_components import ( + generate_fit_dict_and_dataset_property +) +from ConfigSpace import Configuration +import torch +from autoPyTorch.constants import ( + TASK_TYPES_TO_STRING, + TIMESERIES_FORECASTING, +) +from sklearn.pipeline import Pipeline from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder \ import FlatForecastingEncoderChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder.\ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure +from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( + DecoderBlockInfo +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( + EncoderBlockInfo, EncoderNetwork +) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder. \ MLPEncoder import MLPEncoder -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder.\ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder. 
\ NBEATSEncoder import NBEATSEncoder from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.MLPDecoder import ( ForecastingMLPDecoder ) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.NBEATSDecoder \ import NBEATSDecoder +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head import ForecastingHead + + +class TestFlatEncoder(unittest.TestCase): + def setUp(self) -> None: + self.dataset_properties, self.fit_dictionary = generate_fit_dict_and_dataset_property() + self.fit_dictionary['net_output_type'] = 'regression' + self.fit_dictionary['network_embedding'] = _NoEmbedding() + + def test_flat_encoder_choice(self): + encoder_choices = FlatForecastingEncoderChoice(dataset_properties=self.dataset_properties) + cs_flat = encoder_choices.get_hyperparameter_search_space(self.dataset_properties) + available_encoder = cs_flat.get_hyperparameter("__choice__") + + self.assertTrue('MLPEncoder' in available_encoder.choices) + self.assertTrue('NBEATSEncoder' in available_encoder.choices) + + sample = cs_flat.sample_configuration() + encoder_choices.set_hyperparameters(sample) + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = self.dataset_properties + self.assertIsInstance(encoder_choices.pipeline, Pipeline) + encoder_choices = encoder_choices.fit(fit_dict) + fit_dict = encoder_choices.transform(fit_dict) + + self.assertTrue('network_structure' in fit_dict) + network_structure = fit_dict['network_structure'] + self.assertIsInstance(network_structure, NetworkStructure) + self.assertTrue(NetworkStructure.num_blocks, 1) + + self.assertTrue('network_encoder' in fit_dict) + self.assertEqual(len(fit_dict['network_encoder']), 1) + + self.assertTrue('network_decoder' in fit_dict) + self.assertEqual(len(fit_dict['network_decoder']), 1) + + def test_mlp_network(self): + n_prediction_steps = self.dataset_properties['n_prediction_steps'] + network_structure = NetworkStructure() + + encoder_cfg = MLPEncoder().get_hyperparameter_search_space().get_default_configuration() + encoder = MLPEncoder(**encoder_cfg) + + mlp_cs = ForecastingMLPDecoder.get_hyperparameter_search_space(self.dataset_properties, + can_be_auto_regressive=True) + mlp_cfg_non_ar_w_local = mlp_cs.get_default_configuration() + mlp_cfg_non_ar_wo_local = copy.copy(mlp_cfg_non_ar_w_local.get_dictionary()) + + mlp_cfg_non_ar_wo_local['has_local_layer'] = False + mlp_cfg_non_ar_wo_local.pop('units_local_layer') + + mlp_cfg_non_ar_wo_local = Configuration(mlp_cs, values=mlp_cfg_non_ar_wo_local) + + decoder_w_local = ForecastingMLPDecoder(**mlp_cfg_non_ar_w_local) + decoder_wo_local = ForecastingMLPDecoder(**mlp_cfg_non_ar_wo_local) + + decoders = {"non_ar_w_local": decoder_w_local, + "non_ar_wo_local": decoder_wo_local} + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = self.dataset_properties + fit_dict['network_structure'] = network_structure + + encoder = encoder.fit(fit_dict) + fit_dict = encoder.transform(fit_dict) + + network_embedding = self.fit_dictionary['network_embedding'] + + for name, decoder in decoders.items(): + fit_dict_ = copy.copy(fit_dict) + + decoder = decoder.fit(fit_dict_) + fit_dict_ = decoder.transform(fit_dict_) + + input_tensor = torch.randn([10, 20, 3 + fit_dict_['X_train'].shape[-1]]) + input_tensor_future = torch.randn([10, n_prediction_steps, 2 + fit_dict_['X_train'].shape[-1]]) + + head = ForecastingHead() + head = head.fit(fit_dict_) + fit_dict_ 
= head.transform(fit_dict_) + + encoder = fit_dict_['network_encoder']['block_1'].encoder + decoder = fit_dict_['network_decoder']['block_1'].decoder + + head = fit_dict_['network_head'] + output = head(decoder(input_tensor_future, encoder(input_tensor, output_seq=False))) + + self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) + + def test_mlp_network(self): + n_prediction_steps = self.dataset_properties['n_prediction_steps'] + network_structure = NetworkStructure() + + encoder_cfg = MLPEncoder().get_hyperparameter_search_space().get_default_configuration() + encoder = MLPEncoder(**encoder_cfg) + + mlp_cs = ForecastingMLPDecoder.get_hyperparameter_search_space(self.dataset_properties, + can_be_auto_regressive=False) + mlp_cfg_non_ar_w_local = mlp_cs.get_default_configuration() + mlp_cfg_non_ar_wo_local = copy.copy(mlp_cfg_non_ar_w_local.get_dictionary()) + + mlp_cfg_non_ar_wo_local['has_local_layer'] = False + mlp_cfg_non_ar_wo_local.pop('units_local_layer') + + mlp_cfg_non_ar_wo_local = Configuration(mlp_cs, values=mlp_cfg_non_ar_wo_local) + + decoder_w_local = ForecastingMLPDecoder(**mlp_cfg_non_ar_w_local) + decoder_wo_local = ForecastingMLPDecoder(**mlp_cfg_non_ar_wo_local) + + decoders = {"non_ar_w_local": decoder_w_local, + "non_ar_wo_local": decoder_wo_local} + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = self.dataset_properties + fit_dict['network_structure'] = network_structure + + encoder = encoder.fit(fit_dict) + fit_dict = encoder.transform(fit_dict) + + network_embedding = self.fit_dictionary['network_embedding'] + + for name, decoder in decoders.items(): + fit_dict_ = copy.copy(fit_dict) + + decoder = decoder.fit(fit_dict_) + fit_dict_ = decoder.transform(fit_dict_) + + input_tensor = torch.randn([10, 20, 3 + fit_dict_['X_train'].shape[-1]]) + input_tensor_future = torch.randn([10, n_prediction_steps, 2 + fit_dict_['X_train'].shape[-1]]) + + head = ForecastingHead() + head = head.fit(fit_dict_) + fit_dict_ = head.transform(fit_dict_) + + encoder = fit_dict_['network_encoder']['block_1'].encoder + decoder = fit_dict_['network_decoder']['block_1'].decoder + + head = fit_dict_['network_head'] + output = head(decoder(input_tensor_future, encoder(network_embedding(input_tensor), output_seq=False))) + + self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) + + def test_nbeats_network(self): + n_prediction_steps = self.dataset_properties['n_prediction_steps'] + window_size = self.fit_dictionary['window_size'] + network_structure = NetworkStructure() + + encoder_cfg = NBEATSEncoder().get_hyperparameter_search_space().get_default_configuration() + encoder = NBEATSEncoder(**encoder_cfg) + + nbeats_cs = NBEATSDecoder.get_hyperparameter_search_space(self.dataset_properties) + + nbeatsI_cfg = { + "backcast_loss_ratio": 0.0, + "normalization": "LN", + "activation": "relu", + + "n_beats_type": "I", + + "use_dropout_i": True, + "num_stacks_i": 2, + + "num_blocks_i_1": 2, + "num_layers_i_1": 2, + "width_i_1": 16, + "weight_sharing_i_1": True, + "stack_type_i_1": 'trend', + "expansion_coefficient_length_i_trend_1": 3, + "dropout_i_1": 0.1, + + "num_blocks_i_2": 3, + "num_layers_i_2": 2, + "width_i_2": 16, + "weight_sharing_i_2": False, + "stack_type_i_2": 'seasonality', + "expansion_coefficient_length_i_seasonality_2": 7, + "dropout_i_2": 0.1, + } + + nbeatsG_cfg = { + "backcast_loss_ratio": 0.0, + "normalization": "NoNorm", + "activation": "relu", + + "n_beats_type": "G", + + "use_dropout_g": True, + "num_stacks_g": 2, + + 
"num_blocks_g": 1, + "num_layers_g": 4, + "width_g": 512, + "weight_sharing_g": False, + "expansion_coefficient_length_g": 32, + "dropout_g": 0.1, + } + + nbeatsI_cfg = Configuration(nbeats_cs, values=nbeatsI_cfg) + nbeatsG_cfg = Configuration(nbeats_cs, values=nbeatsG_cfg) + + nbeats_i = NBEATSDecoder(**nbeatsI_cfg) + nbeats_g = NBEATSDecoder(**nbeatsG_cfg) + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = self.dataset_properties + fit_dict['network_structure'] = network_structure + + encoder = encoder.fit(fit_dict) + fit_dict = encoder.transform(fit_dict) + + for decoder_idx, decoder in enumerate([nbeats_i, nbeats_g]): + fit_dict = copy.copy(fit_dict) + fit_dict_ = copy.copy(fit_dict) + + decoder = decoder.fit(fit_dict_) + fit_dict_ = decoder.transform(fit_dict_) + + input_tensor = torch.randn([10, 20, 1]) + + head = ForecastingHead() + head = head.fit(fit_dict_) + fit_dict_ = head.transform(fit_dict_) + + encoder_net = fit_dict_['network_encoder']['block_1'].encoder + decoder_net = fit_dict_['network_decoder']['block_1'].decoder + idx_tracker = 0 + if decoder_idx == 0: + # only check nbeats_i + for i_stack in range(1, 1 + nbeatsI_cfg['num_stacks_i']): + num_blocks = nbeatsI_cfg[f'num_blocks_i_{i_stack}'] + idx_end = idx_tracker + num_blocks + num_individual_models = len(set(decoder_net[idx_tracker:idx_end])) + if nbeatsI_cfg[f'weight_sharing_i_{i_stack}']: + self.assertEqual(num_individual_models, 1) + else: + self.assertEqual(num_individual_models, num_blocks) + idx_tracker = idx_end + input_tensor = encoder_net(input_tensor, output_seq=False) + for block in decoder_net: + backcast_block, forecast_block = block([None], input_tensor) + self.assertListEqual(list(backcast_block.shape), [10, window_size * 1]) + self.assertListEqual(list(forecast_block.shape), [10, n_prediction_steps * 1]) From c6e2239a796e3ffabd9e8cfa4b808d87d3f49855 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 17 May 2022 23:24:20 +0200 Subject: [PATCH 267/347] test for seq encoder --- .../forecasting_backbone/cells.py | 2 +- .../forecasting_backbone/components_util.py | 26 ++- .../forecasting_decoder/MLPDecoder.py | 33 +-- .../forecasting_decoder/__init__.py | 141 ------------- .../forecasting_encoder/__init__.py | 4 +- .../seq_encoder/TransformerEncoder.py | 4 +- .../seq_encoder/__init__.py | 83 ++++---- .../forecasting_network_head/__init__.py | 190 ------------------ .../forecasting_network_head/distribution.py | 3 +- .../test_flat_backbones.py | 82 ++------ .../forecasting_networks/test_seq_encoder.py | 126 ++++++++++++ 11 files changed, 225 insertions(+), 469 deletions(-) create mode 100644 test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index c3079b41a..b2d017aef 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -648,6 +648,6 @@ def forward(self, if cache_intermediate_state: if self.decoder_has_hidden_states[i]: self.cached_intermediate_state[i] = hx - #TODO consider if there are other case that could make use of cached intermediate states + # TODO consider if there are other case that could make use of cached intermediate states x = fx return x diff --git 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py index b58cfa87a..fb88119c1 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py @@ -30,15 +30,23 @@ def __init__(self, grn_dropout_rate: float = 0.0, ) -> None: super().__init__() - self.network_structure = NetworkStructure(num_blocks=num_blocks, - variable_selection=variable_selection, - share_single_variable_networks=share_single_variable_networks, - use_temporal_fusion=use_temporal_fusion, - skip_connection=skip_connection, - skip_connection_type=skip_connection_type, - grn_dropout_rate=grn_dropout_rate) - - def fit(self, X: Dict[str, Any], y: Any = None) -> "ForecastingNetworkStructure": + self.num_blocks = num_blocks + self.variable_selection = variable_selection + self.share_single_variable_networks = share_single_variable_networks + self.use_temporal_fusion = use_temporal_fusion + self.skip_connection = skip_connection + self.skip_connection_type = skip_connection_type + self.grn_dropout_rate = grn_dropout_rate + self.network_structure = None + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + self.network_structure = NetworkStructure(num_blocks=self.num_blocks, + variable_selection=self.variable_selection, + share_single_variable_networks=self.share_single_variable_networks, + use_temporal_fusion=self.use_temporal_fusion, + skip_connection=self.skip_connection, + skip_connection_type=self.skip_connection_type, + grn_dropout_rate=self.grn_dropout_rate) return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index fdf5c3192..96dff0b72 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -30,7 +30,8 @@ def __init__(self, self.local_layers = local_layers self.auto_regressive = auto_regressive - def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor, pos_idx: Optional[Tuple[int]] = None): + def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor, + pos_idx: Optional[Tuple[int]] = None): if x_future is None or self.auto_regressive: # for auto-regressive model, x_future is fed to the encoders x = self.global_layers(encoder_output) @@ -39,6 +40,7 @@ def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor else: # auto regressive model does not have local layers return self.local_layers(x) + if len(encoder_output.shape) == 3: encoder_output = encoder_output.squeeze(1) @@ -147,13 +149,15 @@ def get_hyperparameter_search_space( Args: dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): Dataset Properties - can_be_auto_regressive: bool: if this decoder is allowed to be auto-regressive + can_be_auto_regressive (bool): if this decoder is allowed to be auto-regressive + is_top_layer (bool) if this mlp decoder is at the top layer as seq decoders. Only top layer MLP allows + deactivating local layers. 
(Otherwise the decoder cannot output a sequence) num_layers (HyperparameterSearchSpace): number of decoder layers (the last layer is not included, thus it - could start from 0) + could start from 0) units_layer (HyperparameterSearchSpace): number of units of each layer (except for the last layer) activation (HyperparameterSearchSpace): activation function auto_regressive (bool): if the model acts as a DeepAR model, the corresponding hyperparaemter is - controlled by seq_encoder + controlled by seq_encoder has_local_layer (HyperparameterSearchSpace): if local MLP layer is applied, if not, the output of the network will be directly attached with different heads units_local_layer (HyperparameterSearchSpace): number of units of local layer. The size of this layer is @@ -167,8 +171,7 @@ def get_hyperparameter_search_space( # deepAR model cannot be applied auto_regressive = HyperparameterSearchSpace(hyperparameter=auto_regressive.hyperparameter, value_range=[False], - default_value=False,) - + default_value=False, ) cs = ConfigurationSpace() min_num_layers: int = num_layers.value_range[0] # type: ignore @@ -210,14 +213,20 @@ def get_hyperparameter_search_space( cond_units_local_layer = EqualsCondition(units_local_layer, has_local_layer, True) - cs.add_hyperparameters([has_local_layer, units_local_layer]) - cs.add_conditions([cond_units_local_layer]) - if can_be_auto_regressive: auto_regressive = get_hyperparameter(auto_regressive, CategoricalHyperparameter) - - cond_use_local_layer = EqualsCondition(has_local_layer, auto_regressive, False) cs.add_hyperparameters([auto_regressive]) - cs.add_conditions([cond_use_local_layer]) + if False in auto_regressive.choices: + cs.add_hyperparameters([has_local_layer, units_local_layer]) + cs.add_conditions([cond_units_local_layer]) + + cond_use_local_layer = EqualsCondition(has_local_layer, auto_regressive, False) + cs.add_conditions([cond_use_local_layer]) + return cs + else: + return cs + + cs.add_hyperparameters([has_local_layer, units_local_layer]) + cs.add_conditions([cond_units_local_layer]) return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py index 032a60401..ba780efa0 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py @@ -50,145 +50,4 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: return components - def get_available_components( - self, - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - include: List[str] = None, - exclude: List[str] = None, - ) -> Dict[str, autoPyTorchComponent]: - """Filters out components based on user provided - include/exclude directives, as well as the dataset properties - Args: - include (Optional[Dict[str, Any]]): what hyper-parameter configurations - to honor when creating the configuration space - exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations - to remove from the configuration space - dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): Caracteristics - of the dataset to guide the pipeline choices of components - - Returns: - Dict[str, autoPyTorchComponent]: A filtered dict of learning - rate heads - - """ - if dataset_properties is None: - dataset_properties = {} - - if 
include is not None and exclude is not None: - raise ValueError( - "The argument include and exclude cannot be used together.") - - available_comp = self.get_components() - - if include is not None: - for incl in include: - if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) - - components_dict = OrderedDict() - for name in available_comp: - if include is not None and name not in include: - continue - elif exclude is not None and name in exclude: - continue - - entry = available_comp[name] - - # Exclude itself to avoid infinite loop - if entry == ForecastingDecoderChoice or hasattr(entry, 'get_components'): - continue - - task_type = str(dataset_properties['task_type']) - properties = entry.get_properties() - if 'tabular' in task_type and not bool(properties['handles_tabular']): - continue - elif 'image' in task_type and not bool(properties['handles_image']): - continue - elif 'time_series' in task_type and not bool(properties['handles_time_series']): - continue - - # target_type = dataset_properties['target_type'] - # Apply some automatic filtering here for - # heads based on the dataset! - # TODO: Think if there is any case where a head - # is not recommended for a certain dataset - - components_dict[name] = entry - return components_dict - - def get_hyperparameter_search_space( - self, - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, - ) -> ConfigurationSpace: - """Returns the configuration space of the current chosen components - - Args: - dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): Describes the dataset to work on - default (Optional[str]): Default head to use - include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive - list, and will exclusively use this components. 
- exclude: Optional[Dict[str, Any]]: which components to skip - - Returns: - ConfigurationSpace: the configuration space of the hyper-parameters of the - chosen component - """ - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties = {} - - # Compile a list of legal preprocessors for this problem - available_heads = self.get_available_components( - dataset_properties=dataset_properties, - include=include, exclude=exclude) - - if len(available_heads) == 0: - raise ValueError("No head found") - - if default is None: - defaults = [ - 'MLPDecoder', - 'RNNDecoder', - ] - for default_ in defaults: - if default_ in available_heads: - default = default_ - break - - updates = self._get_search_space_updates() - if '__choice__' in updates.keys(): - choice_hyperparameter = updates['__choice__'] - if not set(choice_hyperparameter.value_range).issubset(available_heads): - raise ValueError("Expected given update for {} to have " - "choices in {} got {}".format(self.__class__.__name__, - available_heads, - choice_hyperparameter.value_range)) - decoder = CSH.CategoricalHyperparameter('__choice__', - choice_hyperparameter.value_range, - default_value=choice_hyperparameter.default_value) - else: - decoder = CSH.CategoricalHyperparameter( - '__choice__', - list(available_heads.keys()), - default_value=default) - cs.add_hyperparameter(decoder) - for name in decoder.choices: - updates = self._get_search_space_updates(prefix=name) - config_space = available_heads[name].get_hyperparameter_search_space(dataset_properties, # type: ignore - **updates) - parent_hyperparameter = {'parent': decoder, 'value': name} - cs.add_configuration_space( - name, - config_space, - parent_hyperparameter=parent_hyperparameter - ) - - self.configuration_space_ = cs - self.dataset_properties_ = dataset_properties - return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index dce479069..3dc7925cd 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -1,4 +1,5 @@ import os +import warnings from collections import OrderedDict from typing import Dict, Optional, List, Any, Type from abc import abstractmethod @@ -109,8 +110,7 @@ def get_available_components( if include is not None: for incl in include: if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) + warnings.warn("Trying to include unknown component: ""%s" % incl) components_dict = OrderedDict() for name in available_comp: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index b11a6c935..fa28c70c2 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -187,7 +187,7 @@ def get_hyperparameter_search_space( decoder_type: HyperparameterSearchSpace = 
HyperparameterSearchSpace(hyperparameter='decoder_type', value_range=('MLPDecoder', 'TransformerDecoder'), - default_value='TransformerDecoder') + default_value='MLPDecoder') ) -> ConfigurationSpace: """ get hyperparameter search space for Transformer, Given that d_model must be a multiple of n_head_log, we @@ -200,8 +200,6 @@ def get_hyperparameter_search_space( add_hyperparameter(cs, d_model_log, UniformIntegerHyperparameter) add_hyperparameter(cs, norm_first, CategoricalHyperparameter) - min_transformer_layers, max_transformer_layers = num_layers.value_range - num_layers = get_hyperparameter(num_layers, UniformIntegerHyperparameter) use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index d119ae788..f53533248 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -183,7 +183,6 @@ def get_hyperparameter_search_space( skip_connection] cond_skip_connections = [] - if True in skip_connection.choices: skip_connection_type = get_hyperparameter(skip_connection_type, CategoricalHyperparameter) hp_network_structures.append(skip_connection_type) @@ -196,7 +195,8 @@ def get_hyperparameter_search_space( EqualsCondition(grn_use_dropout, skip_connection_type, "gate_add_norm") ) else: - cond_skip_connections.append(EqualsCondition(grn_use_dropout, skip_connection_type, "gate_add_norm")) + cond_skip_connections.append( + EqualsCondition(grn_use_dropout, skip_connection_type, "gate_add_norm")) if True in grn_use_dropout.choices: grn_dropout_rate = get_hyperparameter(grn_dropout_rate, UniformFloatHyperparameter) hp_network_structures.append(grn_dropout_rate) @@ -216,8 +216,6 @@ def get_hyperparameter_search_space( cond_vs_dropoutrate = EqualsCondition(variable_selection_dropout_rate, variable_selection_use_dropout, True) cs.add_conditions([cond_vs_dropout, cond_vs_dropoutrate]) - - if static_features_shape + future_feature_shapes[-1] == 0: if False in variable_selection.choices and False in decoder_auto_regressive.choices: if variable_selection.num_choices == 1 and decoder_auto_regressive.num_choices == 1: @@ -238,7 +236,7 @@ def get_hyperparameter_search_space( available_decoders = self.get_available_components( dataset_properties=dataset_properties, - include=None, exclude=None, + include=None, exclude=exclude, components=self.get_decoder_components()) if len(available_encoders) == 0: @@ -291,15 +289,7 @@ def get_hyperparameter_search_space( for encoder_name in hp_encoder.choices: updates = self._get_search_space_updates(prefix=block_prefix + encoder_name) config_space = available_encoders[encoder_name].get_hyperparameter_search_space(dataset_properties, - # type: ignore **updates) - parent_hyperparameter = {'parent': hp_encoder, 'value': encoder_name} - cs.add_configuration_space( - block_prefix + encoder_name, - config_space, - parent_hyperparameter=parent_hyperparameter - ) - allowed_decoders = available_encoders[encoder_name].allowed_decoders() if len(allowed_decoders) > 1: if 'decoder_type' not in config_space: @@ -311,9 +301,27 @@ def get_hyperparameter_search_space( raise ValueError( 'The encoder hyperparameter 
decoder_type must be a subset of the allowed_decoders') allowed_decoders = hp_decoder_choice + valid_decoders = [] for decoder_name in allowed_decoders: - decoder2encoder[decoder_name].append(encoder_name) + if decoder_name in decoder2encoder: + valid_decoders.append(decoder_name) + decoder2encoder[decoder_name].append(encoder_name) encoder2decoder[encoder_name] = allowed_decoders + if len(allowed_decoders) > 1: + + if len(valid_decoders) < len(config_space.get_hyperparameter('decoder_type').choices): + updates['decoder_type'] = HyperparameterSearchSpace(hyperparameter='decoder_type', + value_range=tuple(valid_decoders), + default_value=valid_decoders[0]) + config_space = available_encoders[encoder_name].get_hyperparameter_search_space( + dataset_properties, + **updates) + parent_hyperparameter = {'parent': hp_encoder, 'value': encoder_name} + cs.add_configuration_space( + block_prefix + encoder_name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) for decoder_name in available_decoders.keys(): if not decoder2encoder[decoder_name]: @@ -322,10 +330,7 @@ def get_hyperparameter_search_space( if i == 1 and decoder_name == self.deepAR_decoder_name: # TODO this is only a temporary solution, a fix on ConfigSpace needs to be implemented updates['can_be_auto_regressive'] = True - if decoder_name == "MLPDecoder" and i < int(max_num_blocks): - updates['has_local_layer'] = HyperparameterSearchSpace('has_local_layer', - value_range=(True,), - default_value=True) + config_space = available_decoders[decoder_name].get_hyperparameter_search_space(dataset_properties, # type: ignore **updates) @@ -412,14 +417,13 @@ def get_hyperparameter_search_space( forbidden_deep_ar = ForbiddenEqualsClause(deep_ar_hp, True) if min_num_blocks == 1: if max_num_blocks > 1: - if max_num_blocks - min_num_blocks > 1: - forbidden = ForbiddenAndConjunction( - ForbiddenInClause(num_blocks, list(range(1, max_num_blocks))), - forbidden_deep_ar - ) - else: - forbidden = ForbiddenAndConjunction(ForbiddenEqualsClause(num_blocks, 2), forbidden_deep_ar) + forbidden = ForbiddenAndConjunction( + ForbiddenInClause(num_blocks, list(range(2, max_num_blocks + 1))), + forbidden_deep_ar + ) cs.add_forbidden_clause(forbidden) + else: + cs.add_forbidden_clause(forbidden_deep_ar) forbidden_deep_ars = [] @@ -430,26 +434,25 @@ def get_hyperparameter_search_space( ForbiddenEqualsClause(hp_forbidden_deep_ar, True), forbidden_deep_ar )) + if True in skip_connection.choices: + forbidden_deep_ars.append(ForbiddenAndConjunction( + ForbiddenEqualsClause(skip_connection, True), + forbidden_deep_ar + )) if forbidden_deep_ars: cs.add_forbidden_clauses(forbidden_deep_ars) - if True in skip_connection.choices: - forbidden_mlp_skip = [] - forbidden_skip = ForbiddenEqualsClause(skip_connection, True) - forbidden_temporal_fusion = ForbiddenEqualsClause(use_temporal_fusion, True) - for i in range(1, max_num_blocks + 1): - hp_mlp_has_local_layer = f"block_{i}:MLPDecoder:has_local_layer" - if hp_mlp_has_local_layer in cs: - hp_mlp_has_local_layer = cs.get_hyperparameter(hp_mlp_has_local_layer) - forbidden_mlp_skip.append(ForbiddenAndConjunction( - ForbiddenEqualsClause(hp_mlp_has_local_layer, False), - forbidden_skip - )) - forbidden_mlp_skip.append(ForbiddenAndConjunction( + forbidden_mlp_local_layer = [] + for i in range(1, max_num_blocks + 1): + hp_mlp_has_local_layer = f"block_{i}:MLPDecoder:has_local_layer" + if hp_mlp_has_local_layer in cs: + hp_mlp_has_local_layer = cs.get_hyperparameter(hp_mlp_has_local_layer) + if i < max_num_blocks: + 
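+                    # descriptive note on the clause built below: when further blocks follow block i
+                    # (num_blocks > i), the MLPDecoder of block i must keep its local layer, so the
+                    # combination has_local_layer=False with a larger num_blocks is forbidden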
forbidden_mlp_local_layer.append(ForbiddenAndConjunction( ForbiddenEqualsClause(hp_mlp_has_local_layer, False), - forbidden_temporal_fusion + ForbiddenInClause(num_blocks, list(range(i + 1, max_num_blocks + 1))), )) - cs.add_forbidden_clauses(forbidden_mlp_skip) + cs.add_forbidden_clauses(forbidden_mlp_local_layer) return cs diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py index 39ef0c558..e69de29bb 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/__init__.py @@ -1,190 +0,0 @@ -import os -from collections import OrderedDict -from typing import Dict, List, Optional - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.base_component import ( - autoPyTorchComponent, -) - -from autoPyTorch.pipeline.components.setup.network_head import NetworkHeadChoice -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ( - ForecastingHead -) - -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - find_components, -) - -directory = os.path.split(__file__)[0] -_heads = find_components(__package__, - directory, - ForecastingHead) - -_addons = ThirdPartyComponents(ForecastingHead) - - -class ForecastingNetworkHeadChoice(NetworkHeadChoice): - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available head components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all NetworkHeadComponents available - as choices for learning rate scheduling - """ - components = OrderedDict() - - components.update(_heads) - components.update(_addons.components) - - return components - - def get_available_components( - self, - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - include: List[str] = None, - exclude: List[str] = None, - ) -> Dict[str, autoPyTorchComponent]: - """Filters out components based on user provided - include/exclude directives, as well as the dataset properties - - Args: - include (Optional[Dict[str, Any]]): what hyper-parameter configurations - to honor when creating the configuration space - exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations - to remove from the configuration space - dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): Caracteristics - of the dataset to guide the pipeline choices of components - - Returns: - Dict[str, autoPyTorchComponent]: A filtered dict of learning - rate heads - - """ - if dataset_properties is None: - dataset_properties = {} - - if include is not None and exclude is not None: - raise ValueError( - "The argument include and exclude cannot be used together.") - - available_comp = self.get_components() - - if include is not None: - for incl in include: - if incl not in available_comp: - raise ValueError("Trying to include unknown component: " - "%s" % incl) - - components_dict = OrderedDict() - for name in available_comp: - if include is not None and name not in include: - continue - elif exclude is not None and name in exclude: - continue - - entry = available_comp[name] - - # Exclude itself to avoid infinite loop - if entry == 
NetworkHeadChoice or hasattr(entry, 'get_components'): - continue - - task_type = str(dataset_properties['task_type']) - properties = entry.get_properties() - if 'tabular' in task_type and not bool(properties['handles_tabular']): - continue - elif 'image' in task_type and not bool(properties['handles_image']): - continue - elif 'time_series' in task_type and not bool(properties['handles_time_series']): - continue - - # target_type = dataset_properties['target_type'] - # Apply some automatic filtering here for - # heads based on the dataset! - # TODO: Think if there is any case where a head - # is not recommended for a certain dataset - - components_dict[name] = entry - return components_dict - - def get_hyperparameter_search_space( - self, - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, - ) -> ConfigurationSpace: - """Returns the configuration space of the current chosen components - - Args: - dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): Describes the dataset to work on - default (Optional[str]): Default head to use - include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive - list, and will exclusively use this components. - exclude: Optional[Dict[str, Any]]: which components to skip - - Returns: - ConfigurationSpace: the configuration space of the hyper-parameters of the - chosen component - """ - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties = {} - - # Compile a list of legal preprocessors for this problem - available_heads = self.get_available_components( - dataset_properties=dataset_properties, - include=include, exclude=exclude) - - if len(available_heads) == 0: - raise ValueError("No head found") - - if default is None: - defaults = [ - 'ForecastingFullyConnectedHead', - 'ForecastingRNNHead', - ] - for default_ in defaults: - if default_ in available_heads: - default = default_ - break - - updates = self._get_search_space_updates() - if '__choice__' in updates.keys(): - choice_hyperparameter = updates['__choice__'] - if not set(choice_hyperparameter.value_range).issubset(available_heads): - raise ValueError("Expected given update for {} to have " - "choices in {} got {}".format(self.__class__.__name__, - available_heads, - choice_hyperparameter.value_range)) - head = CSH.CategoricalHyperparameter('__choice__', - choice_hyperparameter.value_range, - default_value=choice_hyperparameter.default_value) - else: - head = CSH.CategoricalHyperparameter( - '__choice__', - list(available_heads.keys()), - default_value=default) - cs.add_hyperparameter(head) - for name in head.choices: - updates = self._get_search_space_updates(prefix=name) - config_space = available_heads[name].get_hyperparameter_search_space(dataset_properties, # type: ignore - **updates) - parent_hyperparameter = {'parent': head, 'value': name} - cs.add_configuration_space( - name, - config_space, - parent_hyperparameter=parent_hyperparameter - ) - - self.configuration_space_ = cs - self.dataset_properties_ = dataset_properties - return cs diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py index 7c94e521e..8c97d0f5a 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py +++ 
b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py @@ -209,7 +209,6 @@ class DisForecastingStrategy(NamedTuple): aggregation: str = "mean" -# TODO find components that are compatible with beta, gamma and poisson distrubtion! +# TODO find components that are compatible with beta, gamma and poisson distribution! # TODO consider how to implement NegativeBinomialOutput without scale information -# class NegativeBinomialOutput(ProjectionLayer): diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py index 0ae24891a..e5ff0f3c0 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py @@ -6,21 +6,11 @@ from ConfigSpace import Configuration import torch -from autoPyTorch.constants import ( - TASK_TYPES_TO_STRING, - TIMESERIES_FORECASTING, -) from sklearn.pipeline import Pipeline from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder \ import FlatForecastingEncoderChoice from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( - DecoderBlockInfo -) -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( - EncoderBlockInfo, EncoderNetwork -) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder. \ MLPEncoder import MLPEncoder from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder. 
\ @@ -30,7 +20,11 @@ ) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.NBEATSDecoder \ import NBEATSDecoder -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head import ForecastingHead +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import ( + StackedEncoder, + StackedDecoder +) class TestFlatEncoder(unittest.TestCase): @@ -59,7 +53,7 @@ def test_flat_encoder_choice(self): self.assertTrue('network_structure' in fit_dict) network_structure = fit_dict['network_structure'] self.assertIsInstance(network_structure, NetworkStructure) - self.assertTrue(NetworkStructure.num_blocks, 1) + self.assertTrue(network_structure.num_blocks, 1) self.assertTrue('network_encoder' in fit_dict) self.assertEqual(len(fit_dict['network_encoder']), 1) @@ -97,8 +91,6 @@ def test_mlp_network(self): encoder = encoder.fit(fit_dict) fit_dict = encoder.transform(fit_dict) - network_embedding = self.fit_dictionary['network_embedding'] - for name, decoder in decoders.items(): fit_dict_ = copy.copy(fit_dict) @@ -112,64 +104,15 @@ def test_mlp_network(self): head = head.fit(fit_dict_) fit_dict_ = head.transform(fit_dict_) - encoder = fit_dict_['network_encoder']['block_1'].encoder - decoder = fit_dict_['network_decoder']['block_1'].decoder + net_encoder = StackedEncoder(network_structure, False, + fit_dict_['network_encoder'], fit_dict_['network_decoder']) + net_decoder = StackedDecoder(network_structure, net_encoder.encoder, fit_dict_['network_encoder'], + fit_dict_['network_decoder']) head = fit_dict_['network_head'] - output = head(decoder(input_tensor_future, encoder(input_tensor, output_seq=False))) - - self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) - def test_mlp_network(self): - n_prediction_steps = self.dataset_properties['n_prediction_steps'] - network_structure = NetworkStructure() - - encoder_cfg = MLPEncoder().get_hyperparameter_search_space().get_default_configuration() - encoder = MLPEncoder(**encoder_cfg) - - mlp_cs = ForecastingMLPDecoder.get_hyperparameter_search_space(self.dataset_properties, - can_be_auto_regressive=False) - mlp_cfg_non_ar_w_local = mlp_cs.get_default_configuration() - mlp_cfg_non_ar_wo_local = copy.copy(mlp_cfg_non_ar_w_local.get_dictionary()) - - mlp_cfg_non_ar_wo_local['has_local_layer'] = False - mlp_cfg_non_ar_wo_local.pop('units_local_layer') - - mlp_cfg_non_ar_wo_local = Configuration(mlp_cs, values=mlp_cfg_non_ar_wo_local) - - decoder_w_local = ForecastingMLPDecoder(**mlp_cfg_non_ar_w_local) - decoder_wo_local = ForecastingMLPDecoder(**mlp_cfg_non_ar_wo_local) - - decoders = {"non_ar_w_local": decoder_w_local, - "non_ar_wo_local": decoder_wo_local} - - fit_dict = copy.copy(self.fit_dictionary) - fit_dict['dataset_properties'] = self.dataset_properties - fit_dict['network_structure'] = network_structure - - encoder = encoder.fit(fit_dict) - fit_dict = encoder.transform(fit_dict) - - network_embedding = self.fit_dictionary['network_embedding'] - - for name, decoder in decoders.items(): - fit_dict_ = copy.copy(fit_dict) - - decoder = decoder.fit(fit_dict_) - fit_dict_ = decoder.transform(fit_dict_) - - input_tensor = torch.randn([10, 20, 3 + fit_dict_['X_train'].shape[-1]]) - input_tensor_future = torch.randn([10, n_prediction_steps, 2 + fit_dict_['X_train'].shape[-1]]) - - head = ForecastingHead() - head = 
head.fit(fit_dict_) - fit_dict_ = head.transform(fit_dict_) - - encoder = fit_dict_['network_encoder']['block_1'].encoder - decoder = fit_dict_['network_decoder']['block_1'].decoder - - head = fit_dict_['network_head'] - output = head(decoder(input_tensor_future, encoder(network_embedding(input_tensor), output_seq=False))) + encoder2decoder, _ = net_encoder(input_tensor, [None]) + output = head(net_decoder(input_tensor_future, encoder2decoder)) self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) @@ -275,3 +218,4 @@ def test_nbeats_network(self): backcast_block, forecast_block = block([None], input_tensor) self.assertListEqual(list(backcast_block.shape), [10, window_size * 1]) self.assertListEqual(list(forecast_block.shape), [10, n_prediction_steps * 1]) + diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py new file mode 100644 index 000000000..ed6af6cc9 --- /dev/null +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py @@ -0,0 +1,126 @@ +import copy +import unittest +import torch + +from test.test_pipeline.components.setup.forecasting.forecasting_networks.test_base_components import ( + generate_fit_dict_and_dataset_property +) + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder \ + import SeqForecastingEncoderChoice +from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding +from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter +from sklearn.pipeline import Pipeline +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder. \ + RNNEncoder import RNNEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder. \ + TCNEncoder import TCNEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder. \ + TransformerEncoder import TransformerEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder. \ + RNNEncoder import RNNEncoder + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ + RNNDecoder import ForecastingRNNDecoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ + MLPDecoder import ForecastingMLPDecoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. 
\ + TransformerDecoder import ForecastingTransformerDecoder +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdate + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import ( + StackedEncoder, + StackedDecoder +) + + +class TestSeqEncoder(unittest.TestCase): + def setUp(self) -> None: + self.dataset_properties, self.fit_dictionary = generate_fit_dict_and_dataset_property() + self.fit_dictionary['net_output_type'] = 'regression' + self.fit_dictionary['network_embedding'] = _NoEmbedding() + + def test_config_space(self): + seq_encoder_choice = SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties) + cs_seq = seq_encoder_choice.get_hyperparameter_search_space( + dataset_properties=self.dataset_properties, + num_blocks=HyperparameterSearchSpace(hyperparameter="num_blocks", + value_range=(2, 3), + default_value=2), ) + sample = cs_seq.sample_configuration() + + num_blocks = sample['num_blocks'] + seq_encoder_choice.set_hyperparameters(sample) + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = self.dataset_properties + self.assertIsInstance(seq_encoder_choice.pipeline, Pipeline) + encoder_choices = seq_encoder_choice.fit(fit_dict) + fit_dict = encoder_choices.transform(fit_dict) + + self.assertTrue('network_structure' in fit_dict) + network_structure = fit_dict['network_structure'] + self.assertIsInstance(network_structure, NetworkStructure) + self.assertTrue(network_structure.num_blocks, num_blocks) + + self.assertTrue('network_encoder' in fit_dict) + self.assertEqual(len(fit_dict['network_encoder']), num_blocks) + + self.assertTrue('network_decoder' in fit_dict) + self.assertEqual(len(fit_dict['network_decoder']), num_blocks) + + def test_deepar(self): + for i, valid_encoder in enumerate(['RNNEncoder', 'TCNEncoder', 'TransformerEncoder']): + seq_encoder_choice = SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties) + update = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='auto_regressive', + value_range=(True,), + default_value=True, ) + seq_encoder_choice._cs_updates = {"block_1:MLPDecoder:auto_regressive": update} + cs_seq = seq_encoder_choice.get_hyperparameter_search_space(dataset_properties=self.dataset_properties, + include=[valid_encoder]) + sample = cs_seq.get_default_configuration() + seq_encoder_choice.set_hyperparameters(copy.copy(sample)) + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = self.dataset_properties + + encoder_choices = seq_encoder_choice.fit(fit_dict) + fit_dict = encoder_choices.transform(fit_dict) + + head = ForecastingHead() + head = head.fit(fit_dict) + fit_dict = head.transform(fit_dict) + + net_encoder = StackedEncoder(fit_dict['network_structure'], False, + fit_dict['network_encoder'], fit_dict['network_decoder']) + net_decoder = StackedDecoder(fit_dict['network_structure'], net_encoder.encoder, + fit_dict['network_encoder'], + fit_dict['network_decoder']) + + head = fit_dict['network_head'] + if i != 1: + input_tensor = torch.randn([10, 20, 59]) # 53 + 6(lag values) + input_tensor_future = torch.randn([10, 1, 59]) + else: + input_tensor = torch.randn([10, 20, 53]) # no lag + input_tensor_future = torch.randn([10, 1, 53]) + + encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor, + additional_input=[None], + cache_intermediate_state=True, + ) + output = head(net_decoder(x_future=None, encoder_output=encoder2decoder)) + 
self.assertListEqual(list(output.shape), [10, 1]) + + encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor_future, + additional_input=[None], + output_seq=False, cache_intermediate_state=True, + incremental_update=True + ) + output = head(net_decoder(x_future=None, encoder_output=encoder2decoder)) + self.assertListEqual(list(output.shape), [10, 1, 1]) + From 727e48eb27586da6c5439890ac72b290ad529ed2 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 18 May 2022 19:03:19 +0200 Subject: [PATCH 268/347] test for seqencoder --- autoPyTorch/api/time_series_forecasting.py | 14 +- autoPyTorch/datasets/time_series_dataset.py | 7 +- .../forecasting_backbone/cells.py | 10 +- .../forecasting_decoder/RNNDecoder.py | 2 +- .../forecasting_decoder/TransformerDecoder.py | 5 +- .../forecasting_encoder/__init__.py | 2 +- .../base_forecasting_encoder.py | 3 +- .../seq_encoder/InceptionTimeEncoder.py | 8 +- .../seq_encoder/__init__.py | 88 ++++++-- .../other_components/TemporalFusion.py | 22 +- .../test_base_components.py | 1 + .../forecasting_networks/test_seq_encoder.py | 196 +++++++++++++++++- 12 files changed, 302 insertions(+), 56 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 915157838..5461f730a 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -452,11 +452,11 @@ def search( def predict( self, - X_test: Optional[List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]], pd.DataFrame] = None, + X_test: Optional[List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]]] = None, batch_size: Optional[int] = None, n_jobs: int = 1, past_targets: Optional[List[np.ndarray]] = None, - future_targets: Optional[List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]], pd.DataFrame] = None, + future_targets: Optional[List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]]] = None, start_times: List[pd.DatetimeIndex] = [] ) -> np.ndarray: """ @@ -466,11 +466,11 @@ def predict( if not isinstance(X_test[0], TimeSeriesSequence): # Validate and construct TimeSeriesSequence X_test, _, _ = self.dataset.transform_data_into_time_series_sequence(X=X_test, - Y=past_targets, - X_test=future_targets, - start_times=start_times, - is_test_set=True - ) + Y=past_targets, + X_test=future_targets, + start_times=start_times, + is_test_set=True + ) flattened_res = super(TimeSeriesForecastingTask, self).predict(X_test, batch_size, n_jobs) if self.dataset.num_target == 1: forecasting = flattened_res.reshape([-1, self.dataset.n_prediction_steps]) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 3bbf26612..0d4fc0567 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -1193,8 +1193,9 @@ def generate_test_seqs(self) -> List[TimeSeriesSequence]: test_sets = copy.deepcopy(self.datasets) for test_seq in test_sets: test_seq.is_test_set = True - if len(self.known_future_features) > 0 and test_seq.X_test is None: - raise ValueError("If future features are required, X_test must be given!") - test_seq.X = np.concatenate([test_seq.X, test_seq.X_test]) + if len(self.known_future_features) > 0: + if test_seq.X_test is None: + raise ValueError("If future features are required, X_test must be given!") + test_seq.X = np.concatenate([test_seq.X, test_seq.X_test]) return test_sets diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index b2d017aef..8ed61b238 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -88,6 +88,7 @@ def __init__(self, elif network_structure.skip_connection_type == 'gate_add_norm': self.residual_connection = GateAddNorm(d_model, skip_size=n_encoder_output, dropout=None, trainable_add=False) + self._device = 'cpu' def forward(self, encoder_output: torch.Tensor, @@ -103,12 +104,13 @@ def forward(self, decoder_length: length of decoder network static_embedding: output of static variable selection network (if applible) """ + if self.decoder_proj_layer is not None: decoder_output = self.decoder_proj_layer(decoder_output) network_output = torch.cat([encoder_output, decoder_output], dim=1) - if self.enrich_with_static: + if self.enrich_with_static and static_embedding is not None: static_context_enrichment = self.static_context_enrichment(static_embedding) attn_input = self.enrichment( network_output, static_context_enrichment[:, None].expand(-1, network_output.shape[1], -1) @@ -546,12 +548,10 @@ def forward(self, elif self.encoder_output_type[i] == EncoderOutputForm.Sequence: encoder2decoder.append(fx) elif self.encoder_output_type[i] == EncoderOutputForm.SequenceLast: - if output_seq or incremental_update: - encoder2decoder.append(fx) - elif output_seq_i: + if output_seq_i: encoder2decoder.append(encoder_i.get_last_seq_value(fx).squeeze(1)) else: - encoder2decoder.append(fx.squeeze(1)) + encoder2decoder.append(fx) else: raise NotImplementedError diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index 4106a3c7a..b35a0114c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -65,7 +65,7 @@ def __init__(self, **kwargs: Dict): super().__init__(**kwargs) # RNN is naturally auto-regressive. 
However, we will not consider it as a decoder for deep AR model self.rnn_kwargs = None - self.lagged_value = [0, 1, 2, 3, 4, 5, 6, 7] + self.lagged_value = [1, 2, 3, 4, 5, 6, 7] @property def _required_fit_requirements(self) -> List[FitRequirement]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index 5a3d633e0..ce10f051d 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -69,8 +69,7 @@ def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor, pos_idx: if self.use_positional_decoder: output = self.pos_encoding(output, pos_idx) if self.training: - output = self.transformer_decoder_layers(output, encoder_output, - tgt_mask=self.tgt_mask.to(encoder_output.device)) + output = self.transformer_decoder_layers(output, encoder_output, tgt_mask=self.tgt_mask.to(encoder_output.device)) else: output = self.transformer_decoder_layers(output, encoder_output) return output @@ -81,7 +80,7 @@ def __init__(self, **kwargs: Dict): super().__init__(**kwargs) # RNN is naturally auto-regressive. However, we will not consider it as a decoder for deep AR model self.transformer_encoder_kwargs = None - self.lagged_value = [0, 1, 2, 3, 4, 5, 6, 7] + self.lagged_value = [1, 2, 3, 4, 5, 6, 7] def _build_decoder(self, encoder_output_shape: Tuple[int, ...], diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index 3dc7925cd..af98e5029 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -361,7 +361,7 @@ def set_hyperparameters(self, @property def _defaults_network(self): - return ['MLPEncoder'] + return ['MLPEncoder', 'RNNEncoder', 'NBEATSEncoder'] def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchComponent: """Handy method to check if a component is fitted diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index bfd7aedb6..00175b353 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -108,7 +108,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: has_hidden_states = self.encoder_properties().has_hidden_states self.encoder_output_shape = get_output_shape(self.encoder, input_shape, has_hidden_states) if self.n_encoder_output_feature() != self.encoder_output_shape[-1]: - raise ValueError('n_encoder_output_feature must equal to the output dimension') + raise ValueError(f'n_encoder_output_feature ({ self.n_encoder_output_feature()}) ' + f'must equal to the output dimension 
f({self.encoder_output_shape})') return self @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py index fcfcdbb32..aaa41feb0 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py @@ -131,7 +131,7 @@ def forward(self, x: torch.Tensor, output_seq=False) -> torch.Tensor: return self.get_last_seq_value(x) def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: - return x[:, -1, :] + return x[:, -1:, :] class InceptionTimeEncoder(BaseForecastingEncoder): @@ -148,7 +148,8 @@ def build_encoder(self, input_shape: Tuple[int, ...] = (0,)) -> nn.Module: return encoder def n_encoder_output_feature(self) -> int: - return self.config['num_filters'] + # see _InceptionBlock.forward() + return self.config['num_filters'] * 4 @staticmethod def allowed_decoders(): @@ -164,8 +165,7 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT 'name': 'InceptionTimeEncoder', 'handles_tabular': False, 'handles_image': False, - # TODO consider InceptionTime for forecasting - 'handles_time_series': False, + 'handles_time_series': True, } def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index f53533248..401016d00 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -101,7 +101,7 @@ def get_hyperparameter_search_space( decoder_auto_regressive: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="decoder_auto_regressive", value_range=(True, False), - default_value=True, + default_value=False, ), skip_connection: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="skip_connection", value_range=(True, False), @@ -215,16 +215,11 @@ def get_hyperparameter_search_space( cond_vs_dropout = EqualsCondition(variable_selection_use_dropout, variable_selection, True) cond_vs_dropoutrate = EqualsCondition(variable_selection_dropout_rate, variable_selection_use_dropout, True) cs.add_conditions([cond_vs_dropout, cond_vs_dropoutrate]) - + + add_forbidden_for_non_ar_recurrent_decoder = False if static_features_shape + future_feature_shapes[-1] == 0: - if False in variable_selection.choices and False in decoder_auto_regressive.choices: - if variable_selection.num_choices == 1 and decoder_auto_regressive.num_choices == 1: - raise ValueError("When no future information is available, it is not possible to disable variable" - "selection and enable auto-regressive decoder model") - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(variable_selection, False), - ForbiddenEqualsClause(decoder_auto_regressive, False) - )) + add_forbidden_for_non_ar_recurrent_decoder = True + if True in variable_selection.choices: 
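+            # share_single_variable_networks is only meaningful when variable selection is enabled;
+            # the EqualsCondition added below ties it to variable_selection == True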
cs.add_hyperparameter(share_single_variable_networks) cs.add_condition(EqualsCondition(share_single_variable_networks, variable_selection, True)) @@ -250,15 +245,16 @@ def get_hyperparameter_search_space( if default_ in available_encoders: default = default_ break + updates_choice = self._get_search_space_updates() forbiddens_decoder_auto_regressive = [] - if False in decoder_auto_regressive.choices: + if True in decoder_auto_regressive.choices: forbidden_decoder_ar = ForbiddenEqualsClause(decoder_auto_regressive, True) else: forbidden_decoder_ar = None - + for i in range(1, int(max_num_blocks) + 1): block_prefix = f'block_{i}:' @@ -342,6 +338,8 @@ def get_hyperparameter_search_space( encoders_with_multi_decoder.append(encoder) else: encoder_with_single_decoder.append(encoder) + encoders_with_multi_decoder = set(encoders_with_multi_decoder) + encoder_with_single_decoder = set(encoder_with_single_decoder) cs.add_configuration_space( block_prefix + decoder_name, @@ -377,6 +375,50 @@ def get_hyperparameter_search_space( cs.add_conditions(conditions_to_add) + if forbidden_decoder_ar is not None: + forbiddens_ar_non_recurrent = [] + for encoder in hp_encoder.choices: + if encoder in encoder_with_single_decoder: + if not available_decoders[encoder2decoder[encoder][0]].decoder_properties().recurrent: + forbiddens_ar_non_recurrent.append(ForbiddenAndConjunction( + forbidden_decoder_ar, + ForbiddenEqualsClause(hp_encoder, encoder) + )) + else: + if add_forbidden_for_non_ar_recurrent_decoder: + forbiddens_decoder_auto_regressive.append( + ForbiddenAndConjunction( + ForbiddenAndConjunction( + ForbiddenEqualsClause(variable_selection, False), + ForbiddenEqualsClause(decoder_auto_regressive, False) + ), + ForbiddenEqualsClause(hp_encoder, encoder) + ) + ) + + elif encoder in encoders_with_multi_decoder: + hp_decoder_type = cs.get_hyperparameter(f'{block_prefix + encoder}:decoder_type') + for decoder in hp_decoder_type.choices: + if not available_decoders[decoder].decoder_properties().recurrent: + forbiddens_ar_non_recurrent.append(ForbiddenAndConjunction( + forbidden_decoder_ar, + ForbiddenEqualsClause(hp_decoder_type, decoder) + )) + else: + if add_forbidden_for_non_ar_recurrent_decoder: + forbiddens_decoder_auto_regressive.append( + ForbiddenAndConjunction( + ForbiddenAndConjunction( + ForbiddenEqualsClause(variable_selection, False), + ForbiddenEqualsClause(decoder_auto_regressive, False) + ), + ForbiddenEqualsClause(hp_decoder_type, decoder) + ) + ) + if forbiddens_ar_non_recurrent: + cs.add_forbidden_clauses(forbiddens_ar_non_recurrent) + + use_temporal_fusion = get_hyperparameter(use_temporal_fusion, CategoricalHyperparameter) cs.add_hyperparameter(use_temporal_fusion) if True in use_temporal_fusion.choices: @@ -452,10 +494,30 @@ def get_hyperparameter_search_space( ForbiddenEqualsClause(hp_mlp_has_local_layer, False), ForbiddenInClause(num_blocks, list(range(i + 1, max_num_blocks + 1))), )) - cs.add_forbidden_clauses(forbidden_mlp_local_layer) + c1 = isinstance(skip_connection, CategoricalHyperparameter) and True in skip_connection.choices + c2 = isinstance(skip_connection, Constant) and skip_connection.value + if c1 or c2: + if True in skip_connection.choices: + forbidden_mlp_local_layer.append(ForbiddenAndConjunction( + ForbiddenEqualsClause(hp_mlp_has_local_layer, False), + ForbiddenEqualsClause(skip_connection, True), + )) + c1 = isinstance(use_temporal_fusion, CategoricalHyperparameter) and True in use_temporal_fusion.choices + c2 = isinstance(use_temporal_fusion, Constant) and 
skip_connection.value + if c1 or c2: + if True in use_temporal_fusion.choices: + forbidden_mlp_local_layer.append(ForbiddenAndConjunction( + ForbiddenEqualsClause(hp_mlp_has_local_layer, False), + ForbiddenEqualsClause(use_temporal_fusion, True), + )) + cs.add_forbidden_clauses(forbidden_mlp_local_layer) return cs + @property + def _defaults_network(self): + return ['RNNEncoder', 'NBEATSEncoder'] + def set_hyperparameters(self, configuration: Configuration, init_params: Optional[Dict[str, Any]] = None diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py index 9244f9b01..9eaebc6b8 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py @@ -29,7 +29,7 @@ def __init__(self, attention_n_head_log: int = 2, attention_d_model_log: int = 4, use_dropout: bool = False, - dropout_rate: Optional[float] = None,): + dropout_rate: Optional[float] = None, ): autoPyTorchComponent.__init__(self) self.add_fit_requirements( self._required_fit_requirements @@ -55,13 +55,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "autoPyTorchComponent": network_structure = X['network_structure'] # type: NetworkStructure self.temporal_fusion = TemporalFusionLayer(window_size=X['window_size'], - network_structure=network_structure, - network_encoder=X['network_encoder'], - n_decoder_output_features=X['n_decoder_output_features'], - d_model=2 ** self.attention_d_model_log, - n_head=2 ** self.attention_n_head_log, - dropout=self.dropout_rate - ) + network_structure=network_structure, + network_encoder=X['network_encoder'], + n_decoder_output_features=X['n_decoder_output_features'], + d_model=2 ** self.attention_d_model_log, + n_head=2 ** self.attention_n_head_log, + dropout=self.dropout_rate + ) self.n_decoder_output_features = 2 ** self.attention_d_model_log def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @@ -108,10 +108,6 @@ def get_hyperparameter_search_space( Args: dataset_properties (Optional[Dict[str, Union[str, int]]): Describes the dataset to work on - use_temporal_fusion (HyperparameterSearchSpace): - if attention fusion layer is applied (Lim et al. 
- Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting, - https://arxiv.org/abs/1912.09363) attention_n_head_log (HyperparameterSearchSpace): log value of number of heads for interpretable attention_d_model_log (HyperparameterSearchSpace): @@ -133,4 +129,4 @@ def get_hyperparameter_search_space( cs.add_hyperparameters([use_dropout, dropout_rate]) cond_dropout = EqualsCondition(dropout_rate, use_dropout, True) cs.add_condition(cond_dropout) - return cs \ No newline at end of file + return cs diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py index c9f20e86e..8aadabc14 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py @@ -78,6 +78,7 @@ def generate_fit_dict_and_dataset_property(): is_small_preprocess=True, task_type=TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING], uni_variant=False, + future_feature_shapes=(n_prediction_steps, 50), ) fit_dictionary = dict(X_train=pd.DataFrame(np.random.randn(*input_shape)), diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py index ed6af6cc9..075b479c4 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py @@ -1,6 +1,7 @@ import copy import unittest import torch +from itertools import product from test.test_pipeline.components.setup.forecasting.forecasting_networks.test_base_components import ( generate_fit_dict_and_dataset_property @@ -33,7 +34,8 @@ from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import ( StackedEncoder, - StackedDecoder + StackedDecoder, + TemporalFusionLayer, ) @@ -75,15 +77,28 @@ def test_config_space(self): def test_deepar(self): for i, valid_encoder in enumerate(['RNNEncoder', 'TCNEncoder', 'TransformerEncoder']): seq_encoder_choice = SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties) - update = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + update_ar = HyperparameterSearchSpaceUpdate(node_name="network_backbone", hyperparameter='auto_regressive', value_range=(True,), default_value=True, ) - seq_encoder_choice._cs_updates = {"block_1:MLPDecoder:auto_regressive": update} + update_rnn_mlp = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='decoder_type', + value_range=('MLPDecoder',), + default_value='MLPDecoder', ) + update_transformer_mlp = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='decoder_type', + value_range=('MLPDecoder',), + default_value='MLPDecoder', ) + seq_encoder_choice._cs_updates = {"block_1:RNNEncoder:decoder_type": update_rnn_mlp, + "block_1:TransformerEncoder:decoder_type": update_transformer_mlp, + "block_1:MLPDecoder:auto_regressive": update_ar} + + cs_seq = seq_encoder_choice.get_hyperparameter_search_space(dataset_properties=self.dataset_properties, include=[valid_encoder]) sample = cs_seq.get_default_configuration() - seq_encoder_choice.set_hyperparameters(copy.copy(sample)) + + seq_encoder_choice.set_hyperparameters(sample) fit_dict = copy.copy(self.fit_dictionary) 
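+            # assemble the fit dictionary consumed by the encoder choice and the forecasting head below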
fit_dict['dataset_properties'] = self.dataset_properties @@ -114,7 +129,7 @@ def test_deepar(self): cache_intermediate_state=True, ) output = head(net_decoder(x_future=None, encoder_output=encoder2decoder)) - self.assertListEqual(list(output.shape), [10, 1]) + self.assertListEqual(list(output.shape), [10, 1, 1]) encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor_future, additional_input=[None], @@ -124,3 +139,174 @@ def test_deepar(self): output = head(net_decoder(x_future=None, encoder_output=encoder2decoder)) self.assertListEqual(list(output.shape), [10, 1, 1]) + def test_seq_models(self): + update = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='auto_regressive', + value_range=(False,), + default_value=False, ) + # TO avoid that default setting raises conflict for forbidden clauses + update_rnn_default = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='decoder_type', + value_range=('MLPDecoder', 'RNNDecoder'), + default_value='RNNDecoder', ) + num_blocks = HyperparameterSearchSpace(hyperparameter="num_blocks", + value_range=(2, 2), + default_value=2) + window_size: int = self.fit_dictionary['window_size'] + n_prediction_steps = self.dataset_properties['n_prediction_steps'] + n_features = self.dataset_properties['input_shape'][-1] + n_targets = self.dataset_properties['output_shape'][-1] + n_time_features = len(self.dataset_properties['time_feature_transform']) + all_settings = [(True, False), (True, False), (True, False), (True, False), ('gate_add_norm', 'add')] + for hp_values in product(*all_settings): + hp_variable_selection = hp_values[0] + hp_use_temporal_fusion = hp_values[1] + hp_decoder_auto_regressive = hp_values[2] + hp_skip_connection = hp_values[3] + hp_skip_connection_type = hp_values[4] + with self.subTest(hp_variable_selection=hp_values[0], + hp_use_temporal_fusion=hp_values[1], + hp_decoder_auto_regressive=hp_values[2], + hp_skip_connection=hp_values[3], + hp_skip_connection_type=hp_values[4]): + variable_selection = HyperparameterSearchSpace('variable_selection', + (hp_variable_selection,), hp_variable_selection) + use_temporal_fusion = HyperparameterSearchSpace('use_temporal_fusion', + (hp_use_temporal_fusion,), hp_use_temporal_fusion) + decoder_auto_regressive = HyperparameterSearchSpace('decoder_auto_regressive', + (hp_decoder_auto_regressive,), + hp_decoder_auto_regressive) + skip_connection = HyperparameterSearchSpace('skip_connection', + (hp_skip_connection,), + hp_skip_connection) + skip_connection_type = HyperparameterSearchSpace('skip_connection_type', + (hp_skip_connection_type,), + hp_skip_connection_type) + + seq_encoder_choice = SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties) + seq_encoder_choice._cs_updates = {"block_1:MLPDecoder:auto_regressive": update, + "block_1:RNNEncoder:decoder_type": update_rnn_default, + "block_2:RNNEncoder:decoder_type": update_rnn_default, + } + cs_seq_encoder = seq_encoder_choice.get_hyperparameter_search_space( + dataset_properties=self.dataset_properties, + num_blocks=num_blocks, + variable_selection=variable_selection, + use_temporal_fusion=use_temporal_fusion, + decoder_auto_regressive=decoder_auto_regressive, + skip_connection=skip_connection, + skip_connection_type=skip_connection_type + ) + sample = cs_seq_encoder.sample_configuration() + seq_encoder_choice.set_hyperparameters(sample) + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = self.dataset_properties + + encoder_choices = 
seq_encoder_choice.fit(fit_dict) + fit_dict = encoder_choices.transform(fit_dict) + + head = ForecastingHead() + head = head.fit(fit_dict) + fit_dict = head.transform(fit_dict) + + network_structure = fit_dict['network_structure'] + net_encoder = StackedEncoder(fit_dict['network_structure'], + network_structure.use_temporal_fusion, + fit_dict['network_encoder'], fit_dict['network_decoder']) + net_decoder = StackedDecoder(fit_dict['network_structure'], net_encoder.encoder, + fit_dict['network_encoder'], + fit_dict['network_decoder']) + if hp_use_temporal_fusion: + temporal_fusion: TemporalFusionLayer = fit_dict['temporal_fusion'] + + head = fit_dict['network_head'] + + if hp_variable_selection: + n_feature_encoder = fit_dict['network_encoder']['block_1'].encoder_output_shape[-1] + if decoder_auto_regressive: + n_feature_decoder = n_feature_encoder + else: + n_feature_decoder = n_feature_encoder - 1 + else: + if hasattr(net_encoder.encoder['block_1'], 'lagged_value'): + n_feature_encoder = n_features + n_time_features + n_feature_encoder += n_targets * len(net_encoder.encoder['block_1'].lagged_value) + else: + n_feature_encoder = n_features + n_time_features + n_targets + if hp_decoder_auto_regressive: + if hasattr(net_decoder.decoder['block_1'], 'lagged_value'): + n_feature_decoder = n_features + n_time_features + n_feature_decoder += n_targets * len( + net_decoder.decoder['block_1'].lagged_value) + else: + n_feature_decoder = n_features + n_time_features + n_targets + else: + n_feature_decoder = n_features + n_time_features + + input_tensor = torch.ones([10, window_size, n_feature_encoder]) + input_tensor_future = torch.randn([10, n_prediction_steps, n_feature_decoder]) + input_tensor_future_ar = torch.randn([10, 1, n_feature_decoder]) + past_observed_values = torch.ones([10, window_size, 1]).bool() + + encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor, + additional_input=[None] * 2, + ) + + decoder_output = net_decoder(x_future=input_tensor_future, + encoder_output=encoder2decoder, + pos_idx=(window_size, window_size + n_prediction_steps)) + + if hp_use_temporal_fusion: + decoder_output = temporal_fusion(encoder_output=encoder_output, + decoder_output=decoder_output, + past_observed_values=past_observed_values, + decoder_length=n_prediction_steps, + ) + + + output = head(decoder_output) + self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) + + if hp_decoder_auto_regressive: + net_encoder.eval() + net_decoder.eval() + head.eval() + + encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor, + additional_input=[None] * 2, + cache_intermediate_state=False, + ) + + decoder_output = net_decoder(x_future=input_tensor_future_ar, + encoder_output=encoder2decoder, + pos_idx=(window_size, window_size + 1), + cache_intermediate_state=True, + ) + if hp_use_temporal_fusion: + temporal_fusion.eval() + decoder_output = temporal_fusion(encoder_output=encoder_output, + decoder_output=decoder_output, + past_observed_values=past_observed_values, + decoder_length=1, + ) + output = head(decoder_output) + self.assertListEqual(list(output.shape), [10, 1, 1]) + + decoder_output = net_decoder.forward(x_future=input_tensor_future_ar, + encoder_output=encoder2decoder, + pos_idx=(window_size, window_size + 1), + cache_intermediate_state=True, + incremental_update=True, + ) + if hp_use_temporal_fusion: + decoder_output = temporal_fusion(encoder_output=encoder_output, + decoder_output=decoder_output, + past_observed_values=past_observed_values, + 
decoder_length=1, + ) + output = head(decoder_output) + self.assertListEqual(list(output.shape), [10, 1, 1]) + + + From 23dde6766b88b122845a2d21b0a7816ba83b4aab Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 18 May 2022 20:04:53 +0200 Subject: [PATCH 269/347] maint --- .../TimeSeriesTransformer.py | 8 ++-- .../encoding/NoEncoder.py | 3 +- .../imputation/TimeSeriesImputer.py | 10 ++--- .../base_target_scaler.py | 3 +- .../setup/forecasting_target_scaling/utils.py | 3 +- .../DistributionLoss.py | 8 ++-- .../forecasting_training_loss/QuantileLoss.py | 2 +- .../setup/network/forecasting_architecture.py | 27 ++++++------- .../setup/network/forecasting_network.py | 14 +++---- .../forecasting_backbone/__init__.py | 12 ++---- .../forecasting_backbone/cells.py | 17 ++++---- .../forecasting_backbone/components_util.py | 5 ++- .../forecasting_decoder/__init__.py | 39 ++----------------- .../forecasting_encoder/__init__.py | 2 +- .../flat_encoder/__init__.py | 3 +- .../test_base_components.py | 22 +++++++++++ .../forecasting_networks/test_seq_encoder.py | 17 +------- 17 files changed, 80 insertions(+), 115 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py index b58889dee..37b05d209 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py @@ -4,7 +4,7 @@ import pandas as pd from sklearn.base import BaseEstimator -from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.pipeline import make_pipeline from sklearn.compose import ColumnTransformer from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( @@ -27,7 +27,7 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N FitRequirement('numerical_features', (List,), user_defined=True, dataset_property=True), FitRequirement('categorical_features', (List,), user_defined=True, dataset_property=True)]) - def fit(self, X: Dict[str, Any], y: Any = None) -> "TimeSeriesTransformer": + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: """ Creates a column transformer for the chosen tabular preprocessors @@ -108,7 +108,7 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N self.random_state = random_state self.preprocessor: Optional[ColumnTransformer] = None - def fit(self, X: Dict[str, Any], y: Any = None) -> "TimeSeriesTransformer": + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: """ Creates a column transformer for the chosen tabular preprocessors @@ -177,4 +177,4 @@ def get_target_transformer(self) -> ColumnTransformer: if self.preprocessor is None: raise AttributeError("{} can't return column transformer before transform is called" .format(self.__class__.__name__)) - return self.preprocessor \ No newline at end of file + return self.preprocessor diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py index cb48b4134..6eb270a97 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py +++ 
b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, Optional, Union import numpy as np @@ -41,4 +41,3 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: (Dict[str, Any]): the updated 'X' dictionary """ return NoEncoder.transform(self, X) - diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py index 16837f640..822aa8ff3 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py @@ -8,7 +8,6 @@ from ConfigSpace.hyperparameters import CategoricalHyperparameter from autoPyTorch.utils.common import FitRequirement - from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( autoPyTorchTimeSeriesPreprocessingComponent, autoPyTorchTimeSeriesTargetPreprocessingComponent @@ -95,9 +94,10 @@ def get_hyperparameter_search_space( " a search space.") cs = ConfigurationSpace() - if (dataset_properties.get('features_have_missing_values', True) - and isinstance(dataset_properties['numerical_columns'], List) - and len(dataset_properties['numerical_columns']) != 0 + if ( + dataset_properties.get('features_have_missing_values', True) + and isinstance(dataset_properties['numerical_columns'], List) + and len(dataset_properties['numerical_columns']) != 0 ): add_hyperparameter(cs, imputation_strategy, CategoricalHyperparameter) return cs @@ -178,4 +178,4 @@ def get_hyperparameter_search_space( cs = ConfigurationSpace() if dataset_properties.get('targets_have_missing_values', True): add_hyperparameter(cs, imputation_strategy, CategoricalHyperparameter) - return cs \ No newline at end of file + return cs diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py index 27849f994..3ebfac000 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py @@ -2,6 +2,7 @@ import numpy as np +from sklearn.base import BaseEstimator from sklearn.pipeline import Pipeline import torch @@ -19,7 +20,7 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N self.random_state = random_state self.preprocessor: Optional[Pipeline] = None - def fit(self, X: Dict[str, Any], y: Any = None) -> "BaseBatchScaler": + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: """ Creates a column transformer for the chosen tabular preprocessors diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py index bd59a9127..c8e499979 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py @@ -1,7 +1,6 @@ -from typing import Any, Dict, Callable, Optional, Union, Tuple +from typing import Any, Dict, Optional, Tuple import torch -import sklearn from sklearn.base import BaseEstimator diff --git 
a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py index cbbd429f9..b1719d45c 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py @@ -58,10 +58,10 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - dist_cls: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dist_cls", - value_range=tuple(ALL_DISTRIBUTIONS.keys()), - default_value= - list(ALL_DISTRIBUTIONS.keys())[0]), + dist_cls: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="dist_cls", + value_range=tuple(ALL_DISTRIBUTIONS.keys()), + default_value=list(ALL_DISTRIBUTIONS.keys())[0]), forecast_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='forecast_strategy', value_range=('sample', 'mean'), default_value='sample'), diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py index 04cadaea3..b71d2b65a 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py @@ -9,7 +9,7 @@ from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import \ ForecastingLossComponents from autoPyTorch.pipeline.components.training.losses import QuantileLoss -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter, FitRequirement +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter class NetworkQuantileLoss(ForecastingLossComponents): diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index e9e862b61..53a95cd27 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -15,7 +15,6 @@ from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( - EncoderNetwork, EncoderBlockInfo, ) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import ( @@ -178,9 +177,11 @@ def __init__(self, This structure is active when the decoder is a MLP with auto_regressive set as false Args: + network_structure (NetworkStructure): network structure information network_embedding (nn.Module): network embedding - network_encoder (EncoderNetwork): Encoder network, could be selected to return a sequence or a - network_decoder (nn.Module): network decoder + network_encoder (Dict[str, EncoderBlockInfo]): Encoder network, could be selected to return a sequence or a + network_decoder (Dict[str, DecoderBlockInfo]): network decoder + temporal_fusion Optional[TemporalFusionLayer]: Temporal Fusion Layer network_head (nn.Module): network head, maps the output of decoder to 
the final output dataset_properties (Dict): dataset properties auto_regressive (bool): if the overall model is auto-regressive model @@ -528,18 +529,18 @@ def predict(self, class ForecastingSeq2SeqNet(ForecastingNet): future_target_required = True - """ - Forecasting network with Seq2Seq structure, Encoder/ Decoder need to be the same recurrent models while - - This structure is activate when the decoder is recurrent (RNN or transformer). - We train the network with teacher forcing, thus - future_targets is required for the network. To train the network, past targets and past features are fed to the - encoder to obtain the hidden states whereas future targets and future features. - When the output type is distribution and forecast_strategy is sampling, this model is equivalent to a deepAR model - during inference. - """ def __init__(self, **kwargs): + """ + Forecasting network with Seq2Seq structure; the Encoder and Decoder need to be the same type of recurrent model. + + This structure is active when the decoder is recurrent (RNN or transformer). + We train the network with teacher forcing, thus + future_targets is required for the network. To train the network, past targets and past features are fed to the + encoder to obtain the hidden states, whereas future targets and future features are fed to the decoder. + When the output type is distribution and forecast_strategy is sampling, + this model is equivalent to a deepAR model during inference. + """ super(ForecastingSeq2SeqNet, self).__init__(**kwargs) def decoder_select_variable(self, future_targets: torch.tensor, future_features: Optional[torch.Tensor]): diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 6b984615c..f551db9db 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -10,12 +10,7 @@ from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( - EncoderBlockInfo, -) -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( - DecoderBlockInfo, -) + from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent @@ -26,6 +21,9 @@ ForecastingDeepARNet, NBEATSNet, ) +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( + DisForecastingStrategy +) class ForecastingNetworkComponent(NetworkComponent): @@ -43,9 +41,9 @@ def _required_fit_requirements(self): FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), FitRequirement('network_structure', (Dict,), user_defined=False, dataset_property=False), FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), - FitRequirement("network_encoder", (Dict[str, EncoderBlockInfo]), user_defined=False, + FitRequirement("network_encoder", (Dict,), user_defined=False, dataset_property=False), - FitRequirement("network_decoder", (Dict[str, DecoderBlockInfo]), user_defined=False, + FitRequirement("network_decoder", (Dict,), 
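# NOTE (editorial aside, not part of the original patch): a minimal sketch of the teacher-forcing input
# construction described in the ForecastingSeq2SeqNet docstring above. During training the decoder
# receives the ground-truth targets shifted by one step, mirroring the
# torch.cat([past_targets[:, [-1], :], future_targets[:, :-1, :]], dim=1) call used later in
# ForecastingSeq2SeqNet.forward. Shapes below are assumed to be [batch, time, n_targets]:
#     import torch
#     past_targets = torch.randn(4, 10, 1)       # observed history
#     future_targets = torch.randn(4, 5, 1)      # known targets for the 5 prediction steps
#     decoder_targets = torch.cat([past_targets[:, [-1], :], future_targets[:, :-1, :]], dim=1)
#     assert decoder_targets.shape == (4, 5, 1)  # last observed value followed by the first 4 future values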
user_defined=False, dataset_property=False), FitRequirement("network_head", (Optional[torch.nn.Module],), user_defined=False, dataset_property=False), FitRequirement("auto_regressive", (bool,), user_defined=False, dataset_property=False), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index e368f825e..3eec8726b 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -1,6 +1,6 @@ from collections import OrderedDict import numpy as np -from typing import Dict, Optional, List, Any, Union +from typing import Dict, Optional, List, Any import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace, Configuration @@ -11,16 +11,10 @@ autoPyTorchComponent, ) from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import ( - BaseForecastingEncoder, -) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder \ import FlatForecastingEncoderChoice from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder import\ SeqForecastingEncoderChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import ( - decoders, decoder_addons, add_decoder -) from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdate @@ -72,7 +66,7 @@ def get_available_components( to honor when creating the configuration space exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations to remove from the configuration space - dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): Caracteristics of the dataset to guide the pipeline choices of components Returns: @@ -264,7 +258,7 @@ def set_hyperparameters(self, choice_component = self.get_components()[choice] self.new_params = new_params - sub_configuration_space = choice_component.get_hyperparameter_search_space( # type: ignore[call-arg] + sub_configuration_space = choice_component.get_hyperparameter_search_space( self.dataset_properties, ) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 8ed61b238..5188494f2 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, List, Tuple, Set, Union +from typing import Any, Dict, Optional, List, Tuple, Union import torch from torch import nn @@ -256,7 +256,6 @@ def __init__(self, future_feature_name2tensor_idx[future_name] = [idx_tracker_future, idx_tracker_future + feature_shape] idx_tracker_future += feature_shape - if time_feature_names: for name in time_feature_names: feature_names2tensor_idx[name] = [idx_tracker, idx_tracker+1] @@ -413,7 +412,6 @@ def forward(self, static_embedding = torch.zeros( (batch_size, self.hidden_size), dtype=model_dtype, device=self.device ) - 
static_variable_selection = torch.zeros((batch_size, 0), dtype=model_dtype, device=self.device) static_context_variable_selection = self.static_context_variable_selection(static_embedding)[:, None] static_context_initial_hidden = tuple(init_hidden(static_embedding) for init_hidden in @@ -506,15 +504,15 @@ def forward(self, additional_input (List[Optional[torch.Tensor]]) additional input to the encoder, e.g., inital hidden states output_seq (bool) if a sequence output is generated cache_intermediate_state (bool): if store the intermediate values - incremental_update (bool): if an incremental update is applied, this is normally applied for auto-regressive - model, however, ony deepAR requires encoder to do incremental update, thus the decoder only need to - receive the last output of the encoder + incremental_update (bool): if an incremental update is applied, this is normally applied for + auto-regressive models; however, only deepAR requires the encoder to do an incremental update, + whose decoder only needs to receive the last output of the encoder """ encoder2decoder = [] x = encoder_input for i, block_id in enumerate(range(1, self.num_blocks + 1)): output_seq_i = (output_seq or self.has_temporal_fusion or block_id < self.num_blocks) - encoder_i = self.encoder[f'block_{block_id}'] # type: EncoderNetwork + encoder_i = self.encoder[f'block_{block_id}'] if self.encoder_has_hidden_states[i]: if incremental_update: hx = self.cached_intermediate_state[i] @@ -615,7 +613,8 @@ def __init__(self, decoder[f'skip_connection_{i}'] = GateAddNorm(input_size_decoder, hidden_size=input_size_decoder, skip_size=skip_size_decoder, - dropout=network_structure.grn_dropout_rate) + dropout=network_structure.grn_dropout_rate + ) self.cached_intermediate_state = [torch.empty(0) for _ in range(self.num_blocks + 1 - self.first_block)] self.decoder = decoder @@ -628,7 +627,7 @@ def forward(self, ) -> torch.Tensor: x = x_future for i, block_id in enumerate(range(self.first_block, self.num_blocks + 1)): - decoder_i = self.decoder[f'block_{block_id}'] # type: DecoderNetwork + decoder_i = self.decoder[f'block_{block_id}'] if self.decoder_has_hidden_states[i]: if incremental_update: hx = self.cached_intermediate_state[i] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py index fb88119c1..d4482910c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py @@ -13,7 +13,7 @@ class NetworkStructure(NamedTuple): num_blocks: int = 1 variable_selection: bool = False share_single_variable_networks: bool = False - use_temporal_fusion: bool = False, + use_temporal_fusion: bool = False skip_connection: bool = False skip_connection_type: str = "add" # could be 'add' or 'gate_add_norm' grn_dropout_rate: float = 0.0 @@ -126,10 +126,11 @@ def __init__(self, d_model, dropout=0.1, max_len=5000): pe = pe.unsqueeze(0) self.register_buffer('pe', pe) - def forward(self, x, pos_idx:Optional[Tuple[int]] = None): + def forward(self, x, pos_idx: Optional[Tuple[int]] = None): r"""Inputs of forward function Args: x: the sequence fed to the positional encoder model (required). 
+ pos_idx (Tuple[int]), position idx indicating the start (first) and end (last) time index of x in a sequence Shape: x: [batch size, sequence length embed dim] pos_idx: positional index, indicating the index of the current diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py index ba780efa0..9d8ca3d5c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py @@ -1,20 +1,8 @@ import os -from collections import OrderedDict -from typing import Dict, List, Optional -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.base_component import ( - autoPyTorchComponent, -) - -from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_backbone.\ +from autoPyTorch.pipeline.components.setup.network_backbone. \ forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder - from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, find_components, @@ -22,32 +10,11 @@ directory = os.path.split(__file__)[0] decoders = find_components(__package__, - directory, - BaseForecastingDecoder) + directory, + BaseForecastingDecoder) decoder_addons = ThirdPartyComponents(BaseForecastingDecoder) def add_decoder(encoder: BaseForecastingDecoder) -> None: decoder_addons.add_component(encoder) - - -class ForecastingDecoderChoice(NetworkBackboneChoice): - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available head components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all NetworkHeadComponents available - as choices for learning rate scheduling - """ - components = OrderedDict() - - components.update(decoders) - components.update(decoder_addons.components) - - return components - - diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index af98e5029..3971aff81 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -24,7 +24,7 @@ ForecastingNetworkStructure ) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import \ - decoders, decoder_addons, add_decoder + decoders, decoder_addons directory = os.path.split(__file__)[0] _encoders = find_components(__package__, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py index 2d3026fd5..f48599f1b 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py +++ 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py @@ -4,14 +4,13 @@ import os from collections import OrderedDict -from typing import Dict, Union, Optional, List, Type +from typing import Dict, Union, Optional from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, autoPyTorchComponent, find_components, ) -from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ base_forecasting_encoder import BaseForecastingEncoder diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py index 8aadabc14..f90f4de31 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py @@ -26,6 +26,7 @@ from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( ALL_DISTRIBUTIONS, DisForecastingStrategy ) +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdate class DummyEmbedding(torch.nn.Module): @@ -152,6 +153,27 @@ def test_encoder_choices(self): encoder_choices = encoder_choices.set_hyperparameters(sample) self.assertIsInstance(encoder_choices.choice.choice, BaseForecastingEncoder) + encoder_choices = ForecastingNetworkChoice(dataset_properties) + + update_seq = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='__choice__', + value_range=('seq_encoder',), + default_value='seq_encoder', ) + encoder_choices._apply_search_space_update(update_seq) + cs_seq = encoder_choices.get_hyperparameter_search_space(dataset_properties) + self.assertListEqual(list(cs_seq.get_hyperparameter('__choice__').choices), ['seq_encoder']) + + encoder_choices = ForecastingNetworkChoice(dataset_properties) + update_rnn_decoder_type = HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:block_1:RNNEncoder:decoder_type', + value_range=('MLPDecoder',), + default_value='MLPDecoder', ) + encoder_choices._apply_search_space_update(update_rnn_decoder_type) + cs_seq = encoder_choices.get_hyperparameter_search_space(dataset_properties) + hp_rnn_decoder_type = cs_seq.get_hyperparameter(update_rnn_decoder_type.hyperparameter) + self.assertListEqual(list(hp_rnn_decoder_type.choices), ['MLPDecoder']) + def test_base_encoder(self): window_size = self.fit_dictionary['window_size'] for uni_variant in (True, False): diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py index 075b479c4..e73da440d 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py @@ -10,26 +10,11 @@ from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder \ import SeqForecastingEncoderChoice from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding -from autoPyTorch.utils.common 
import HyperparameterSearchSpace, get_hyperparameter +from autoPyTorch.utils.common import HyperparameterSearchSpace from sklearn.pipeline import Pipeline from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder. \ - RNNEncoder import RNNEncoder -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder. \ - TCNEncoder import TCNEncoder -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder. \ - TransformerEncoder import TransformerEncoder -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder. \ - RNNEncoder import RNNEncoder - -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ - RNNDecoder import ForecastingRNNDecoder -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ - MLPDecoder import ForecastingMLPDecoder -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ - TransformerDecoder import ForecastingTransformerDecoder from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdate from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import ( From 4d9fe3094504561c4f2d694c7ccf6bd1c2e2b2f4 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 19 May 2022 11:08:32 +0200 Subject: [PATCH 270/347] test for recurrent decoders --- .../forecasting_networks/test_seq_encoder.py | 106 ++++++++++++++---- 1 file changed, 86 insertions(+), 20 deletions(-) diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py index e73da440d..e6c13731b 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py @@ -2,16 +2,17 @@ import unittest import torch from itertools import product +from sklearn.pipeline import Pipeline from test.test_pipeline.components.setup.forecasting.forecasting_networks.test_base_components import ( generate_fit_dict_and_dataset_property ) +from autoPyTorch.utils.common import HyperparameterSearchSpace from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder \ import SeqForecastingEncoderChoice from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding -from autoPyTorch.utils.common import HyperparameterSearchSpace -from sklearn.pipeline import Pipeline + from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead @@ -60,25 +61,24 @@ def test_config_space(self): self.assertEqual(len(fit_dict['network_decoder']), num_blocks) def test_deepar(self): - for i, valid_encoder in enumerate(['RNNEncoder', 'TCNEncoder', 'TransformerEncoder']): + for i, valid_encoder in 
enumerate(['RNNEncoder', 'TransformerEncoder', 'TCNEncoder', 'InceptionTimeEncoder']): seq_encoder_choice = SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties) update_ar = HyperparameterSearchSpaceUpdate(node_name="network_backbone", - hyperparameter='auto_regressive', - value_range=(True,), - default_value=True, ) + hyperparameter='auto_regressive', + value_range=(True,), + default_value=True, ) update_rnn_mlp = HyperparameterSearchSpaceUpdate(node_name="network_backbone", - hyperparameter='decoder_type', - value_range=('MLPDecoder',), - default_value='MLPDecoder', ) + hyperparameter='decoder_type', + value_range=('MLPDecoder',), + default_value='MLPDecoder', ) update_transformer_mlp = HyperparameterSearchSpaceUpdate(node_name="network_backbone", - hyperparameter='decoder_type', - value_range=('MLPDecoder',), - default_value='MLPDecoder', ) + hyperparameter='decoder_type', + value_range=('MLPDecoder',), + default_value='MLPDecoder', ) seq_encoder_choice._cs_updates = {"block_1:RNNEncoder:decoder_type": update_rnn_mlp, "block_1:TransformerEncoder:decoder_type": update_transformer_mlp, "block_1:MLPDecoder:auto_regressive": update_ar} - cs_seq = seq_encoder_choice.get_hyperparameter_search_space(dataset_properties=self.dataset_properties, include=[valid_encoder]) sample = cs_seq.get_default_configuration() @@ -102,7 +102,7 @@ def test_deepar(self): fit_dict['network_decoder']) head = fit_dict['network_head'] - if i != 1: + if i < 2: input_tensor = torch.randn([10, 20, 59]) # 53 + 6(lag values) input_tensor_future = torch.randn([10, 1, 59]) else: @@ -124,12 +124,82 @@ def test_deepar(self): output = head(net_decoder(x_future=None, encoder_output=encoder2decoder)) self.assertListEqual(list(output.shape), [10, 1, 1]) + def test_seq2seq(self): + n_prediction_steps = self.dataset_properties['n_prediction_steps'] + + for i, valid_encoder in enumerate(['RNNEncoder', 'TransformerEncoder']): + seq_encoder_choice = SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties) + + update_rnn_rnn = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='decoder_type', + value_range=('RNNDecoder',), + default_value='RNNDecoder', ) + update_trans_trans = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='decoder_type', + value_range=('TransformerDecoder',), + default_value='TransformerDecoder', ) + + seq_encoder_choice._cs_updates = {"block_1:RNNEncoder:decoder_type": update_rnn_rnn, + "block_1:TransformerEncoder:decoder_type": update_trans_trans} + decoder_auto_regressive = HyperparameterSearchSpace( + hyperparameter="decoder_auto_regressive", + value_range=(True,), + default_value=True, + ) + + cs_seq = seq_encoder_choice.get_hyperparameter_search_space(dataset_properties=self.dataset_properties, + decoder_auto_regressive=decoder_auto_regressive, + include=[valid_encoder]) + sample = cs_seq.get_default_configuration() + + seq_encoder_choice.set_hyperparameters(sample) + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = self.dataset_properties + + encoder_choices = seq_encoder_choice.fit(fit_dict) + fit_dict = encoder_choices.transform(fit_dict) + + head = ForecastingHead() + head = head.fit(fit_dict) + fit_dict = head.transform(fit_dict) + + net_encoder = StackedEncoder(fit_dict['network_structure'], False, + fit_dict['network_encoder'], fit_dict['network_decoder']) + net_decoder = StackedDecoder(fit_dict['network_structure'], net_encoder.encoder, + fit_dict['network_encoder'], + 
fit_dict['network_decoder']) + + head = fit_dict['network_head'] + + input_tensor = torch.randn([10, 20, 59]) # 53 + 6(lag values) + input_tensor_future = torch.randn([10, n_prediction_steps, 59]) + + encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor, + additional_input=[None], + cache_intermediate_state=True, + ) + output = head(net_decoder(x_future=input_tensor_future, encoder_output=encoder2decoder)) + self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) + + net_encoder.eval() + net_decoder.eval() + input_tensor_future = torch.randn([10, 1, 59]) + + encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor_future, + additional_input=[None], + output_seq=False, cache_intermediate_state=True, + incremental_update=True + ) + output = head(net_decoder(x_future=input_tensor_future, encoder_output=encoder2decoder)) + self.assertListEqual(list(output.shape), [10, 1, 1]) + def test_seq_models(self): update = HyperparameterSearchSpaceUpdate(node_name="network_backbone", hyperparameter='auto_regressive', value_range=(False,), default_value=False, ) - # TO avoid that default setting raises conflict for forbidden clauses + # To avoid that default setting raises conflict for forbidden clauses update_rnn_default = HyperparameterSearchSpaceUpdate(node_name="network_backbone", hyperparameter='decoder_type', value_range=('MLPDecoder', 'RNNDecoder'), @@ -247,8 +317,7 @@ def test_seq_models(self): decoder_output=decoder_output, past_observed_values=past_observed_values, decoder_length=n_prediction_steps, - ) - + ) output = head(decoder_output) self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) @@ -292,6 +361,3 @@ def test_seq_models(self): ) output = head(decoder_output) self.assertListEqual(list(output.shape), [10, 1, 1]) - - - From eb5a7ec32449ca00dc7e4f4e9abc7c54ddbad1d0 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 20 May 2022 00:36:16 +0200 Subject: [PATCH 271/347] test for network --- .../setup/network/forecasting_architecture.py | 405 ++++++++++-------- .../setup/network/forecasting_network.py | 3 +- .../forecasting_backbone/__init__.py | 2 +- .../forecasting_backbone/cells.py | 19 +- .../seq_encoder/__init__.py | 2 +- .../test_base_components.py | 4 + .../test_forecasting_architecture.py | 337 +++++++++++++++ .../forecasting_networks/test_seq_encoder.py | 2 +- .../test_forecasting_training_losses.py | 2 - 9 files changed, 588 insertions(+), 188 deletions(-) create mode 100644 test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 53a95cd27..a42abda77 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -333,6 +333,7 @@ def predict(self, past_targets: torch.Tensor, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, + past_observed_targets: Optional[torch.Tensor] = None, ): raise NotImplementedError @@ -350,6 +351,12 @@ def repeat_intermediate_values(self, intermediate_values[i] = repeated_value return intermediate_values + def pad_tensor(self, tensor_to_be_padded: torch.Tensor, target_length) -> torch.Tensor: + tensor_shape = tensor_to_be_padded.shape + padding_size = [tensor_shape[0], target_length - tensor_shape[1], tensor_shape[-1]] + 
tensor_to_be_padded = torch.cat([tensor_to_be_padded.new_zeros(padding_size), tensor_to_be_padded], dim=1) + return tensor_to_be_padded + class ForecastingNet(AbstractForecastingNet): def pre_processing(self, @@ -362,78 +369,91 @@ def pre_processing(self, variable_selector_kwargs: Dict = {}, ): if self.encoder_lagged_input: - past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( - past_targets[:, -self.window_size:], - past_observed_targets[:, -self.window_size:] - ) - past_targets[:, :-self.window_size] = torch.where( - past_observed_targets[:, :-self.window_size], - self.scale_value(past_targets[:, :-self.window_size], loc, scale), - past_targets[:, :-self.window_size]) - x_past, self.cached_lag_mask_encoder = get_lagged_subsequences(past_targets, - self.window_size, - self.encoder_lagged_value, - self.cached_lag_mask_encoder) + if self.window_size < past_targets.shape[1]: + past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( + past_targets[:, -self.window_size:], + past_observed_targets[:, -self.window_size:] + ) + past_targets[:, :-self.window_size] = torch.where( + past_observed_targets[:, :-self.window_size], + self.scale_value(past_targets[:, :-self.window_size], loc, scale), + past_targets[:, :-self.window_size]) + else: + past_targets, _, loc, scale = self.target_scaler( + past_targets, + past_observed_targets + ) + truncated_past_targets, self.cached_lag_mask_encoder = get_lagged_subsequences(past_targets, + self.window_size, + self.encoder_lagged_value, + self.cached_lag_mask_encoder) else: if self.window_size < past_targets.shape[1]: past_targets = past_targets[:, -self.window_size:] past_observed_targets = past_observed_targets[:, -self.window_size:] past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_targets) - x_past = past_targets + truncated_past_targets = past_targets + + if past_features is not None: + if self.window_size <= past_features.shape[1]: + past_features = past_features[:, -self.window_size:] + elif self.encoder_lagged_input: + past_features = self.pad_tensor(past_features, self.window_size) if self.network_structure.variable_selection: - batch_size = x_past.shape[0] - x_static = {} + batch_size = truncated_past_targets.shape[0] + feat_dict_static = {} if length_past > 0: if past_features is not None: - past_features = self.embedding(past_features[:, -self.window_size:].to(self.device)) - x_past = {'past_targets': x_past.to(device=self.device)} + past_features = self.embedding(past_features.to(self.device)) + feat_dict_past = {'past_targets': truncated_past_targets.to(device=self.device)} if past_features is not None: for feature_name in self.variable_selector.feature_names: tensor_idx = self.variable_selector.feature_names2tensor_idx[feature_name] if feature_name not in self.variable_selector.static_features: - x_past[feature_name] = past_features[:, :, tensor_idx[0]: tensor_idx[1]] + feat_dict_past[feature_name] = past_features[:, :, tensor_idx[0]: tensor_idx[1]] else: static_feature = past_features[:, 0, tensor_idx[0]: tensor_idx[1]] - x_static[feature_name] = static_feature + feat_dict_static[feature_name] = static_feature if hasattr(self.variable_selector, 'placeholder_features'): for placehold in self.variable_selector.placeholder_features: - x_past[placehold] = torch.zeros((batch_size, length_past, 1), - dtype=past_targets.dtype, - device=self.device) + feat_dict_past[placehold] = torch.zeros((batch_size, length_past, 1), + dtype=past_targets.dtype, + device=self.device) else: - x_past = 
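# NOTE (editorial aside, not part of the original patch): a minimal sketch of what the new pad_tensor
# helper above does. Series shorter than window_size are left-padded with zeros along the time axis so
# that lagged subsequences can still be built; shapes are assumed to be [batch, time, n_features]:
#     import torch
#     x = torch.randn(2, 3, 5)                                       # only 3 observed time steps
#     padded = torch.cat([x.new_zeros([2, 7 - 3, 5]), x], dim=1)     # same effect as pad_tensor(x, 7)
#     assert padded.shape == (2, 7, 5) and bool((padded[:, :4] == 0).all())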
None + feat_dict_past = None if length_future > 0: if future_features is not None: future_features = self.decoder_embedding(future_features.to(self.device)) - x_future = {} + feat_dict_future = {} if hasattr(self.variable_selector, 'placeholder_features'): for placehold in self.variable_selector.placeholder_features: - x_future[placehold] = torch.zeros((batch_size, - length_future, 1), - dtype=past_targets.dtype, - device=self.device) + feat_dict_future[placehold] = torch.zeros((batch_size, + length_future, 1), + dtype=past_targets.dtype, + device=self.device) else: - x_future = {} + feat_dict_future = {} if future_features is not None: for feature_name in self.variable_selector.known_future_features: tensor_idx = self.variable_selector.future_feature_name2tensor_idx[feature_name] if feature_name not in self.variable_selector.static_features: - x_future[feature_name] = future_features[:, :, tensor_idx[0]: tensor_idx[1]] + feat_dict_future[feature_name] = future_features[:, :, tensor_idx[0]: tensor_idx[1]] else: if length_past == 0: + # Otherwise static_feature is acquired when processing with encoder network static_feature = future_features[:, 0, tensor_idx[0]: tensor_idx[1]] - x_static[feature_name] = static_feature + feat_dict_static[feature_name] = static_feature else: - x_future = None + feat_dict_future = None x_past, x_future, x_static, static_context_initial_hidden = self.variable_selector( - x_past=x_past, - x_future=x_future, - x_static=x_static, + x_past=feat_dict_past, + x_future=feat_dict_future, + x_static=feat_dict_static, batch_size=batch_size, length_past=length_past, length_future=length_future, @@ -443,11 +463,10 @@ def pre_processing(self, return x_past, x_future, x_static, loc, scale, static_context_initial_hidden, past_targets else: if past_features is not None: - past_features = past_features[:, -self.window_size:] - x_past = torch.cat([x_past, past_features], dim=-1) + x_past = torch.cat([truncated_past_targets, past_features], dim=-1) x_past = self.embedding(x_past.to(device=self.device)) - if future_features is not None: + if future_features is not None and length_future > 0: future_features = self.decoder_embedding(future_features.to(self.device)) return x_past, future_features, None, loc, scale, None, past_targets @@ -470,6 +489,7 @@ def forward(self, encoder_additional = [static_context_initial_hidden] encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) + encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) decoder_output = self.decoder(x_future=x_future, encoder_output=encoder2decoder, @@ -478,7 +498,7 @@ def forward(self, if self.has_temporal_fusion: decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output, - past_observed_values=past_observed_targets, + past_observed_targets=past_observed_targets, decoder_length=self.n_prediction_steps, static_embedding=x_static ) @@ -549,19 +569,22 @@ def decoder_select_variable(self, future_targets: torch.tensor, future_features: future_targets = future_targets.to(self.device) if future_features is not None: future_features = self.decoder_embedding(future_features.to(self.device)) - x_future = {} + feat_dict_future = {} if hasattr(self.variable_selector, 'placeholder_features'): for placeholder in self.variable_selector.placeholder_features: - x_future[placeholder] = torch.zeros((batch_size, - length_future, 1), - dtype=future_targets.dtype, - device=self.device) + feat_dict_future[placeholder] = 
torch.zeros((batch_size, + length_future, 1), + dtype=future_targets.dtype, + device=self.device) + for feature_name in self.variable_selector.known_future_features: tensor_idx = self.variable_selector.future_feature_name2tensor_idx[feature_name] - x_future[feature_name] = future_features[:, :, tensor_idx[0]: tensor_idx[1]] - x_future['future_prediction'] = future_targets + if feature_name not in self.variable_selector.static_features: + feat_dict_future[feature_name] = future_features[:, :, tensor_idx[0]: tensor_idx[1]] + + feat_dict_future['future_prediction'] = future_targets _, x_future, _, _ = self.variable_selector(x_past=None, - x_future=x_future, + x_future=feat_dict_future, x_static=None, length_past=0, length_future=length_future, @@ -577,7 +600,7 @@ def forward(self, future_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.BoolTensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): - x_past, x_future, x_static, loc, scale, static_context_initial_hidden, past_targets = self.pre_processing( + x_past, _, x_static, loc, scale, static_context_initial_hidden, past_targets = self.pre_processing( past_targets=past_targets, past_observed_targets=past_observed_targets, past_features=past_features, @@ -602,16 +625,17 @@ def forward(self, future_targets = torch.cat([past_targets[:, [-1], :], future_targets[:, :-1, :]], dim=1) if self.network_structure.variable_selection: - x_future = self.decoder_select_variable(future_targets, future_features) + decoder_input = self.decoder_select_variable(future_targets, future_features) else: - x_future = future_targets if future_features is None else torch.cat([future_features, future_targets], - dim=-1) - x_future = self.decoder_embedding(x_future.to(self.device)) + decoder_input = future_targets if future_features is None else torch.cat([future_features, + future_targets], dim=-1) + decoder_input.to(self.device) + decoder_input = self.decoder_embedding(decoder_input) encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) - decoder_output = self.decoder(x_future=x_future, encoder_output=encoder2decoder, + decoder_output = self.decoder(x_future=decoder_input, encoder_output=encoder2decoder, pos_idx=(x_past.shape[1], x_past.shape[1] + self.n_prediction_steps)) if self.has_temporal_fusion: @@ -627,9 +651,6 @@ def forward(self, else: encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) - if future_features is not None: - future_features = future_features - if self.has_temporal_fusion: decoder_output_all = None @@ -640,26 +661,27 @@ def forward(self, for idx_pred in range(self.n_prediction_steps): predicted_target = predicted_target.cpu() if self.decoder_lagged_input: - x_future = torch.cat([past_targets, predicted_target], dim=1) - x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder_lagged_value) + past_targets = torch.cat([past_targets, predicted_target], dim=1) + ar_future_target = get_lagged_subsequences_inference(past_targets, 1, + self.decoder_lagged_value) else: - x_future = predicted_target[:, [-1]] + ar_future_target = predicted_target[:, [-1]] if self.network_structure.variable_selection: - x_future = self.decoder_select_variable( + decoder_input = self.decoder_select_variable( future_targets=predicted_target[:, -1:].to(self.device), future_features=future_features[:, [idx_pred]] if future_features is not None else None ) else: - x_future = x_future if future_features is None 
else torch.cat([x_future, - future_features[:, [idx_pred]], - ], - dim=-1) - x_future = x_future.to(self.device) - - x_future = self.decoder_embedding(x_future) - - decoder_output = self.decoder(x_future, + decoder_input = ar_future_target if future_features is None else torch.cat( + [future_features[:, [idx_pred]], + ar_future_target, + ], + dim=-1) + decoder_input = decoder_input.to(self.device) + decoder_input = self.decoder_embedding(decoder_input) + + decoder_output = self.decoder(decoder_input, encoder_output=encoder2decoder, pos_idx=(x_past.shape[1] + idx_pred, x_past.shape[1] + idx_pred + 1), cache_intermediate_state=True, @@ -718,7 +740,11 @@ def forward(self, repeated_predicted_target = repeated_past_target[:, [-1]] repeated_past_target = repeated_past_target[:, :-1, ] - repeated_time_feat = future_features.repeat_interleave( + repeated_x_static = x_static.repeat_interleave( + repeats=self.num_samples, dim=0 + ) if x_static is not None else None + + repeated_future_features = future_features.repeat_interleave( repeats=self.num_samples, dim=0 ) if future_features is not None else None @@ -730,28 +756,30 @@ def forward(self, for idx_pred in range(self.n_prediction_steps): if self.decoder_lagged_input: - x_future = torch.cat([repeated_past_target, repeated_predicted_target.cpu()], dim=1) - x_future = get_lagged_subsequences_inference(x_future, 1, self.decoder_lagged_value) + ar_future_target = torch.cat([repeated_past_target, repeated_predicted_target.cpu()], dim=1) + ar_future_target = get_lagged_subsequences_inference(ar_future_target, 1, + self.decoder_lagged_value) else: - x_future = repeated_predicted_target[:, [-1]] + ar_future_target = repeated_predicted_target[:, [-1]] if self.network_structure.variable_selection: - x_future = self.decoder_select_variable( - future_targets=x_future[:, -1:], - future_features=None if repeated_time_feat is None else repeated_time_feat[:, [idx_pred]]) + decoder_input = self.decoder_select_variable( + future_targets=ar_future_target, + future_features=None if repeated_future_features is None else + repeated_future_features[:, [idx_pred]]) else: - x_future = x_future if repeated_time_feat is None else torch.cat( - [repeated_time_feat[:, [idx_pred], :], x_future], dim=-1) + decoder_input = repeated_future_features if repeated_future_features is None else torch.cat( + [repeated_future_features[:, [idx_pred], :], ar_future_target], dim=-1) - x_future = x_future.to(self.device) + decoder_input = decoder_input.to(self.device) + decoder_input = self.decoder_embedding(decoder_input) - x_future = self.decoder_embedding(x_future) - - decoder_output = self.decoder(x_future, + decoder_output = self.decoder(decoder_input, encoder_output=encoder2decoder, pos_idx=(x_past.shape[1] + idx_pred, x_past.shape[1] + idx_pred + 1), cache_intermediate_state=True, incremental_update=idx_pred > 0) + if self.has_temporal_fusion: if decoder_output_all is not None: decoder_output_all = torch.cat([decoder_output_all, decoder_output], dim=1) @@ -761,8 +789,8 @@ def forward(self, decoder_output=decoder_output_all, past_observed_targets=past_observed_targets, decoder_length=idx_pred + 1, - static_embedding=x_static, - ) + static_embedding=repeated_x_static, + )[:, -1:,] net_output = self.head(decoder_output) samples = net_output.sample().cpu() @@ -826,24 +854,32 @@ def encoder_select_variable(self, past_targets: torch.tensor, past_features: Opt past_targets = past_targets.to(self.device) if past_features is not None: past_features = past_features.to(self.device) - x_past = [] + 
past_features = self.embedding(past_features) + feat_dict_past = {'past_targets': past_targets.to(device=self.device)} + feat_dict_static = {} if hasattr(self.variable_selector, 'placeholder_features'): for placehold in self.variable_selector.placeholder_features: - x_past[placehold] = torch.zeros((batch_size, length_past, 1), - dtype=past_targets.dtype, - device=self.device) + feat_dict_past[placehold] = torch.zeros((batch_size, length_past, 1), + dtype=past_targets.dtype, + device=self.device) + for feature_name in self.variable_selector.feature_names: - tensor_idx = self.variable_selector.future_feature_name2tensor_idx[feature_name] - x_past[feature_name] = past_features[:, :, tensor_idx[0]: tensor_idx[1]] - x_past, _, _, _ = self.variable_selector(x_past=x_past, - x_future=None, - x_static=None, - length_past=length_past, - length_future=0, - batch_size=batch_size, - **variable_selector_kwargs, - ) - return x_past + tensor_idx = self.variable_selector.feature_names2tensor_idx[feature_name] + if feature_name not in self.variable_selector.static_features: + feat_dict_past[feature_name] = past_features[:, :, tensor_idx[0]: tensor_idx[1]] + else: + static_feature = past_features[:, 0, tensor_idx[0]: tensor_idx[1]] + feat_dict_static[feature_name] = static_feature + + x_past, _, _, static_context_initial_hidden = self.variable_selector(x_past=feat_dict_past, + x_future=None, + x_static=feat_dict_static, + length_past=length_past, + length_future=0, + batch_size=batch_size, + **variable_selector_kwargs, + ) + return x_past, static_context_initial_hidden def forward(self, past_targets: torch.Tensor, @@ -852,17 +888,25 @@ def forward(self, future_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.Tensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): + encode_length = min(self.window_size, past_targets.shape[1]) + if self.training: if self.encoder_lagged_input: - past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( - past_targets[:, -self.window_size:], - past_observed_targets[:, -self.window_size:] - ) - - past_targets[:, :-self.window_size] = torch.where( - past_observed_targets[:, :-self.window_size], - self.scale_value(past_targets[:, :-self.window_size], loc, scale), - past_targets[:, :-self.window_size]) + if self.window_size < past_targets.shape[1]: + past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( + past_targets[:, -self.window_size:], + past_observed_targets[:, -self.window_size:] + ) + + past_targets[:, :-self.window_size] = torch.where( + past_observed_targets[:, :-self.window_size], + self.scale_value(past_targets[:, :-self.window_size], loc, scale), + past_targets[:, :-self.window_size]) + else: + past_targets, _, loc, scale = self.target_scaler( + past_targets, + past_observed_targets + ) future_targets = self.scale_value(future_targets, loc, scale) @@ -872,6 +916,7 @@ def forward(self, seq_length - 1, self.encoder_lagged_value, self.cached_lag_mask_encoder) + targets_all = targets_all[:, -(encode_length + self.n_prediction_steps - 1):] else: if self.window_size < past_targets.shape[1]: past_targets = past_targets[:, -self.window_size:] @@ -882,28 +927,33 @@ def forward(self, if self.network_structure.variable_selection: if past_features is not None: - past_features = past_features[:-self.window_size:] - features_all = torch.cat([past_features[:, 1:], future_features], dim=1) + past_features = past_features[:, -self.window_size:] + features_all = torch.cat([past_features, 
future_features[:, :-1]], dim=1) else: features_all = None - length_past = min(self.window_size, past_targets.shape[1]) + self.n_prediction_steps - x_input = self.encoder_select_variable(targets_all, past_features=features_all, length_past=length_past) + length_past = min(self.window_size, past_targets.shape[1]) + self.n_prediction_steps - 1 + encoder_input, static_context_initial_hidden = self.encoder_select_variable(targets_all, + past_features=features_all, + length_past=length_past) else: - x_input = targets_all if past_features is not None: - past_features = past_features[:, -self.window_size:] - features_all = torch.cat([past_features[:, 1:], future_features], dim=1) - x_input = torch.cat([features_all, targets_all], dim=-1) + if self.window_size <= past_features.shape[1]: + past_features = past_features[:, -self.window_size:] + + features_all = torch.cat([past_features, future_features[:, :-1]], dim=1) + encoder_input = torch.cat([features_all, targets_all], dim=-1) + else: + encoder_input = targets_all - x_input = x_input.to(self.device) + encoder_input = encoder_input.to(self.device) - x_input = self.embedding(x_input) + encoder_input = self.embedding(encoder_input) static_context_initial_hidden = None encoder_additional = [static_context_initial_hidden] encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) - encoder2decoder, encoder_output = self.encoder(encoder_input=x_input, + encoder2decoder, encoder_output = self.encoder(encoder_input=encoder_input, additional_input=encoder_additional, output_seq=True) @@ -915,67 +965,70 @@ def forward(self, return self.rescale_output(net_output, loc, scale, self.device) else: if self.encoder_lagged_input: - past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( - past_targets[:, -self.window_size:], - past_observed_targets[:, -self.window_size:], + if self.window_size < past_targets.shape[1]: + past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( + past_targets[:, -self.window_size:], + past_observed_targets[:, -self.window_size:], + ) + + past_targets[:, :-self.window_size] = torch.where( + past_observed_targets[:, :-self.window_size], + self.scale_value(past_targets[:, :-self.window_size], loc, scale), + past_targets[:, :-self.window_size]) + else: + past_targets, _, loc, scale = self.target_scaler( + past_targets, + past_observed_targets, + ) + + truncated_past_targets, self.cached_lag_mask_encoder_test = get_lagged_subsequences( + past_targets, + self.window_size, + self.encoder_lagged_value, + self.cached_lag_mask_encoder_test ) - - past_targets[:, :-self.window_size] = torch.where( - past_observed_targets[:, :-self.window_size], - self.scale_value(past_targets[:, :-self.window_size], loc, scale), - past_targets[:, :-self.window_size]) - - x_past, self.cached_lag_mask_encoder_test = get_lagged_subsequences(past_targets, - self.window_size, - self.encoder_lagged_value, - self.cached_lag_mask_encoder_test) + truncated_past_targets = truncated_past_targets[:, -encode_length:] else: if self.window_size < past_targets.shape[1]: past_targets = past_targets[:, -self.window_size:] past_observed_targets = past_observed_targets[:, -self.window_size] past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_targets) - x_past = past_targets + truncated_past_targets = past_targets if self.network_structure.variable_selection: if past_features is not None: - past_features = past_features[:-self.window_size:] - features_all = torch.cat([past_features[:, 1:], 
future_features[:, :1]], dim=1) + features_all = past_features[:, -self.window_size:] else: features_all = None - length_past = min(self.window_size, past_targets.shape[1]) variable_selector_kwargs = dict(cache_static_contex=True, use_cached_static_contex=False) - x_past = self.encoder_select_variable(x_past, - past_features=features_all, - length_past=length_past, - **variable_selector_kwargs) + + encoder_input, static_context_initial_hidden = self.encoder_select_variable(truncated_past_targets, + past_features=features_all, + length_past=encode_length, + **variable_selector_kwargs) else: if past_features is not None: - # features is one step ahead of target - if self.window_size > 1: - features_all = torch.cat([past_features[:, -self.window_size + 1:, ], - future_features], - dim=1) - else: - features_all = future_features + features_all = torch.cat([past_features[:, -encode_length:], future_features[:, :-1]], dim=1) else: features_all = None - x_past = x_past if features_all is None else torch.cat([features_all[:, :self.window_size], x_past], - dim=-1) - x_past = x_past.to(self.device) - # TODO consider static features - x_past = self.embedding(x_past) + encoder_input = truncated_past_targets if features_all is None else torch.cat( + [features_all[:, :encode_length], truncated_past_targets], dim=-1 + ) + + encoder_input = encoder_input.to(self.device) + encoder_input = self.embedding(encoder_input) static_context_initial_hidden = None all_samples = [] - batch_size = past_targets.shape[0] + batch_size: int = past_targets.shape[0] encoder_additional = [static_context_initial_hidden] encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) - encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, + encoder2decoder, encoder_output = self.encoder(encoder_input=encoder_input, additional_input=encoder_additional, cache_intermediate_state=True, ) @@ -984,6 +1037,7 @@ def forward(self, self.encoder.cached_intermediate_state, is_hidden_states=self.encoder.encoder_has_hidden_states, repeats=self.num_samples) + if self.network_structure.variable_selection: self.variable_selector.cached_static_contex = self.repeat_intermediate_values( [self.variable_selector.cached_static_contex], @@ -991,9 +1045,9 @@ def forward(self, repeats=self.num_samples)[0] if self.encoder_lagged_input: - max_lag_seq_length = max(max(self.encoder_lagged_value), self.window_size) + max_lag_seq_length = max(max(self.encoder_lagged_value), encode_length) else: - max_lag_seq_length = self.window_size + max_lag_seq_length = encode_length net_output = self.head(self.decoder(x_future=None, encoder_output=encoder2decoder)) @@ -1012,39 +1066,42 @@ def forward(self, dim=0).squeeze(1) if future_features is not None: - time_feature = future_features[:, 1:] + future_features = future_features[:, 1:] else: - time_feature = None + future_features = None - repeated_time_feat = time_feature.repeat_interleave( + repeated_future_features = future_features.repeat_interleave( repeats=self.num_samples, dim=0 - ) if time_feature is not None else None + ) if future_features is not None else None for k in range(1, self.n_prediction_steps): if self.encoder_lagged_input: repeated_past_target = torch.cat([repeated_past_target, all_samples[-1]], dim=1) - x_next = get_lagged_subsequences_inference(repeated_past_target, 1, self.encoder_lagged_value) + ar_future_target = get_lagged_subsequences_inference(repeated_past_target, 1, + self.encoder_lagged_value) else: - x_next = next_sample + ar_future_target = next_sample if 
self.network_structure.variable_selection: length_past = 1 variable_selector_kwargs = dict(use_cached_static_contex=True) - if repeated_time_feat is not None: - feature_next = repeated_time_feat[:, [k - 1]] + if repeated_future_features is not None: + feature_next = repeated_future_features[:, [k - 1]] else: feature_next = None - x_next = self.encoder_select_variable(x_next, past_features=feature_next, length_past=1, - **variable_selector_kwargs) + encoder_input, _ = self.encoder_select_variable(ar_future_target, past_features=feature_next, + length_past=1, + **variable_selector_kwargs) else: - if repeated_time_feat is not None: - x_next = torch.cat([repeated_time_feat[:, [k - 1]], x_next], dim=-1) - x_next = x_next.to(self.device) - - x_next = self.embedding(x_next) + if repeated_future_features is not None: + encoder_input = torch.cat([repeated_future_features[:, [k - 1]], ar_future_target], dim=-1) + else: + encoder_input = ar_future_target + encoder_input = encoder_input.to(self.device) + encoder_input = self.embedding(encoder_input) - encoder2decoder, _ = self.encoder(encoder_input=x_next, + encoder2decoder, _ = self.encoder(encoder_input=encoder_input, additional_input=[None] * self.network_structure.num_blocks, output_seq=False, cache_intermediate_state=True, incremental_update=True) @@ -1090,9 +1147,11 @@ def forward(self, future_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.Tensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ): - if self.window_size < past_targets.shape[1]: + if self.window_size <= past_targets.shape[1]: past_targets = past_targets[:, -self.window_size:] past_observed_targets = past_observed_targets[:, -self.window_size:] + else: + past_targets = self.pad_tensor(past_targets, self.window_size) past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_targets) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index f551db9db..490bcb904 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -52,9 +52,8 @@ def _required_fit_requirements(self): FitRequirement("feature_names", (Iterable,), user_defined=False, dataset_property=True), FitRequirement("feature_shapes", (Iterable,), user_defined=False, dataset_property=True), FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), - FitRequirement('static_features', (Tuple,), user_defined=True, dataset_property=False), + FitRequirement('static_features', (Tuple,), user_defined=True, dataset_property=True), FitRequirement('time_feature_names', (Iterable,), user_defined=True, dataset_property=True), - FitRequirement("static_features", (Tuple, ), user_defined=True, dataset_property=True) ] def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index 3eec8726b..87b64e538 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -289,7 +289,7 @@ def _defaults_network(self): return ['flat_network', 'seq_network'] - def fit(self, X: Dict[str, Any], y: Any) -> 
autoPyTorchComponent: + def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchComponent: """Handy method to check if a component is fitted Args: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 5188494f2..2b8d20521 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -93,14 +93,14 @@ def __init__(self, def forward(self, encoder_output: torch.Tensor, decoder_output: torch.Tensor, - past_observed_values: torch.BoolTensor, + past_observed_targets: torch.BoolTensor, decoder_length: int, static_embedding: Optional[torch.Tensor] = None): """ Args: encoder_output: the output of the last layer of encoder network decoder_output: the output of the last layer of decoder network - past_observed_values: observed values in the past + past_observed_targets: observed values in the past decoder_length: length of decoder network static_embedding: output of static variable selection network (if applible) """ @@ -120,10 +120,10 @@ def forward(self, # Attention encoder_out_length = encoder_output.shape[1] - past_observed_values = past_observed_values[:, -encoder_out_length:] - past_observed_values = past_observed_values.to(self.device) + past_observed_targets = past_observed_targets[:, -encoder_out_length:] + past_observed_targets = past_observed_targets.to(self.device) - mask = self.get_attention_mask(past_observed_values=past_observed_values, decoder_length=decoder_length) + mask = self.get_attention_mask(past_observed_targets=past_observed_targets, decoder_length=decoder_length) if mask.shape[-1] < attn_input.shape[1]: # in case that none of the samples has length greater than window_size mask = torch.cat([ @@ -155,7 +155,7 @@ def device(self, device: torch.device): self.to(device) self._device = device - def get_attention_mask(self, past_observed_values: torch.BoolTensor, decoder_length: int): + def get_attention_mask(self, past_observed_targets: torch.BoolTensor, decoder_length: int): """ https://github.com/jdb78/pytorch-forecasting/blob/master/pytorch_forecasting/models/ temporal_fusion_transformer/__init__.py @@ -174,7 +174,7 @@ def get_attention_mask(self, past_observed_values: torch.BoolTensor, decoder_len decoder_mask = attend_step >= predict_step # do not attend to steps where data is padded # this is the result of our padding strategy: we pad values at the start of the tensors - encoder_mask = ~past_observed_values.squeeze(-1) + encoder_mask = ~past_observed_targets.squeeze(-1) # combine masks along attended time - first encoder and then decoder mask = torch.cat( @@ -237,6 +237,8 @@ def __init__(self, # static_features should always be known beforehand known_future_features = tuple(known_future_features) + feature_names = tuple(feature_names) + time_feature_names = tuple(time_feature_names) if feature_names: for name in feature_names: @@ -368,6 +370,7 @@ def __init__(self, if network_encoder['block_1'].encoder_properties.has_hidden_states: n_hidden_states = network_encoder['block_1'].n_hidden_states + static_context_initial_hidden = [GatedResidualNetwork(input_size=self.hidden_size, hidden_size=self.hidden_size, output_size=self.hidden_size, @@ -546,7 +549,7 @@ def forward(self, elif self.encoder_output_type[i] == EncoderOutputForm.Sequence: encoder2decoder.append(fx) elif self.encoder_output_type[i] == 
EncoderOutputForm.SequenceLast: - if output_seq_i: + if output_seq_i and not output_seq: encoder2decoder.append(encoder_i.get_last_seq_value(fx).squeeze(1)) else: encoder2decoder.append(fx) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index 401016d00..b270c1761 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -469,7 +469,7 @@ def get_hyperparameter_search_space( forbidden_deep_ars = [] - hps_forbidden_deep_ar = [variable_selection, use_temporal_fusion] + hps_forbidden_deep_ar = [use_temporal_fusion] for hp_forbidden_deep_ar in hps_forbidden_deep_ar: if True in hp_forbidden_deep_ar.choices: forbidden_deep_ars.append(ForbiddenAndConjunction( diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py index f90f4de31..c125fd024 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py @@ -65,6 +65,7 @@ def generate_fit_dict_and_dataset_property(): output_shape = (n_prediction_steps, 1) time_feature_transform = [1, 2] + feature_names = ('f1', 'f2', 'f3', 'f4', 'f5') feature_shapes = {'f1': 10, 'f2': 10, 'f3': 10, 'f4': 10, 'f5': 10} known_future_features = ('f1', 'f2', 'f3', 'f4', 'f5') @@ -76,6 +77,7 @@ def generate_fit_dict_and_dataset_property(): known_future_features=known_future_features, n_prediction_steps=n_prediction_steps, encoder_can_be_auto_regressive=True, + feature_names=feature_names, is_small_preprocess=True, task_type=TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING], uni_variant=False, @@ -159,10 +161,12 @@ def test_encoder_choices(self): hyperparameter='__choice__', value_range=('seq_encoder',), default_value='seq_encoder', ) + encoder_choices._apply_search_space_update(update_seq) cs_seq = encoder_choices.get_hyperparameter_search_space(dataset_properties) self.assertListEqual(list(cs_seq.get_hyperparameter('__choice__').choices), ['seq_encoder']) + encoder_choices = ForecastingNetworkChoice(dataset_properties) update_rnn_decoder_type = HyperparameterSearchSpaceUpdate( node_name="network_backbone", diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py new file mode 100644 index 000000000..df4461c42 --- /dev/null +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py @@ -0,0 +1,337 @@ +import copy +import unittest + +import pytest +import torch + +from test.test_pipeline.components.setup.forecasting.forecasting_networks.test_base_components import ( + generate_fit_dict_and_dataset_property +) + +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetStandardScaler import TargetStandardScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetNoScaler import TargetNoScaler +from 
autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone import ForecastingNetworkChoice +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ( + ForecastingHead +) +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( + DisForecastingStrategy, + ALL_DISTRIBUTIONS, +) +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdate + +from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent +from autoPyTorch.pipeline.components.setup.network.forecasting_architecture import ( + get_lagged_subsequences, + get_lagged_subsequences_inference, + AbstractForecastingNet +) + + +class ReducedEmbedding(torch.nn.Module): + # a dummy reduced embedding, it simply cut row for each categorical features + def __init__(self, num_input_features, num_numerical_features: int): + super(ReducedEmbedding, self).__init__() + self.num_input_features = num_input_features + self.num_numerical_features = num_numerical_features + self.n_cat_features = len(num_input_features) - num_numerical_features + + def forward(self, x): + x = x[..., :-self.n_cat_features] + return x + + def get_partial_models(self, subset_features): + num_numerical_features = sum([sf < self.num_numerical_features for sf in subset_features]) + num_input_features = [self.num_input_features[sf] for sf in subset_features] + return ReducedEmbedding(num_input_features, num_numerical_features) + + +@pytest.fixture(params=['ForecastingNet', 'ForecastingSeq2SeqNet', 'ForecastingDeepARNet', 'NBEATSNet']) +def network_type(request): + return request.param + + +@pytest.fixture(params=['RNNEncoder', 'TCNEncoder']) +def network_encoder(request): + return request.param + + +@pytest.fixture(params=['ReducedEmbedding', 'NoEmbedding']) +def embedding(request): + return request.param + + +@pytest.fixture(params=['distribution_mean', 'distribution_sample', 'regression', 'quantile']) +def net_output_type(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def variable_selection(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def with_static_features(request): + return request.param + + +class TestForecastingNetworks: + dataset_properties, fit_dictionary = generate_fit_dict_and_dataset_property() + + def test_network_forward(self, + embedding, + net_output_type, + variable_selection, + with_static_features, + network_encoder, + network_type): + if network_type == 'ForecastingDeepARNet' and net_output_type != 'distribution_sample': + return + if network_type == 'ForecastingSeq2SeqNet' and network_encoder == 'TCNEncoder': + return + if network_type == 'NBEATSNet': + # NBEATS only needs one pass + if not (embedding == 'NoEmbedding' and net_output_type == 'regression' and + not variable_selection and not with_static_features and network_encoder == 'RNNEncoder'): + return + + dataset_properties = copy.copy(self.dataset_properties) + time_feature_names = ('t1', 't2') + dataset_properties['time_feature_names'] = time_feature_names + + if network_type != 'ForecastingDeepARNet': + dataset_properties['known_future_features'] = ('f1', 'f3', 'f5') + + if with_static_features: + dataset_properties['static_features'] = (0, 4) + else: + dataset_properties['static_features'] = tuple() + + fit_dictionary = copy.copy(self.fit_dictionary) 
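+        # Descriptive note (added comment, not part of the original commit): the copied fit
+        # dictionary is wired up below with the adapted dataset properties and a fitted target
+        # scaler; the requested network output type is then configured — distribution outputs get
+        # a DisForecastingStrategy, quantile outputs get the list of quantile values, and plain
+        # regression needs no extra setup.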
+ fit_dictionary['dataset_properties'] = dataset_properties + fit_dictionary['target_scaler'] = TargetStandardScaler().fit(fit_dictionary) + + if net_output_type.startswith("distribution"): + fit_dictionary['dist_forecasting_strategy'] = DisForecastingStrategy(list(ALL_DISTRIBUTIONS.keys())[0], + forecast_strategy= + net_output_type.split("_")[1]) + net_output_type = net_output_type.split("_")[0] + elif net_output_type == 'quantile': + fit_dictionary['quantile_values'] = [0.5, 0.1, 0.9] + + fit_dictionary['net_output_type'] = net_output_type + + if embedding == 'NoEmbedding': + fit_dictionary['network_embedding'] = _NoEmbedding() + else: + fit_dictionary['network_embedding'] = ReducedEmbedding([10] * 5, 2) + dataset_properties['feature_shapes'] = {'f1': 10, 'f2': 10, 'f3': 9, 'f4': 9, 'f5': 9} + + n_prediction_steps = dataset_properties['n_prediction_steps'] + window_size = fit_dictionary['window_size'] + n_features_past = 10 * len(dataset_properties['feature_names']) + len(time_feature_names) + n_features_future = 10 * len(dataset_properties['known_future_features']) + len(time_feature_names) + n_targets = 1 + + backbone = ForecastingNetworkChoice(dataset_properties) + head = ForecastingHead() + network = ForecastingNetworkComponent() + + if network_type == 'NBEATSNet': + updates = [HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='__choice__', + value_range=('flat_encoder',), + default_value='flat_encoder', )] + include = ['flat_encoder:NBEATSEncoder'] + + else: + updates = [HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='__choice__', + value_range=('seq_encoder',), + default_value='seq_encoder', ), + HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='seq_encoder:num_blocks', + value_range=(1, 1), + default_value=1, ), + ] + include = [f'seq_encoder:{network_encoder}'] + + if network_type == 'ForecastingNet': + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:block_1:MLPDecoder:auto_regressive', + value_range=(False,), + default_value=False, )) + + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:decoder_auto_regressive', + value_range=(False,), + default_value=False, )) + elif network_type == 'ForecastingSeq2SeqNet': + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:block_1:RNNEncoder:decoder_type', + value_range=("RNNDecoder",), + default_value="RNNDecoder", )) + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:decoder_auto_regressive', + value_range=(True,), + default_value=True, )) + elif network_type == 'ForecastingDeepARNet': + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:block_1:RNNEncoder:decoder_type', + value_range=('MLPDecoder',), + default_value='MLPDecoder', )) + + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:block_1:MLPDecoder:auto_regressive', + value_range=(True,), + default_value=True, )) + + if variable_selection: + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:variable_selection', + value_range=(True,), + default_value=True, )) + else: + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + 
hyperparameter='seq_encoder:variable_selection', + value_range=(False,), + default_value=False, )) + + for update in updates: + backbone._apply_search_space_update(update) + + cs = backbone.get_hyperparameter_search_space(dataset_properties=dataset_properties, include=include) + + sample = cs.sample_configuration() + backbone.set_hyperparameters(sample) + + backbone = backbone.fit(fit_dictionary) + fit_dictionary = backbone.transform(fit_dictionary) + + head = head.fit(fit_dictionary) + fit_dictionary = head.transform(fit_dictionary) + + network = network.fit(fit_dictionary) + fit_dictionary = network.transform(fit_dictionary) + + neu_arch = fit_dictionary['network'] + + assert isinstance(neu_arch, AbstractForecastingNet) + batch_size = 2 + + past_targets = torch.ones([batch_size, 50, n_targets]) + past_features = torch.ones([batch_size, 50, n_features_past]) + future_features = torch.ones([batch_size, n_prediction_steps, n_features_future]) + future_targets = torch.ones([batch_size, n_prediction_steps, n_targets]) + past_observed_targets = torch.ones([batch_size, 50, n_targets]).bool() + + output = neu_arch(past_targets=past_targets, + future_targets=future_targets, + past_features=past_features, + future_features=future_features, + past_observed_targets=past_observed_targets) + + if net_output_type.startswith('distribution'): + assert isinstance(output, torch.distributions.Distribution) + output = output.mean + elif net_output_type == 'quantile': + assert len(output) == 3 + output = output[0] + if network_type in ["ForecastingNet", "ForecastingSeq2SeqNet"]: + assert list(output.shape) == [batch_size, n_prediction_steps, n_targets] + elif network_type == "ForecastingDeepARNet": + assert list(output.shape) == [batch_size, n_prediction_steps + min(50, neu_arch.window_size) - 1, n_targets] + else: + backcast = output[0] + forecast = output[1] + assert list(backcast.shape) == [batch_size, window_size, n_targets] + assert list(forecast.shape) == [batch_size, n_prediction_steps, n_targets] + + neu_arch.eval() + output = neu_arch.predict(past_targets=past_targets, + past_features=past_features, + future_features=future_features, + past_observed_targets=past_observed_targets) + + assert list(output.shape) == [batch_size, n_prediction_steps, n_targets] + + neu_arch.train() + + past_targets = torch.ones([batch_size, 3, n_targets]) + past_features = torch.ones([batch_size, 3, n_features_past]) + future_features = torch.ones([batch_size, n_prediction_steps, n_features_future]) + future_targets = torch.ones([batch_size, n_prediction_steps, n_targets]) + past_observed_targets = torch.ones([batch_size, 3, n_targets]).bool() + + output = neu_arch(past_targets=past_targets, + future_targets=future_targets, + past_features=past_features, + future_features=future_features, + past_observed_targets=past_observed_targets) + if net_output_type.startswith('distribution'): + assert isinstance(output, torch.distributions.Distribution) + output = output.mean + elif net_output_type == 'quantile': + assert len(output) == 3 + output = output[0] + if network_type in ["ForecastingNet", "ForecastingSeq2SeqNet"]: + assert list(output.shape) == [batch_size, n_prediction_steps, n_targets] + elif network_type == "ForecastingDeepARNet": + assert list(output.shape) == [batch_size, n_prediction_steps + min(3, neu_arch.window_size) - 1, n_targets] + else: + backcast = output[0] + forecast = output[1] + assert list(backcast.shape) == [batch_size, window_size, n_targets] + assert list(forecast.shape) == [batch_size, 
n_prediction_steps, n_targets] + + if network_type in ["ForecastingNet", "ForecastingSeq2SeqNet"]: + assert list(output.shape) == [batch_size, n_prediction_steps, n_targets] + neu_arch.eval() + + output = neu_arch.predict(past_targets=past_targets, + past_features=past_features, + future_features=future_features, + past_observed_targets=past_observed_targets) + + assert list(output.shape) == [batch_size, n_prediction_steps, n_targets] + + +class TestForecastingNetworkUtil(unittest.TestCase): + def test_get_lagged_values(self): + seq_raw = torch.arange(10).reshape([1, -1, 1]).float() + window_size = 3 + lag_sequence = [0, 1, 2, 3, 5] + lagged_seq1, mask = get_lagged_subsequences(seq_raw, window_size, lag_sequence) + lagged_seq2, _ = get_lagged_subsequences(seq_raw, window_size, lag_sequence, mask) + lagged_seq3 = get_lagged_subsequences_inference(seq_raw, window_size, lag_sequence) + + self.assertTrue(torch.equal(lagged_seq1, lagged_seq2)) + self.assertTrue(torch.equal(lagged_seq2, lagged_seq3)) + self.assertTrue(torch.equal(lagged_seq1[0], torch.Tensor([[7, 6, 5, 4, 2], + [8, 7, 6, 5, 3], + [9, 8, 7, 6, 4]]).float())) + self.assertListEqual(list(mask.shape), [len(lag_sequence), max(lag_sequence) + window_size]) + + seq_raw = torch.arange(5, 5 + 3).reshape([1, -1, 1]).float() + window_size = 3 + lag_sequence = [0, 1, 2, 3, 5] + lagged_seq1, mask = get_lagged_subsequences(seq_raw, window_size, lag_sequence) + lagged_seq2, mask2 = get_lagged_subsequences(seq_raw, window_size, lag_sequence, mask) + lagged_seq3 = get_lagged_subsequences_inference(seq_raw, window_size, lag_sequence) + + self.assertTrue(torch.all(lagged_seq1 == lagged_seq2)) + self.assertTrue(torch.all(lagged_seq2 == lagged_seq3)) + self.assertTrue(torch.equal(lagged_seq1[0], torch.Tensor([[5, 0, 0, 0, 0], + [6, 5, 0, 0, 0], + [7, 6, 5, 0, 0]]).float())) diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py index e6c13731b..84b87a6f5 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py @@ -315,7 +315,7 @@ def test_seq_models(self): if hp_use_temporal_fusion: decoder_output = temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output, - past_observed_values=past_observed_values, + past_observed_targets=past_observed_values, decoder_length=n_prediction_steps, ) diff --git a/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py b/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py index b626299ae..3fc1f491a 100644 --- a/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py +++ b/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py @@ -1,5 +1,3 @@ -import torch - import copy import unittest From 0ea372ee74008daa17fa1399194fc5ff7469d61a Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 20 May 2022 00:37:34 +0200 Subject: [PATCH 272/347] maint --- .../forecasting/forecasting_networks/test_seq_encoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py index 84b87a6f5..ee9c5a7ac 100644 --- 
a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py @@ -341,7 +341,7 @@ def test_seq_models(self): temporal_fusion.eval() decoder_output = temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output, - past_observed_values=past_observed_values, + past_observed_targets=past_observed_values, decoder_length=1, ) output = head(decoder_output) @@ -356,7 +356,7 @@ def test_seq_models(self): if hp_use_temporal_fusion: decoder_output = temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output, - past_observed_values=past_observed_values, + past_observed_targets=past_observed_values, decoder_length=1, ) output = head(decoder_output) From 1b7ebbe00e4461fd8e111ea2282fdec4f9d67c46 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 20 May 2022 14:52:33 +0200 Subject: [PATCH 273/347] test for architecture --- autoPyTorch/datasets/time_series_dataset.py | 2 + .../setup/network/forecasting_architecture.py | 12 +- .../forecasting_backbone/__init__.py | 3 + .../forecasting_decoder/MLPDecoder.py | 8 +- .../forecasting_encoder/__init__.py | 3 +- .../seq_encoder/__init__.py | 189 ++++++++++++------ .../test_base_components.py | 110 +++++----- .../test_forecasting_architecture.py | 51 ++++- .../forecasting_networks/test_seq_encoder.py | 17 +- 9 files changed, 261 insertions(+), 134 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 0d4fc0567..bb29d96bb 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -948,6 +948,8 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> 'input_shape': self.input_shape, 'time_feature_transform': self.time_feature_transform, 'uni_variant': self.is_uni_variant, + 'static_features_shape': len(self.static_features), + 'future_feature_shapes': (self.n_prediction_steps, len(self.known_future_features)), 'targets_have_missing_values': self.train_tensors[1].isnull().values.any(), 'encoder_can_be_auto_regressive': self.encoder_can_be_auto_regressive, 'features_have_missing_values': False if self.train_tensors[0] is None diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index a42abda77..b5259e011 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -463,9 +463,10 @@ def pre_processing(self, return x_past, x_future, x_static, loc, scale, static_context_initial_hidden, past_targets else: if past_features is not None: - x_past = torch.cat([truncated_past_targets, past_features], dim=-1) - - x_past = self.embedding(x_past.to(device=self.device)) + x_past = torch.cat([truncated_past_targets, past_features], dim=-1).to(device=self.device) + x_past = self.embedding(x_past.to(device=self.device)) + else: + x_past = self.embedding(truncated_past_targets.to(device=self.device)) if future_features is not None and length_future > 0: future_features = self.decoder_embedding(future_features.to(self.device)) return x_past, future_features, None, loc, scale, None, past_targets @@ -495,6 +496,7 @@ def forward(self, decoder_output = self.decoder(x_future=x_future, encoder_output=encoder2decoder, pos_idx=(x_past.shape[1], x_past.shape[1] + 
self.n_prediction_steps)) + if self.has_temporal_fusion: decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output, @@ -629,7 +631,7 @@ def forward(self, else: decoder_input = future_targets if future_features is None else torch.cat([future_features, future_targets], dim=-1) - decoder_input.to(self.device) + decoder_input = decoder_input.to(self.device) decoder_input = self.decoder_embedding(decoder_input) encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, @@ -768,7 +770,7 @@ def forward(self, future_features=None if repeated_future_features is None else repeated_future_features[:, [idx_pred]]) else: - decoder_input = repeated_future_features if repeated_future_features is None else torch.cat( + decoder_input = ar_future_target if repeated_future_features is None else torch.cat( [repeated_future_features[:, [idx_pred], :], ar_future_target], dim=-1) decoder_input = decoder_input.to(self.device) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index 87b64e538..1bcd1b59e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -257,9 +257,12 @@ def set_hyperparameters(self, choice_component = self.get_components()[choice] + updates = self._get_search_space_updates(prefix=choice) + self.new_params = new_params sub_configuration_space = choice_component.get_hyperparameter_search_space( self.dataset_properties, + **updates ) sub_configuration = Configuration(sub_configuration_space, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index 96dff0b72..be3493014 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -32,18 +32,18 @@ def __init__(self, def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor, pos_idx: Optional[Tuple[int]] = None): + if not self.auto_regressive: + if len(encoder_output.shape) == 3: + encoder_output = encoder_output.squeeze(1) + if x_future is None or self.auto_regressive: # for auto-regressive model, x_future is fed to the encoders x = self.global_layers(encoder_output) if self.local_layers is None: return x else: - # auto regressive model does not have local layers return self.local_layers(x) - if len(encoder_output.shape) == 3: - encoder_output = encoder_output.squeeze(1) - if self.local_layers is None: x = torch.concat([encoder_output, x_future.flatten(-2)], dim=-1) return self.global_layers(x) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index 3971aff81..32c774cf7 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -312,8 +312,7 @@ def set_hyperparameters(self, new_params = 
{} params = configuration.get_dictionary() - choice = params['__choice__'] - del params['__choice__'] + choice = params.pop('__choice__') for param, value in params.items(): param = param.replace(choice + ':', '') diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index b270c1761..803d42231 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -2,6 +2,7 @@ from collections import OrderedDict from typing import Dict, Optional, List, Any, Union from sklearn.pipeline import Pipeline +import inspect from ConfigSpace.hyperparameters import ( Constant, @@ -30,6 +31,8 @@ from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. \ base_forecasting_encoder import BaseForecastingEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ + base_forecasting_decoder import BaseForecastingDecoder from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ ForecastingNetworkStructure from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.other_components.TemporalFusion import \ @@ -215,21 +218,17 @@ def get_hyperparameter_search_space( cond_vs_dropout = EqualsCondition(variable_selection_use_dropout, variable_selection, True) cond_vs_dropoutrate = EqualsCondition(variable_selection_dropout_rate, variable_selection_use_dropout, True) cs.add_conditions([cond_vs_dropout, cond_vs_dropoutrate]) - - add_forbidden_for_non_ar_recurrent_decoder = False - if static_features_shape + future_feature_shapes[-1] == 0: - add_forbidden_for_non_ar_recurrent_decoder = True if True in variable_selection.choices: cs.add_hyperparameter(share_single_variable_networks) cs.add_condition(EqualsCondition(share_single_variable_networks, variable_selection, True)) # Compile a list of legal preprocessors for this problem - available_encoders = self.get_available_components( + available_encoders: Dict[str, BaseForecastingEncoder] = self.get_available_components( dataset_properties=dataset_properties, include=include, exclude=exclude) - available_decoders = self.get_available_components( + available_decoders: Dict[str, BaseForecastingDecoder] = self.get_available_components( dataset_properties=dataset_properties, include=None, exclude=exclude, components=self.get_decoder_components()) @@ -250,11 +249,32 @@ def get_hyperparameter_search_space( forbiddens_decoder_auto_regressive = [] + # TODO this is only a temporary solution, needs to be updated when ConfigSpace allows more complex conditions! 
+ # General Idea to work with auto-regressive decoders: + # decoder cannot be auto-regressive if it is not recurrent + # decoder_auto_regressive is conditioned on the HPs that allow recurrent decoders: + # encoders that only have recurrent decoders -> EqCond(dar, encoder, en_name) + # decoder_types of Encoders that contain recurrent decoders -> EqCond(dar, encoder:de_type, de_name) + # + # When no future data can be fed to the decoder (no future features), decoder must be auto-regressive: + # disable the recurrent decoders without auto-regressive or variable selection + # this is judged by add_forbidden_for_non_ar_recurrent_decoder + if True in decoder_auto_regressive.choices: forbidden_decoder_ar = ForbiddenEqualsClause(decoder_auto_regressive, True) else: forbidden_decoder_ar = None - + + add_forbidden_for_non_ar_recurrent_decoder = False + if static_features_shape + future_feature_shapes[-1] == 0: + if False in decoder_auto_regressive.choices and False in variable_selection.choices: + add_forbidden_for_non_ar_recurrent_decoder = True + + if len(decoder_auto_regressive.choices) == 1 and True in decoder_auto_regressive.choices: + conds_decoder_ar = None + else: + conds_decoder_ar = [] + for i in range(1, int(max_num_blocks) + 1): block_prefix = f'block_{i}:' @@ -274,6 +294,23 @@ def get_hyperparameter_search_space( list(available_encoders.keys()), default_value=default ) + if conds_decoder_ar is None: + # In this case we only allow encoders that has recurrent decoders + available_encoders_w_recurrent_decoder = [] + for encoder_name in hp_encoder.choices: + decoders = available_encoders[encoder_name].allowed_decoders() + for decoder_name in decoders: + if available_decoders[decoder_name].decoder_properties().recurrent: + available_encoders_w_recurrent_decoder.append(encoder_name) + break + if not available_encoders_w_recurrent_decoder: + raise ValueError('If only auto-regressive decoder is allowed, at least one encoder must contain ' + 'recurrent decoder!') + hp_encoder = CategoricalHyperparameter( + block_prefix + '__choice__', + available_encoders_w_recurrent_decoder, + default_value=available_encoders_w_recurrent_decoder[0]) + cs.add_hyperparameter(hp_encoder) if i > int(min_num_blocks): cs.add_condition( @@ -296,6 +333,23 @@ def get_hyperparameter_search_space( if not set(hp_decoder_choice).issubset(allowed_decoders): raise ValueError( 'The encoder hyperparameter decoder_type must be a subset of the allowed_decoders') + recurrent_decoders = [] + for decoder_name in allowed_decoders: + if available_decoders[decoder_name].decoder_properties().recurrent: + recurrent_decoders.append(decoder_name) + if conds_decoder_ar is None: + if recurrent_decoders: + updates['decoder_type'] = HyperparameterSearchSpace('decoder_type', + tuple(recurrent_decoders), + recurrent_decoders[0] + ) + config_space = available_encoders[encoder_name].get_hyperparameter_search_space( + dataset_properties, + **updates) + hp_decoder_choice = recurrent_decoders + else: + cs.add_forbidden_clause(ForbiddenEqualsClause(hp_encoder, encoder_name)) + allowed_decoders = hp_decoder_choice valid_decoders = [] for decoder_name in allowed_decoders: @@ -375,15 +429,28 @@ def get_hyperparameter_search_space( cs.add_conditions(conditions_to_add) - if forbidden_decoder_ar is not None: - forbiddens_ar_non_recurrent = [] - for encoder in hp_encoder.choices: - if encoder in encoder_with_single_decoder: - if not available_decoders[encoder2decoder[encoder][0]].decoder_properties().recurrent: - 
forbiddens_ar_non_recurrent.append(ForbiddenAndConjunction( - forbidden_decoder_ar, - ForbiddenEqualsClause(hp_encoder, encoder) - )) + if conds_decoder_ar is not None or forbidden_decoder_ar is not None: + forbiddens_ar_non_recurrent = [] + for encoder in hp_encoder.choices: + if len(encoder2decoder[encoder]) == 1: + if available_decoders[encoder2decoder[encoder][0]].decoder_properties().recurrent: + # conds_decoder_ar is not None: False can be in decoder_auto_regressive. In this case, + # if hp_encoder selects encoder, then decoder_auto_regressive becomes inactiavte + # (indicates a default decoder_auto_regressive=False, thus we need to add another + # forbidden incase add_forbidden_for_non_ar_recurrent_decoder is required) + # forbidden_decoder_ar is not None: only False in decoder_auto_regressive + # add_forbidden_for_non_ar_recurrent_decoder is True:False in decoder_auto_regressive + if conds_decoder_ar is not None: + conds_decoder_ar.append( + EqualsCondition(decoder_auto_regressive, hp_encoder, encoder) + ) + if add_forbidden_for_non_ar_recurrent_decoder: + forbiddens_decoder_auto_regressive.append( + ForbiddenAndConjunction( + ForbiddenEqualsClause(variable_selection, False), + ForbiddenEqualsClause(hp_encoder, encoder) + ) + ) else: if add_forbidden_for_non_ar_recurrent_decoder: forbiddens_decoder_auto_regressive.append( @@ -395,28 +462,38 @@ def get_hyperparameter_search_space( ForbiddenEqualsClause(hp_encoder, encoder) ) ) - - elif encoder in encoders_with_multi_decoder: - hp_decoder_type = cs.get_hyperparameter(f'{block_prefix + encoder}:decoder_type') - for decoder in hp_decoder_type.choices: - if not available_decoders[decoder].decoder_properties().recurrent: - forbiddens_ar_non_recurrent.append(ForbiddenAndConjunction( - forbidden_decoder_ar, - ForbiddenEqualsClause(hp_decoder_type, decoder) - )) - else: - if add_forbidden_for_non_ar_recurrent_decoder: - forbiddens_decoder_auto_regressive.append( + + elif len(encoder2decoder[encoder]) > 1: + hp_decoder_type = cs.get_hyperparameter(f'{block_prefix + encoder}:decoder_type') + for decoder in hp_decoder_type.choices: + if not available_decoders[decoder].decoder_properties().recurrent: + # TODO this is a temporary solution: currently ConfigSpace is not able to correctly + # activate/deactivate a complex nested configspace; Too many forbiddens might also rise + # errors. 
Thus we only allow decoder_ar to be conditioned on the top layer hps and + # put forbiddenclauses here + if forbidden_decoder_ar is not None: + forbiddens_decoder_auto_regressive.append( + ForbiddenAndConjunction( + forbidden_decoder_ar, + ForbiddenEqualsClause(hp_decoder_type, decoder) + ) + ) + else: + if add_forbidden_for_non_ar_recurrent_decoder: + forbiddens_decoder_auto_regressive.append( + ForbiddenAndConjunction( ForbiddenAndConjunction( - ForbiddenAndConjunction( - ForbiddenEqualsClause(variable_selection, False), - ForbiddenEqualsClause(decoder_auto_regressive, False) - ), - ForbiddenEqualsClause(hp_decoder_type, decoder) - ) + ForbiddenEqualsClause(variable_selection, False), + ForbiddenEqualsClause(decoder_auto_regressive, False) + ), + ForbiddenEqualsClause(hp_decoder_type, decoder) ) + ) + if forbiddens_ar_non_recurrent: cs.add_forbidden_clauses(forbiddens_ar_non_recurrent) + if conds_decoder_ar: + cs.add_condition(OrConjunction(*conds_decoder_ar)) use_temporal_fusion = get_hyperparameter(use_temporal_fusion, CategoricalHyperparameter) @@ -512,6 +589,8 @@ def get_hyperparameter_search_space( )) cs.add_forbidden_clauses(forbidden_mlp_local_layer) + cs.get_children_of(decoder_auto_regressive) + return cs @property @@ -538,33 +617,18 @@ def set_hyperparameters(self, """ params = configuration.get_dictionary() - num_blocks = params['num_blocks'] - decoder_auto_regressive = params['decoder_auto_regressive'] - use_temporal_fusion = params['use_temporal_fusion'] - forecasting_structure_kwargs = dict(num_blocks=num_blocks, - use_temporal_fusion=use_temporal_fusion, - variable_selection=params['variable_selection'], - skip_connection=params['skip_connection']) - if 'share_single_variable_networks' in params: - forecasting_structure_kwargs['share_single_variable_networks'] = params['share_single_variable_networks'] - del params['share_single_variable_networks'] - - del params['num_blocks'] - del params['use_temporal_fusion'] - del params['variable_selection'] - del params['skip_connection'] - del params['decoder_auto_regressive'] - - if 'skip_connection_type' in params: - forecasting_structure_kwargs['skip_connection_type'] = params['skip_connection_type'] - del params['skip_connection_type'] - if 'grn_use_dropout' in params: - del params['grn_use_dropout'] - if 'grn_dropout_rate' in params: - forecasting_structure_kwargs['grn_dropout_rate'] = params['grn_dropout_rate'] - del params['grn_dropout_rate'] - else: - forecasting_structure_kwargs['grn_dropout_rate'] = 0.0 + decoder_auto_regressive = params.pop('decoder_auto_regressive', False) + net_structure_default_kwargs = inspect.signature(ForecastingNetworkStructure.__init__).parameters + + forecasting_structure_kwargs = { + key: params.pop(key, value.default) for key, value in net_structure_default_kwargs.items() + if key != 'self' + } + if not params.pop('grn_use_dropout', False): + forecasting_structure_kwargs['grn_dropout_rate'] = 0.0 + + num_blocks = forecasting_structure_kwargs['num_blocks'] + use_temporal_fusion = forecasting_structure_kwargs['use_temporal_fusion'] pipeline_steps = [('net_structure', ForecastingNetworkStructure(**forecasting_structure_kwargs))] self.encoder_choice = [] @@ -576,8 +640,7 @@ def set_hyperparameters(self, new_params = {} block_prefix = f'block_{i}:' - choice = params[block_prefix + '__choice__'] - del params[block_prefix + '__choice__'] + choice = params.pop(block_prefix + '__choice__') for param, value in params.items(): if param.startswith(block_prefix): diff --git 
a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py index c125fd024..e3162c13d 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py @@ -1,4 +1,5 @@ import copy +import itertools import unittest from ConfigSpace import Configuration @@ -180,59 +181,62 @@ def test_encoder_choices(self): def test_base_encoder(self): window_size = self.fit_dictionary['window_size'] - for uni_variant in (True, False): - for variable_selection in (True, False): - for transform_time_features in (True, False): - for is_small_preprocess in (True, False): - network_structure = NetworkStructure(variable_selection=variable_selection) - - dataset_properties = copy.copy(self.dataset_properties) - fit_dictionary = copy.copy(self.fit_dictionary) - - dataset_properties['is_small_preprocess'] = is_small_preprocess - dataset_properties['uni_variant'] = uni_variant - - fit_dictionary['dataset_properties'] = self.dataset_properties - fit_dictionary['network_structure'] = network_structure - fit_dictionary['transform_time_features'] = transform_time_features - fit_dictionary['dataset_properties'] = dataset_properties - - encoder_block_1 = copy.deepcopy(self.encoder) - - encoder_block_2 = copy.deepcopy(self.encoder) - encoder_block_2.block_number = 2 - - encoder_block_1 = encoder_block_1.fit(fit_dictionary) - fit_dictionary = encoder_block_1.transform(fit_dictionary) - network_encoder = fit_dictionary['network_encoder'] - self.assertIsInstance(network_encoder['block_1'], EncoderBlockInfo) - self.assertEqual(network_encoder['block_1'].encoder_output_shape, (1, 10)) - - if variable_selection: - self.assertEqual(network_encoder['block_1'].encoder_input_shape, (window_size, 10)) - else: - if uni_variant: - n_input_features = 0 - else: - if is_small_preprocess: - n_input_features = 40 - else: - n_input_features = 15 - - if transform_time_features: - n_input_features += len(dataset_properties['time_feature_transform']) - - n_input_features += dataset_properties['output_shape'][-1] - self.assertEqual(network_encoder['block_1'].encoder_input_shape, (window_size, - n_input_features)) - - encoder_block_2 = encoder_block_2.fit(fit_dictionary) - fit_dictionary = encoder_block_2.transform(fit_dictionary) - - network_encoder = fit_dictionary['network_encoder'] - self.assertIsInstance(network_encoder['block_2'], EncoderBlockInfo) - self.assertEqual(network_encoder['block_2'].encoder_output_shape, (1, 10)) - self.assertEqual(network_encoder['block_2'].encoder_input_shape, (1, 10)) + all_settings = [(True, False)] * 4 + for hp_values in itertools.product(*all_settings): + uni_variant = hp_values[0] + variable_selection = hp_values[1] + transform_time_features = hp_values[2] + is_small_preprocess = hp_values[3] + + network_structure = NetworkStructure(variable_selection=variable_selection) + + dataset_properties = copy.copy(self.dataset_properties) + fit_dictionary = copy.copy(self.fit_dictionary) + + dataset_properties['is_small_preprocess'] = is_small_preprocess + dataset_properties['uni_variant'] = uni_variant + + fit_dictionary['dataset_properties'] = self.dataset_properties + fit_dictionary['network_structure'] = network_structure + fit_dictionary['transform_time_features'] = transform_time_features + fit_dictionary['dataset_properties'] = 
dataset_properties + + encoder_block_1 = copy.deepcopy(self.encoder) + + encoder_block_2 = copy.deepcopy(self.encoder) + encoder_block_2.block_number = 2 + + encoder_block_1 = encoder_block_1.fit(fit_dictionary) + fit_dictionary = encoder_block_1.transform(fit_dictionary) + network_encoder = fit_dictionary['network_encoder'] + self.assertIsInstance(network_encoder['block_1'], EncoderBlockInfo) + self.assertEqual(network_encoder['block_1'].encoder_output_shape, (1, 10)) + + if variable_selection: + self.assertEqual(network_encoder['block_1'].encoder_input_shape, (window_size, 10)) + else: + if uni_variant: + n_input_features = 0 + else: + if is_small_preprocess: + n_input_features = 40 + else: + n_input_features = 15 + + if transform_time_features: + n_input_features += len(dataset_properties['time_feature_transform']) + + n_input_features += dataset_properties['output_shape'][-1] + self.assertEqual(network_encoder['block_1'].encoder_input_shape, (window_size, + n_input_features)) + + encoder_block_2 = encoder_block_2.fit(fit_dictionary) + fit_dictionary = encoder_block_2.transform(fit_dictionary) + + network_encoder = fit_dictionary['network_encoder'] + self.assertIsInstance(network_encoder['block_2'], EncoderBlockInfo) + self.assertEqual(network_encoder['block_2'].encoder_output_shape, (1, 10)) + self.assertEqual(network_encoder['block_2'].encoder_input_shape, (1, 10)) def test_base_decoder(self): n_prediction_steps = self.dataset_properties['n_prediction_steps'] diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py index df4461c42..6f6935f64 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py @@ -78,6 +78,11 @@ def with_static_features(request): return request.param +@pytest.fixture(params=[True, False]) +def uni_variant_data(request): + return request.param + + class TestForecastingNetworks: dataset_properties, fit_dictionary = generate_fit_dict_and_dataset_property() @@ -87,7 +92,8 @@ def test_network_forward(self, variable_selection, with_static_features, network_encoder, - network_type): + network_type, + uni_variant_data): if network_type == 'ForecastingDeepARNet' and net_output_type != 'distribution_sample': return if network_type == 'ForecastingSeq2SeqNet' and network_encoder == 'TCNEncoder': @@ -95,7 +101,11 @@ def test_network_forward(self, if network_type == 'NBEATSNet': # NBEATS only needs one pass if not (embedding == 'NoEmbedding' and net_output_type == 'regression' and - not variable_selection and not with_static_features and network_encoder == 'RNNEncoder'): + not variable_selection and not with_static_features and network_encoder == 'RNNEncoder' + and not uni_variant_data): + return + if uni_variant_data: + if not (embedding == 'NoEmbedding' and not with_static_features): return dataset_properties = copy.copy(self.dataset_properties) @@ -130,6 +140,18 @@ def test_network_forward(self, fit_dictionary['network_embedding'] = ReducedEmbedding([10] * 5, 2) dataset_properties['feature_shapes'] = {'f1': 10, 'f2': 10, 'f3': 9, 'f4': 9, 'f5': 9} + if uni_variant_data: + fit_dictionary['X_train'] = None + fit_dictionary['transform_time_features'] = False + dataset_properties.update({'feature_shapes': {}, + 'feature_names': tuple(), + 
'known_future_features': tuple(), + 'uni_variant': True, + 'input_shape': (100, 0), + 'static_features': tuple(), + 'future_feature_shapes': (dataset_properties['n_prediction_steps'], 0), + }) + n_prediction_steps = dataset_properties['n_prediction_steps'] window_size = fit_dictionary['window_size'] n_features_past = 10 * len(dataset_properties['feature_names']) + len(time_feature_names) @@ -171,6 +193,13 @@ def test_network_forward(self, hyperparameter='seq_encoder:decoder_auto_regressive', value_range=(False,), default_value=False, )) + if uni_variant_data and network_encoder == 'RNNEncoder': + updates.append(HyperparameterSearchSpaceUpdate( + node_name="network_backbone", + hyperparameter='seq_encoder:block_1:RNNEncoder:decoder_type', + value_range=('MLPDecoder',), + default_value='MLPDecoder', )) + elif network_type == 'ForecastingSeq2SeqNet': updates.append(HyperparameterSearchSpaceUpdate( node_name="network_backbone", @@ -182,6 +211,7 @@ def test_network_forward(self, hyperparameter='seq_encoder:decoder_auto_regressive', value_range=(True,), default_value=True, )) + elif network_type == 'ForecastingDeepARNet': updates.append(HyperparameterSearchSpaceUpdate( node_name="network_backbone", @@ -231,10 +261,14 @@ def test_network_forward(self, batch_size = 2 past_targets = torch.ones([batch_size, 50, n_targets]) - past_features = torch.ones([batch_size, 50, n_features_past]) - future_features = torch.ones([batch_size, n_prediction_steps, n_features_future]) future_targets = torch.ones([batch_size, n_prediction_steps, n_targets]) past_observed_targets = torch.ones([batch_size, 50, n_targets]).bool() + if uni_variant_data: + past_features = None + future_features = None + else: + past_features = torch.ones([batch_size, 50, n_features_past]) + future_features = torch.ones([batch_size, n_prediction_steps, n_features_future]) output = neu_arch(past_targets=past_targets, future_targets=future_targets, @@ -250,6 +284,7 @@ def test_network_forward(self, output = output[0] if network_type in ["ForecastingNet", "ForecastingSeq2SeqNet"]: assert list(output.shape) == [batch_size, n_prediction_steps, n_targets] + elif network_type == "ForecastingDeepARNet": assert list(output.shape) == [batch_size, n_prediction_steps + min(50, neu_arch.window_size) - 1, n_targets] else: @@ -269,10 +304,14 @@ def test_network_forward(self, neu_arch.train() past_targets = torch.ones([batch_size, 3, n_targets]) - past_features = torch.ones([batch_size, 3, n_features_past]) - future_features = torch.ones([batch_size, n_prediction_steps, n_features_future]) future_targets = torch.ones([batch_size, n_prediction_steps, n_targets]) past_observed_targets = torch.ones([batch_size, 3, n_targets]).bool() + if uni_variant_data: + past_features = None + future_features = None + else: + past_features = torch.ones([batch_size, 3, n_features_past]) + future_features = torch.ones([batch_size, n_prediction_steps, n_features_future]) output = neu_arch(past_targets=past_targets, future_targets=future_targets, diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py index ee9c5a7ac..2155b53da 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py @@ -60,6 +60,17 @@ def test_config_space(self): self.assertTrue('network_decoder' in fit_dict) 
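+        # added note: the transformed fit dictionary is expected to expose one decoder per stacked block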
self.assertEqual(len(fit_dict['network_decoder']), num_blocks) + #test error: + dataset_properties = copy.copy(self.dataset_properties) + dataset_properties.update({'feature_shapes': {}, + 'feature_names': tuple(), + 'known_future_features': tuple(), + 'uni_variant': True, + 'input_shape': (100, 0), + 'static_features': tuple(), + 'future_feature_shapes': (dataset_properties['n_prediction_steps'], 0), + }) + def test_deepar(self): for i, valid_encoder in enumerate(['RNNEncoder', 'TransformerEncoder', 'TCNEncoder', 'InceptionTimeEncoder']): seq_encoder_choice = SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties) @@ -114,7 +125,11 @@ def test_deepar(self): cache_intermediate_state=True, ) output = head(net_decoder(x_future=None, encoder_output=encoder2decoder)) - self.assertListEqual(list(output.shape), [10, 1, 1]) + try: + self.assertListEqual(list(output.shape), [10, 1, 1]) + except Exception: + import pdb + pdb.set_trace() encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor_future, additional_input=[None], From f055fd5d38d9452850cf4c32cafdd82cb0ed48eb Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 20 May 2022 19:40:04 +0200 Subject: [PATCH 274/347] test for pipelines --- autoPyTorch/pipeline/base_pipeline.py | 16 ++- .../forecasting_decoder/NBEATSDecoder.py | 10 +- .../forecasting_base_trainer.py | 2 +- .../pipeline/create_searchspace_util.py | 1 + .../pipeline/time_series_forecasting.py | 113 +++++++++------- test/conftest.py | 41 +++--- .../test_time_series_datasets.py | 24 ++-- .../test_time_series_transformer.py | 14 +- .../test_flat_backbones.py | 6 +- .../training/test_forecasting_training.py | 23 ++++ .../test_time_series_forecasting_pipeline.py | 127 ++++++++++++++++++ 11 files changed, 280 insertions(+), 97 deletions(-) create mode 100644 test/test_pipeline/components/training/test_forecasting_training.py create mode 100644 test/test_pipeline/test_time_series_forecasting_pipeline.py diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index 1150bbca6..b8bcfade1 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -412,12 +412,24 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], # check if component is not present in include if include is not None and update.node_name in include.keys(): if split_hyperparameter[0] not in include[update.node_name]: - raise ValueError("Not found {} in include".format(split_hyperparameter[0])) + hp_in_component = False + for include_component in include[update.node_name]: + if include_component.startswith(split_hyperparameter[0]): + hp_in_component = True + break + if not hp_in_component: + raise ValueError("Not found {} in include".format(split_hyperparameter[0])) # check if component is present in exclude if exclude is not None and update.node_name in exclude.keys(): if split_hyperparameter[0] in exclude[update.node_name]: - raise ValueError("Found {} in exclude".format(split_hyperparameter[0])) + hp_in_component = False + for exclude_component in exclude[update.node_name]: + if exclude_component.startswith(split_hyperparameter[0]): + hp_in_component = True + break + if not hp_in_component: + raise ValueError("Found {} in exclude".format(split_hyperparameter[0])) components = node.get_components() # if hyperparameter is __choice__, check if diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index a783d4756..8824199c8 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -176,7 +176,7 @@ def fitted_encoder(self): return ['NBEATSEncoder'] def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: - X.update({'backcast_loss_ratio': self.config['backcast_loss_ratio']}) + X.update({'backcast_loss_ration': self.config['backcast_loss_ration']}) return super().transform(X) @staticmethod @@ -275,8 +275,8 @@ def get_hyperparameter_search_space( value_range=(0, 0.8), default_value=0.1, ), - backcast_loss_ratio: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter="backcast_loss_ratio", + backcast_loss_ration: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="backcast_loss_ration", value_range=(0., 1.), default_value=1., ) @@ -315,7 +315,7 @@ def get_hyperparameter_search_space( use_dropout: if dropout is applied normalization: if normalization is applied dropout: dropout value, if use_dropout is set as True - backcast_loss_ratio: weight of backcast in comparison to forecast when calculating the loss. + backcast_loss_ration: weight of backcast in comparison to forecast when calculating the loss. A weight of 1.0 means that forecast and backcast loss is weighted the same (regardless of backcast and forecast lengths). Defaults to 0.0, i.e. no weight. Returns: @@ -329,7 +329,7 @@ def get_hyperparameter_search_space( # General Hyperparameters add_hyperparameter(cs, activation, CategoricalHyperparameter) add_hyperparameter(cs, normalization, CategoricalHyperparameter) - add_hyperparameter(cs, backcast_loss_ratio, UniformFloatHyperparameter) + add_hyperparameter(cs, backcast_loss_ration, UniformFloatHyperparameter) cs.add_hyperparameter(n_beats_type) # N-BEATS-G diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index f31d0aa15..0c1fe145b 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -220,7 +220,7 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, Dict[str, float]: scores for each desired metric """ if not isinstance(self.model, (ForecastingDeepARNet, ForecastingSeq2SeqNet)): - # To save time, we simply make one step prediction for DeepAR and Seq2Seq + # To save time, we simply make one-step prediction for DeepAR and Seq2Seq self.model.eval() if isinstance(self.model, ForecastingDeepARNet): self.model.only_generate_future_dist = True diff --git a/autoPyTorch/pipeline/create_searchspace_util.py b/autoPyTorch/pipeline/create_searchspace_util.py index 640a787e2..7b13542df 100644 --- a/autoPyTorch/pipeline/create_searchspace_util.py +++ b/autoPyTorch/pipeline/create_searchspace_util.py @@ -64,6 +64,7 @@ def find_active_choices( ) -> List[str]: if not hasattr(node, "get_available_components"): raise ValueError() + available_components = node.get_available_components(dataset_properties, include=include, exclude=exclude) diff --git a/autoPyTorch/pipeline/time_series_forecasting.py 
b/autoPyTorch/pipeline/time_series_forecasting.py index c368c2877..2551437d7 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -199,6 +199,17 @@ def _get_hyperparameter_search_space(self, forbidden_regression_losses_all.append(forbidden_hp_dist) """ + # NBEATS only works with NoEmbedding + if 'network_backbone:flat_encoder:__choice__' in cs: + hp_flat_encoder = cs.get_hyperparameter('network_backbone:flat_encoder:__choice__') + if 'NBEATSEncoder' in hp_flat_encoder.choices: + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(hp_flat_encoder, 'NBEATSEncoder'), + cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding') + ) + + # dist_cls and auto_regressive are only activate if the network outputs distribution if 'loss' in self.named_steps.keys() and 'network_backbone' in self.named_steps.keys(): hp_loss = cs.get_hyperparameter('loss:__choice__') @@ -225,44 +236,47 @@ def _get_hyperparameter_search_space(self, forbidden_hp_dist = ForbiddenAndConjunction(forbidden_hp_dist, forbidden_hp_regression_loss) forbidden_losses_all.append(forbidden_hp_dist) - decoder_auto_regressive = cs.get_hyperparameter("network_backbone:seq_encoder:decoder_auto_regressive") - forecast_strategy = cs.get_hyperparameter("loss:DistributionLoss:forecast_strategy") - use_tf = cs.get_hyperparameter("network_backbone:seq_encoder:use_temporal_fusion") - - if True in decoder_auto_regressive.choices and\ - 'sample' in forecast_strategy.choices and True in use_tf.choices: - cs.add_forbidden_clause( - ForbiddenAndConjunction( - ForbiddenEqualsClause(decoder_auto_regressive, True), - ForbiddenEqualsClause(forecast_strategy, 'sample'), - ForbiddenEqualsClause(use_tf, True) + if "network_backbone:seq_encoder:decoder_auto_regressive" in cs: + decoder_auto_regressive = cs.get_hyperparameter("network_backbone:seq_encoder:decoder_auto_regressive") + forecast_strategy = cs.get_hyperparameter("loss:DistributionLoss:forecast_strategy") + use_tf = cs.get_hyperparameter("network_backbone:seq_encoder:use_temporal_fusion") + + if True in decoder_auto_regressive.choices and\ + 'sample' in forecast_strategy.choices and True in use_tf.choices: + cs.add_forbidden_clause( + ForbiddenAndConjunction( + ForbiddenEqualsClause(decoder_auto_regressive, True), + ForbiddenEqualsClause(forecast_strategy, 'sample'), + ForbiddenEqualsClause(use_tf, True) + ) ) - ) - - network_flat_encoder_hp = cs.get_hyperparameter('network_backbone:flat_encoder:__choice__') - if 'MLPEncoder' in network_flat_encoder_hp.choices: - forbidden = ['MLPEncoder'] - forbidden_deepAREncoder = [forbid for forbid in forbidden if forbid in network_flat_encoder_hp.choices] - for hp_ar in hp_deepAR: - if True in hp_ar.choices: - forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) - forbidden_hp_mlpencoder = ForbiddenInClause(network_flat_encoder_hp, forbidden_deepAREncoder) - forbidden_hp_ar_mlp = ForbiddenAndConjunction(forbidden_hp_ar, forbidden_hp_mlpencoder) - forbidden_losses_all.append(forbidden_hp_ar_mlp) - - forecast_strategy = cs.get_hyperparameter('loss:DistributionLoss:forecast_strategy') - if 'mean' in forecast_strategy.choices: - for hp_ar in hp_deepAR: - if True in hp_ar.choices: - - forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) - forbidden_hp_forecast_strategy = ForbiddenEqualsClause(forecast_strategy, 'mean') - forbidden_hp_ar_forecast_strategy = ForbiddenAndConjunction(forbidden_hp_ar, - forbidden_hp_forecast_strategy) - 
forbidden_losses_all.append(forbidden_hp_ar_forecast_strategy) - - cs.add_forbidden_clauses(forbidden_losses_all) + if 'network_backbone:flat_encoder:__choice__' in cs: + network_flat_encoder_hp = cs.get_hyperparameter('network_backbone:flat_encoder:__choice__') + + if 'MLPEncoder' in network_flat_encoder_hp.choices: + forbidden = ['MLPEncoder'] + forbidden_deepAREncoder = [forbid for forbid in forbidden if forbid in network_flat_encoder_hp.choices] + for hp_ar in hp_deepAR: + if True in hp_ar.choices: + forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) + forbidden_hp_mlpencoder = ForbiddenInClause(network_flat_encoder_hp, forbidden_deepAREncoder) + forbidden_hp_ar_mlp = ForbiddenAndConjunction(forbidden_hp_ar, forbidden_hp_mlpencoder) + forbidden_losses_all.append(forbidden_hp_ar_mlp) + + if 'loss:DistributionLoss:forecast_strategy' in cs: + forecast_strategy = cs.get_hyperparameter('loss:DistributionLoss:forecast_strategy') + if 'mean' in forecast_strategy.choices: + for hp_ar in hp_deepAR: + if True in hp_ar.choices: + + forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) + forbidden_hp_forecast_strategy = ForbiddenEqualsClause(forecast_strategy, 'mean') + forbidden_hp_ar_forecast_strategy = ForbiddenAndConjunction(forbidden_hp_ar, + forbidden_hp_forecast_strategy) + forbidden_losses_all.append(forbidden_hp_ar_forecast_strategy) + if forbidden_losses_all: + cs.add_forbidden_clauses(forbidden_losses_all) # NBEATS network_encoder_hp = cs.get_hyperparameter("network_backbone:__choice__") @@ -275,21 +289,22 @@ def _get_hyperparameter_search_space(self, forbidden_loss_non_regression = ForbiddenInClause(hp_loss, loss_non_regression) forbidden_backcast = ForbiddenEqualsClause(data_loader_backcast, True) - hp_flat_encoder = cs.get_hyperparameter("network_backbone:flat_encoder:__choice__") - - # Ensure that NBEATS encoder only works with NBEATS decoder - if 'NBEATSEncoder' in hp_flat_encoder.choices: - forbidden_NBEATS.append(ForbiddenAndConjunction( - ForbiddenEqualsClause(hp_flat_encoder, 'NBEATSEncoder'), - forbidden_loss_non_regression) - ) - transform_time_features = "data_loader:transform_time_features" - if transform_time_features in cs: - hp_ttf = cs.get_hyperparameter(transform_time_features) + if 'network_backbone:flat_encoder:__choice__' in cs: + hp_flat_encoder = cs.get_hyperparameter("network_backbone:flat_encoder:__choice__") + + # Ensure that NBEATS encoder only works with NBEATS decoder + if 'NBEATSEncoder' in hp_flat_encoder.choices: forbidden_NBEATS.append(ForbiddenAndConjunction( ForbiddenEqualsClause(hp_flat_encoder, 'NBEATSEncoder'), - ForbiddenEqualsClause(hp_ttf, True)) + forbidden_loss_non_regression) ) + transform_time_features = "data_loader:transform_time_features" + if transform_time_features in cs: + hp_ttf = cs.get_hyperparameter(transform_time_features) + forbidden_NBEATS.append(ForbiddenAndConjunction( + ForbiddenEqualsClause(hp_flat_encoder, 'NBEATSEncoder'), + ForbiddenEqualsClause(hp_ttf, True)) + ) forbidden_NBEATS.append(ForbiddenAndConjunction( forbidden_backcast, @@ -320,7 +335,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L default_dataset_properties.update(dataset_properties) if not default_dataset_properties.get("uni_variant", False): - steps.extend([("imputer", TimeSeriesFeatureImputer(random_state=self.random_state)), + steps.extend([("impute", TimeSeriesFeatureImputer(random_state=self.random_state)), ("scaler", BaseScaler(random_state=self.random_state)), ('encoding', 
TimeSeriesEncoderChoice(default_dataset_properties, random_state=self.random_state)), diff --git a/test/conftest.py b/test/conftest.py index 4949d85c9..9b08d506d 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -727,7 +727,7 @@ def generate_forecasting_features(feature_type, length): return features, targets, input_validator.fit(features, targets, start_times=start_times) -def get_forecasting_datamangaer(X, y, validator, with_y_test=True, forecast_horizon=5, freq='1D'): +def get_forecasting_datamangaer(X, y, validator, with_y_test=True, forecast_horizon=3, freq='1D'): if X is not None: X_test = [] for x in X: @@ -761,7 +761,7 @@ def get_forecasting_datamangaer(X, y, validator, with_y_test=True, forecast_hori return datamanager -def get_forecasting_fit_dictionary(datamanager, backend, budget_type='epochs'): +def get_forecasting_fit_dictionary(datamanager, backend, forecasting_budgets='epochs'): info = datamanager.get_required_dataset_info() dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info)) @@ -775,29 +775,29 @@ def get_forecasting_fit_dictionary(datamanager, backend, budget_type='epochs'): 'working_dir': './tmp/example_ensemble_1', # Hopefully generated by backend 'device': 'cpu', 'torch_num_threads': 1, - 'early_stopping': 10, + 'early_stopping': 1, 'use_tensorboard_logger': False, 'use_pynisher': False, 'metrics_during_training': False, 'seed': 1, 'budget_type': 'epochs', - 'epochs': 5, + 'epochs': 1, 'split_id': 0, 'backend': backend, 'logger_port': logging.handlers.DEFAULT_TCP_LOGGING_PORT, } - if budget_type == 'epochs': - fit_dictionary.update({'budget_type': 'epochs', - 'epochs': 5}) - elif budget_type == 'resolution': - fit_dictionary.update({'budget_type': 'resolution', - 'sample_interval': 10}) - elif budget_type == 'num_sample_per_seq': - fit_dictionary.update({'budget_type': 'num_samples', - 'fraction_samples_per_seq': 0.1}) - elif budget_type == 'num_seq': - fit_dictionary.update({'budget_type': 'num_samples', - 'fraction_seq': 0.1}) + if forecasting_budgets == 'epochs': + fit_dictionary.update({'forecasting_budgets': 'epochs', + 'epochs': 1}) + elif forecasting_budgets == 'resolution': + fit_dictionary.update({'forecasting_budgets': 'resolution', + 'sample_interval': 2}) + elif forecasting_budgets == 'num_sample_per_seq': + fit_dictionary.update({'forecasting_budgets': 'num_sample_per_seq', + 'fraction_samples_per_seq': 0.5}) + elif forecasting_budgets == 'num_seq': + fit_dictionary.update({'forecasting_budgets': 'num_seq', + 'fraction_seq': 0.5}) else: raise NotImplementedError backend.save_datamanager(datamanager) @@ -863,11 +863,16 @@ def get_forecasting_datamanager(request): return datamanager +@pytest.fixture(params=['epochs']) +def forecasting_budgets(request): + return request.param + + @pytest.fixture -def get_fit_dictionary_forecasting(request, backend): +def fit_dictionary_forecasting(request, forecasting_budgets, backend): X, y, validator = get_forecasting_data(request.param) datamanager = get_forecasting_datamangaer(X, y, validator) - return get_forecasting_fit_dictionary(datamanager, backend) + return get_forecasting_fit_dictionary(datamanager, backend, forecasting_budgets=forecasting_budgets) # Fixtures for forecasting validators. 
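For readers tracing the fixture changes above, a minimal sketch of how the renamed fit_dictionary_forecasting fixture and the forecasting_budgets fixture combine through pytest's indirect parametrization; the test name below is hypothetical, and the asserted value simply mirrors the 'resolution' branch of get_forecasting_fit_dictionary in the conftest.py hunks above:

    # Hypothetical test module relying on the fixtures declared in test/conftest.py above.
    import pytest


    @pytest.mark.parametrize("fit_dictionary_forecasting, forecasting_budgets",
                             [("multi_variant_wo_missing", "resolution")],
                             indirect=True)
    def test_resolution_budget(fit_dictionary_forecasting, forecasting_budgets):
        # 'fit_dictionary_forecasting' is built from the dataset name passed indirectly,
        # while 'forecasting_budgets' selects which budget entries the fixture adds.
        # For the 'resolution' budget the conftest sketch above stores a down-sampling
        # interval of 2 in the fit dictionary.
        assert fit_dictionary_forecasting["sample_interval"] == 2
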
diff --git a/test/test_datasets/test_time_series_datasets.py b/test/test_datasets/test_time_series_datasets.py index 55457d149..4352d73b7 100644 --- a/test/test_datasets/test_time_series_datasets.py +++ b/test/test_datasets/test_time_series_datasets.py @@ -211,11 +211,11 @@ def test_exception(self): seq_2.get_test_target(5) -@pytest.mark.parametrize("get_fit_dictionary_forecasting", ['uni_variant_wo_missing', +@pytest.mark.parametrize("fit_dictionary_forecasting", ['uni_variant_wo_missing', 'uni_variant_w_missing', 'multi_variant_wo_missing', 'uni_variant_w_missing'], indirect=True) -def test_dataset_properties(backend, get_fit_dictionary_forecasting): +def test_dataset_properties(backend, fit_dictionary_forecasting): # The fixture creates a datamanager by itself datamanager: TimeSeriesForecastingDataset = backend.load_datamanager() info = {'task_type': datamanager.task_type, @@ -235,12 +235,12 @@ def test_dataset_properties(backend, get_fit_dictionary_forecasting): assert isinstance(dataset_properties['time_feature_transform'], List) for item in dataset_properties['time_feature_transform']: assert isinstance(item, Callable) - assert dataset_properties['uni_variant'] == (get_fit_dictionary_forecasting['X_train'] is None) + assert dataset_properties['uni_variant'] == (fit_dictionary_forecasting['X_train'] is None) assert dataset_properties['targets_have_missing_values'] == \ - get_fit_dictionary_forecasting['y_train'].isnull().values.any() - if get_fit_dictionary_forecasting['X_train'] is not None: + fit_dictionary_forecasting['y_train'].isnull().values.any() + if fit_dictionary_forecasting['X_train'] is not None: assert dataset_properties['features_have_missing_values'] == \ - get_fit_dictionary_forecasting['X_train'].isnull().values.any() + fit_dictionary_forecasting['X_train'].isnull().values.any() def test_freq_valeus(): @@ -275,8 +275,8 @@ def test_target_normalization(): np.hstack([(y - np.mean(y))/np.std(y, ddof=1) for y in Y])) -@pytest.mark.parametrize("get_fit_dictionary_forecasting", ['uni_variant_wo_missing'], indirect=True) -def test_dataset_index(backend, get_fit_dictionary_forecasting): +@pytest.mark.parametrize("fit_dictionary_forecasting", ['uni_variant_wo_missing'], indirect=True) +def test_dataset_index(backend, fit_dictionary_forecasting): datamanager: TimeSeriesForecastingDataset = backend.load_datamanager() assert np.allclose(datamanager[5][0]['past_targets'][-1].numpy(), 5.0) assert np.allclose(datamanager[50][0]['past_targets'][-1].numpy(), 1005.0) @@ -292,8 +292,8 @@ def test_dataset_index(backend, get_fit_dictionary_forecasting): assert np.allclose(val_targets, datamanager.get_test_target(val_indices)) -@pytest.mark.parametrize("get_fit_dictionary_forecasting", ['multi_variant_wo_missing'], indirect=True) -def test_update_dataset(backend, get_fit_dictionary_forecasting): +@pytest.mark.parametrize("fit_dictionary_forecasting", ['multi_variant_wo_missing'], indirect=True) +def test_update_dataset(backend, fit_dictionary_forecasting): datamanager: TimeSeriesForecastingDataset = backend.load_datamanager() X = datamanager.train_tensors[0] for col in X.columns: @@ -316,8 +316,8 @@ def test_update_dataset(backend, get_fit_dictionary_forecasting): assert test_seq.X.shape[0] - seq_len == 2 * datamanager.n_prediction_steps -@pytest.mark.parametrize("get_fit_dictionary_forecasting", ['multi_variant_wo_missing'], indirect=True) -def test_test_tensors(backend, get_fit_dictionary_forecasting): +@pytest.mark.parametrize("fit_dictionary_forecasting", ['multi_variant_wo_missing'], 
indirect=True) +def test_test_tensors(backend, fit_dictionary_forecasting): datamanager: TimeSeriesForecastingDataset = backend.load_datamanager() test_tensors = datamanager.test_tensors forecast_horizon = datamanager.n_prediction_steps diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py b/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py index 3a6464124..386258c22 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py +++ b/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py @@ -13,17 +13,17 @@ ) -@pytest.mark.parametrize("get_fit_dictionary_forecasting", ['uni_variant_wo_missing', +@pytest.mark.parametrize("fit_dictionary_forecasting", ['uni_variant_wo_missing', 'uni_variant_w_missing', 'multi_variant_wo_missing', 'multi_variant_w_missing', 'multi_variant_w_missing_only_cat', 'multi_variant_w_missing_only_num', ], indirect=True) -def test_time_series_preprocess(get_fit_dictionary_forecasting): - pipeline = ForecastingPipeline(dataset_properties=get_fit_dictionary_forecasting['dataset_properties']) - pipeline = pipeline.fit(get_fit_dictionary_forecasting) - X = pipeline.transform(get_fit_dictionary_forecasting) +def test_time_series_preprocess(fit_dictionary_forecasting): + pipeline = ForecastingPipeline(dataset_properties=fit_dictionary_forecasting['dataset_properties']) + pipeline = pipeline.fit(fit_dictionary_forecasting) + X = pipeline.transform(fit_dictionary_forecasting) assert 'time_series_target_transformer' in X.keys() target_transformer = X['time_series_target_transformer'] @@ -55,11 +55,11 @@ def test_time_series_preprocess(get_fit_dictionary_forecasting): assert isinstance(time_series_feature_transformer.get_column_transformer(), ColumnTransformer) # Make sure no columns are unintentionally dropped after preprocessing - if len(get_fit_dictionary_forecasting['dataset_properties']["numerical_columns"]) == 0: + if len(fit_dictionary_forecasting['dataset_properties']["numerical_columns"]) == 0: categorical_pipeline = time_series_feature_transformer.preprocessor.named_transformers_['categorical_pipeline'] categorical_data = categorical_pipeline.transform(X['X_train']) assert features.shape[1] == categorical_data.shape[1] - elif len(get_fit_dictionary_forecasting['dataset_properties']["categorical_columns"]) == 0: + elif len(fit_dictionary_forecasting['dataset_properties']["categorical_columns"]) == 0: numerical_pipeline = time_series_feature_transformer.preprocessor.named_transformers_['numerical_pipeline'] numerical_data = numerical_pipeline.transform(X['X_train']) assert features.shape[1] == numerical_data.shape[1] diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py index e5ff0f3c0..446dda6bd 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py @@ -1,5 +1,6 @@ import copy import unittest + from test.test_pipeline.components.setup.forecasting.forecasting_networks.test_base_components import ( generate_fit_dict_and_dataset_property ) @@ -127,7 +128,7 @@ def test_nbeats_network(self): nbeats_cs = NBEATSDecoder.get_hyperparameter_search_space(self.dataset_properties) nbeatsI_cfg = { - "backcast_loss_ratio": 0.0, + 
"backcast_loss_ration": 0.0, "normalization": "LN", "activation": "relu", @@ -154,7 +155,7 @@ def test_nbeats_network(self): } nbeatsG_cfg = { - "backcast_loss_ratio": 0.0, + "backcast_loss_ration": 0.0, "normalization": "NoNorm", "activation": "relu", @@ -218,4 +219,3 @@ def test_nbeats_network(self): backcast_block, forecast_block = block([None], input_tensor) self.assertListEqual(list(backcast_block.shape), [10, window_size * 1]) self.assertListEqual(list(forecast_block.shape), [10, n_prediction_steps * 1]) - diff --git a/test/test_pipeline/components/training/test_forecasting_training.py b/test/test_pipeline/components/training/test_forecasting_training.py new file mode 100644 index 000000000..4734c6ab0 --- /dev/null +++ b/test/test_pipeline/components/training/test_forecasting_training.py @@ -0,0 +1,23 @@ +import unittest + +from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer import ForecastingTrainerChoice +from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE + + +class TestGetBudgetTracker(unittest.TestCase): + def test_get_budget_tracker(self): + trainer = ForecastingTrainerChoice({}) + max_epoch = 50 + + X = {'budget_type': 'epochs', + 'epochs': 5, + } + budget_tracker = trainer.get_budget_tracker(X) + self.assertEqual(budget_tracker.max_epochs, 5) + + for budeget_type in FORECASTING_BUDGET_TYPE: + budget_tracker = trainer.get_budget_tracker({'budget_type': budeget_type}) + self.assertEqual(budget_tracker.max_epochs, max_epoch) + + budget_tracker = trainer.get_budget_tracker({'budget_type': 'runtime'}) + self.assertIsNone(budget_tracker.max_epochs) diff --git a/test/test_pipeline/test_time_series_forecasting_pipeline.py b/test/test_pipeline/test_time_series_forecasting_pipeline.py new file mode 100644 index 000000000..96ca6a1b9 --- /dev/null +++ b/test/test_pipeline/test_time_series_forecasting_pipeline.py @@ -0,0 +1,127 @@ +import copy + +import pytest + +from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + + +@pytest.fixture(params=['ForecastingNet', 'ForecastingSeq2SeqNet', 'ForecastingDeepARNet', 'NBEATSNet']) +def network_type(request): + return request.param + +@pytest.fixture(params=['NBEATSNet']) +def network_type(request): + return request.param + +class TestTimeSeriesForecastingPipeline: + @pytest.mark.parametrize("fit_dictionary_forecasting", ["uni_variant_wo_missing", + "uni_variant_w_missing", + "multi_variant_wo_missing", + "multi_variant_w_missing", + "multi_variant_only_cat", + "multi_variant_only_num"], indirect=True) + def test_fit_predict(self, fit_dictionary_forecasting, forecasting_budgets): + dataset_properties = fit_dictionary_forecasting['dataset_properties'] + if not dataset_properties['uni_variant'] and len(dataset_properties['categories']) > 0: + include = {'network_embedding': ['LearnedEntityEmbedding']} + else: + include = None + pipeline = TimeSeriesForecastingPipeline(dataset_properties=dataset_properties, + include=include) + step_names = pipeline.named_steps.keys() + step_names_multi_processing = ['impute', 'scaler', 'encoding', 'time_series_transformer', 'preprocessing'] + + steps_multi_in_pipeline = [step_name_multi in step_names for step_name_multi in step_names_multi_processing] + + if not dataset_properties['uni_variant']: + assert sum(steps_multi_in_pipeline) == len(steps_multi_in_pipeline) + else: + assert sum(steps_multi_in_pipeline) == 0 + + fit_dict = 
copy.copy(fit_dictionary_forecasting) + pipeline = pipeline.fit(fit_dict) + datamanager = fit_dictionary_forecasting['backend'].load_datamanager() + test_sets = datamanager.generate_test_seqs() + predict = pipeline.predict(test_sets) + + assert list(predict.shape) == [len(test_sets) * dataset_properties['n_prediction_steps']] + + @pytest.mark.parametrize("fit_dictionary_forecasting, forecasting_budgets", [ + ["multi_variant_wo_missing", 'resolution'], + ["multi_variant_wo_missing", 'num_seq'], + ["multi_variant_wo_missing", 'num_sample_per_seq'], + ], indirect=True) + def test_fit_budgets_types(self, fit_dictionary_forecasting, forecasting_budgets): + dataset_properties = fit_dictionary_forecasting['dataset_properties'] + + pipeline = TimeSeriesForecastingPipeline(dataset_properties=dataset_properties) + fit_dict = copy.copy(fit_dictionary_forecasting) + pipeline = pipeline.fit(fit_dict) + datamanager = fit_dictionary_forecasting['backend'].load_datamanager() + test_sets = datamanager.generate_test_seqs() + predict = pipeline.predict(test_sets) + + assert list(predict.shape) == [len(test_sets) * dataset_properties['n_prediction_steps']] + + @pytest.mark.parametrize("fit_dictionary_forecasting", ["multi_variant_w_missing"], indirect=True) + def test_networks(self, fit_dictionary_forecasting, network_type): + dataset_properties = fit_dictionary_forecasting['dataset_properties'] + + updates = HyperparameterSearchSpaceUpdates() + + if network_type == 'NBEATSNet': + include = {'network_backbone': ['flat_encoder:NBEATSEncoder'], + 'loss': ['RegressionLoss']} + + updates.append(node_name='network_backbone', + hyperparameter='flat_encoder:NBEATSDecoder:backcast_loss_ration', + value_range=[0.1, 0.9], + default_value=0.5) + else: + updates.append(node_name='network_backbone', + hyperparameter='seq_encoder:num_blocks', + value_range=[1, 1], + default_value=1) + include = None + if network_type == 'ForecastingNet': + updates.append(node_name='network_backbone', + hyperparameter='seq_encoder:block_1:MLPDecoder:auto_regressive', + value_range=[False, ], + default_value=False) + updates.append(node_name='network_backbone', + hyperparameter='seq_encoder:decoder_auto_regressive', + value_range=[False, ], + default_value=False) + + elif network_type == 'ForecastingSeq2SeqNet': + include = {'network_backbone': ['seq_encoder']} + updates.append(node_name='network_backbone', + hyperparameter='seq_encoder:decoder_auto_regressive', + value_range=[True, ], + default_value=True) + + elif network_type == 'ForecastingDeepARNet': + include = {'network_backbone': ['seq_encoder:RNNEncoder'], + 'loss': ['DistributionLoss']} + + updates.append(node_name='network_backbone', + hyperparameter='seq_encoder:block_1:MLPDecoder:auto_regressive', + value_range=[False, ], + default_value=False) + + pipeline = TimeSeriesForecastingPipeline(dataset_properties=dataset_properties, + include=include, + search_space_updates=updates) + + cs = pipeline.get_hyperparameter_search_space() + + pipeline.set_hyperparameters(cs.get_default_configuration()) + + fit_dict = copy.copy(fit_dictionary_forecasting) + pipeline = pipeline.fit(fit_dict) + datamanager = fit_dictionary_forecasting['backend'].load_datamanager() + test_sets = datamanager.generate_test_seqs() + predict = pipeline.predict(test_sets) + + assert list(predict.shape) == [len(test_sets) * dataset_properties['n_prediction_steps']] From ccab50e85bceada0aba44383810104fdc1b60f9a Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sat, 21 May 2022 15:25:33 +0200 Subject: [PATCH 275/347] 
fixed sampler --- .../training/data_loader/time_series_util.py | 10 +++++----- .../training/test_time_series_data_loader.py | 9 +++++---- .../test_time_series_forecasting_pipeline.py | 3 --- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py index 1a36460f0..30900ec62 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py @@ -166,7 +166,7 @@ def __init__(self, f'However, they are {len(seq_lengths)} versus {len(num_instances_per_seqs)}') seq_intervals_int = [] seq_intervals_decimal = [] - # seq_intervals_decimal_length = [] + num_expected_ins_decimal = [] idx_tracker = 0 for seq_idx, (num_instances, seq_length) in enumerate(zip(num_instances_per_seqs, seq_lengths)): @@ -176,15 +176,15 @@ def __init__(self, idx_start = idx_tracker num_interval = int(np.ceil(num_instances)) - if num_interval > idx_end - idx_start: + if num_interval > idx_end - idx_start or num_interval == 0: interval = np.linspace(idx_start, idx_end, 2, endpoint=True, dtype=np.int) - # we consider + # In this case, seq_intervals_decimal contains the entire interval of the sequence. num_expected_ins_decimal.append(num_instances) seq_intervals_decimal.append(interval[:2]) - seq_intervals_int.append(interval[1:]) else: interval = np.linspace(idx_start, idx_end, num_interval + 1, endpoint=True, dtype=np.int) - + # The first two item determines the first sequence interval where most of the samples need to be + # padded, we then make it the interval for the expected decimal num_expected_ins_decimal.append(np.modf(num_instances)[0]) seq_intervals_decimal.append(interval[:2]) diff --git a/test/test_pipeline/components/training/test_time_series_data_loader.py b/test/test_pipeline/components/training/test_time_series_data_loader.py index 2ec96d144..e6cc2c473 100644 --- a/test/test_pipeline/components/training/test_time_series_data_loader.py +++ b/test/test_pipeline/components/training/test_time_series_data_loader.py @@ -433,22 +433,23 @@ def test_pad_sequence_controller(self): def test_time_series_sampler(self): indices = np.arange(100) seq_lengths = [5, 10, 15, 20, 50] - num_instances_per_seqs = [3.3, 1.3, 7.5, 10, 20.1] + num_instances_per_seqs = [3.3, 1.3, 0.0, 10, 20.1] sampler = TimeSeriesSampler(indices, seq_lengths, num_instances_per_seqs, min_start=2) self.assertEqual(sampler.num_instances, int(np.round(np.sum(num_instances_per_seqs)))) # The first sequence does not contain enough data to allow 3.3 sequences, so it only has 1 interval # For the others, Interval should be np.floor(n_inst) + 1 (resulting in np.floor(n_inst) intervals) - self.assertEqual(list(map(len, sampler.seq_intervals_int)), [1, 2, 8, 10, 21]) + + self.assertEqual(list(map(len, sampler.seq_intervals_int)), [1, 2, 1, 10, 21]) self.assertTrue(torch.equal(sampler.seq_intervals_decimal, torch.tensor([[2, 5], [7, 11], - [17, 18], + [17, 30], [32, 33], [52, 54]]))) self.assertTrue( torch.allclose(sampler.num_expected_ins_decimal, torch.Tensor( - [3.3000e+00, 3.0000e-01, 5.0000e-01, 1.0000e-08, 1.0000e-01]).type(torch.float64)) + [3.3000e+00, 3.0000e-01, 1.0000e-08, 1.0000e-08, 1.0000e-01]).type(torch.float64)) ) for i in range(5): diff --git a/test/test_pipeline/test_time_series_forecasting_pipeline.py b/test/test_pipeline/test_time_series_forecasting_pipeline.py index 96ca6a1b9..7fedf48ec 100644 
--- a/test/test_pipeline/test_time_series_forecasting_pipeline.py +++ b/test/test_pipeline/test_time_series_forecasting_pipeline.py @@ -10,9 +10,6 @@ def network_type(request): return request.param -@pytest.fixture(params=['NBEATSNet']) -def network_type(request): - return request.param class TestTimeSeriesForecastingPipeline: @pytest.mark.parametrize("fit_dictionary_forecasting", ["uni_variant_wo_missing", From 54acaa65f7f30667a546309e47a58750196173b1 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sat, 21 May 2022 15:47:51 +0200 Subject: [PATCH 276/347] maint sampler --- .../components/training/data_loader/time_series_util.py | 1 + test/test_datasets/test_time_series_datasets.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py index 30900ec62..051eb27ea 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py @@ -181,6 +181,7 @@ def __init__(self, # In this case, seq_intervals_decimal contains the entire interval of the sequence. num_expected_ins_decimal.append(num_instances) seq_intervals_decimal.append(interval[:2]) + seq_intervals_int.append(interval[1:]) else: interval = np.linspace(idx_start, idx_end, num_interval + 1, endpoint=True, dtype=np.int) # The first two item determines the first sequence interval where most of the samples need to be diff --git a/test/test_datasets/test_time_series_datasets.py b/test/test_datasets/test_time_series_datasets.py index 4352d73b7..b1062ed9e 100644 --- a/test/test_datasets/test_time_series_datasets.py +++ b/test/test_datasets/test_time_series_datasets.py @@ -279,9 +279,9 @@ def test_target_normalization(): def test_dataset_index(backend, fit_dictionary_forecasting): datamanager: TimeSeriesForecastingDataset = backend.load_datamanager() assert np.allclose(datamanager[5][0]['past_targets'][-1].numpy(), 5.0) - assert np.allclose(datamanager[50][0]['past_targets'][-1].numpy(), 1005.0) - assert np.allclose(datamanager[150][0]['past_targets'][-1].numpy(), 2050.0) - assert np.allclose(datamanager[-1][0]['past_targets'][-1].numpy(), 9134.0) + assert np.allclose(datamanager[50][0]['past_targets'][-1].numpy(), 1003.0) + assert np.allclose(datamanager[150][0]['past_targets'][-1].numpy(), 2046.0) + assert np.allclose(datamanager[-1][0]['past_targets'][-1].numpy(), 9136.0) assert datamanager.get_time_series_seq(50) == datamanager.datasets[1] From da6e92d25c90da01e789e90f68d46138e060a001 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sat, 21 May 2022 17:02:29 +0200 Subject: [PATCH 277/347] resolve conflict between embedding and net encoder --- autoPyTorch/api/time_series_forecasting.py | 32 ++++++++++--------- .../setup/network/forecasting_architecture.py | 1 - .../setup/network_embedding/NoEmbedding.py | 4 +-- .../base_network_embedding.py | 19 ++++++----- .../pipeline/time_series_forecasting.py | 32 +++++++++---------- 5 files changed, 46 insertions(+), 42 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 5461f730a..6b6263554 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -162,9 +162,10 @@ def _get_dataset_input_validator( dataset_name: Optional[str] = None, dataset_compression: Optional[DatasetCompressionSpec] = None, freq: Optional[Union[str, int, 
List[int]]] = None, - start_times_train: List[pd.DatetimeIndex] = [], - start_times_test: Optional[List[pd.DatetimeIndex]] = None, + start_times: List[pd.DatetimeIndex] = [], n_prediction_steps: int = 1, + known_future_features: Tuple[Union[int, str]] = (), + **forecasting_dataset_kwargs, ) -> Tuple[TimeSeriesForecastingDataset, TimeSeriesForecastingInputValidator]: """ Returns an object of `TabularDataset` and an object of @@ -216,22 +217,23 @@ def _get_dataset_input_validator( dataset_compression=dataset_compression ) - # Fit a input validator to check the provided data + # Fit an input validator to check the provided data # Also, an encoder is fit to both train and test data, # to prevent unseen categories during inference - input_validator.fit(X_train=X_train, y_train=y_train, start_times_train=start_times_train, - X_test=X_test, y_test=y_test, start_times_test=start_times_test) + input_validator.fit(X_train=X_train, y_train=y_train, start_times=start_times, + X_test=X_test, y_test=y_test) dataset = TimeSeriesForecastingDataset( X=X_train, Y=y_train, X_test=X_test, Y_test=y_test, freq=freq, - start_times_train=start_times_train, - start_times_test=start_times_test, + start_times=start_times, validator=input_validator, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, n_prediction_steps=n_prediction_steps, + known_future_features=known_future_features, + **forecasting_dataset_kwargs ) return dataset, input_validator @@ -245,8 +247,7 @@ def search( y_test: Optional[Union[List, pd.DataFrame]] = None, n_prediction_steps: int = 1, freq: Optional[Union[str, int, List[int]]] = None, - start_times_train: List[pd.DatetimeIndex] = [], - start_times_test: Optional[List[pd.DatetimeIndex]] = None, + start_times: List[pd.DatetimeIndex] = [], dataset_name: Optional[str] = None, budget_type: str = 'epochs', min_budget: Union[int, str] = 5, @@ -266,6 +267,7 @@ def search( custom_init_setting_path: Optional[str] = None, min_num_test_instances: Optional[int] = None, dataset_compression: Union[Mapping[str, Any], bool] = False, + **forecasting_dataset_kwargs ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -291,10 +293,8 @@ def search( freq: Optional[Union[str, int, List[int]]] frequency information, it determines the configuration space of the window size, if it is not given, we will use the default configuration - start_times_train: : List[pd.DatetimeIndex] + start_times: : List[pd.DatetimeIndex] A list indicating the start time of each series in the training sets - start_times_test: Optional[List[pd.DatetimeIndex]] = None, - A list indicating the start time of each series in the test sets dataset_name: Optional[str], dataset name budget_type (str): @@ -378,6 +378,8 @@ def search( if it is set None, then full validation sets will be evaluated in each fidelity. 
Otherwise, the number of instances in the test sets should be a value that is at least as great as this value, otherwise, the number of test instance is proportional to its fidelity + forecasting_dataset_kwargs: Dict[Any] + Forecasting dataset kwargs used to initialize forecasting dataset Returns: self @@ -395,9 +397,9 @@ def search( dataset_name=dataset_name, dataset_compression=self._dataset_compression, freq=freq, - start_times_train=start_times_train, - start_times_test=start_times_test, - n_prediction_steps=n_prediction_steps + start_times=start_times, + n_prediction_steps=n_prediction_steps, + **forecasting_dataset_kwargs ) if self.dataset.base_window_size is not None or not self.customized_window_size: diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index b5259e011..4f84a028e 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -393,7 +393,6 @@ def pre_processing(self, past_observed_targets = past_observed_targets[:, -self.window_size:] past_targets, _, loc, scale = self.target_scaler(past_targets, past_observed_targets) truncated_past_targets = past_targets - if past_features is not None: if self.window_size <= past_features.shape[1]: past_features = past_features[:, -self.window_size:] diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py index 31afb3d51..0c2fe2d4d 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py @@ -29,8 +29,8 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N def build_embedding(self, num_input_features: np.ndarray, - num_numerical_features: int) -> Tuple[nn.Module, List[int]]: - return _NoEmbedding(), list(num_input_features) + num_numerical_features: int) -> Tuple[nn.Module, Optional[List[int]]]: + return _NoEmbedding(), None @staticmethod def get_hyperparameter_search_space( diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 546b3fb9f..5091424d7 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -26,13 +26,16 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: num_numerical_features=num_numerical_columns, ) if "feature_shapes" in X['dataset_properties']: - feature_shapes = X['dataset_properties']['feature_shapes'] - # forecasting tasks - feature_names = X['dataset_properties']['feature_names'] - for idx_cat, n_output_cat in enumerate(num_output_features[num_numerical_columns:]): - cat_feature_name = feature_names[idx_cat + num_numerical_columns] - feature_shapes[cat_feature_name] = n_output_cat - self.feature_shapes = feature_shapes + if num_output_features is not None: + feature_shapes = X['dataset_properties']['feature_shapes'] + # forecasting tasks + feature_names = X['dataset_properties']['feature_names'] + for idx_cat, n_output_cat in enumerate(num_output_features[num_numerical_columns:]): + cat_feature_name = feature_names[idx_cat + num_numerical_columns] + feature_shapes[cat_feature_name] = 
n_output_cat + self.feature_shapes = feature_shapes + else: + self.feature_shapes = X['dataset_properties']['feature_shapes'] return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @@ -43,7 +46,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: def build_embedding(self, num_input_features: np.ndarray, - num_numerical_features: int) -> Tuple[nn.Module, List[int]]: + num_numerical_features: int) -> Tuple[nn.Module, Optional[List[int]]]: raise NotImplementedError def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 2551437d7..c03b05bea 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -159,23 +159,23 @@ def _get_hyperparameter_search_space(self, # Here we add custom code, like this with this # is not a valid configuration # Learned Entity Embedding is only valid when encoder is one hot encoder - if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): + if 'network_embedding' in self.named_steps.keys() and 'feature_encoding' in self.named_steps.keys(): embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices if 'LearnedEntityEmbedding' in embeddings: - encoders = cs.get_hyperparameter('encoder:__choice__').choices + feature_encodings = cs.get_hyperparameter('feature_encoding:__choice__').choices default = cs.get_hyperparameter('network_embedding:__choice__').default_value possible_default_embeddings = copy.copy(list(embeddings)) del possible_default_embeddings[possible_default_embeddings.index(default)] - for encoder in encoders: - if encoder == 'OneHotEncoder': + for encoding in feature_encodings: + if encoding == 'OneHotEncoder': continue while True: try: cs.add_forbidden_clause(ForbiddenAndConjunction( ForbiddenEqualsClause(cs.get_hyperparameter( 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), - ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), encoder) + ForbiddenEqualsClause(cs.get_hyperparameter('feature_encoding:__choice__'), encoding) )) break except ValueError: @@ -199,16 +199,16 @@ def _get_hyperparameter_search_space(self, forbidden_regression_losses_all.append(forbidden_hp_dist) """ - # NBEATS only works with NoEmbedding - if 'network_backbone:flat_encoder:__choice__' in cs: - hp_flat_encoder = cs.get_hyperparameter('network_backbone:flat_encoder:__choice__') - if 'NBEATSEncoder' in hp_flat_encoder.choices: - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(hp_flat_encoder, 'NBEATSEncoder'), - cs.get_hyperparameter( - 'network_embedding:__choice__'), 'LearnedEntityEmbedding') - ) - + if 'network_embedding' in self.named_steps.keys(): + # NBEATS only works with NoEmbedding + if 'network_backbone:flat_encoder:__choice__' in cs: + hp_flat_encoder = cs.get_hyperparameter('network_backbone:flat_encoder:__choice__') + if 'NBEATSEncoder' in hp_flat_encoder.choices: + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(hp_flat_encoder, 'NBEATSEncoder'), + ForbiddenEqualsClause(cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding')) + ) # dist_cls and auto_regressive are only activate if the network outputs distribution if 'loss' in self.named_steps.keys() and 'network_backbone' in self.named_steps.keys(): @@ -337,7 +337,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L if not 
default_dataset_properties.get("uni_variant", False): steps.extend([("impute", TimeSeriesFeatureImputer(random_state=self.random_state)), ("scaler", BaseScaler(random_state=self.random_state)), - ('encoding', TimeSeriesEncoderChoice(default_dataset_properties, + ('feature_encoding', TimeSeriesEncoderChoice(default_dataset_properties, random_state=self.random_state)), ("time_series_transformer", TimeSeriesFeatureTransformer(random_state=self.random_state)), ("preprocessing", TimeSeriesEarlyPreprocessing(random_state=self.random_state)), From fba012ca4fd56795b061b96dfb6edbb6e7b68f62 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sat, 21 May 2022 17:52:46 +0200 Subject: [PATCH 278/347] fix scaling --- autoPyTorch/datasets/time_series_dataset.py | 4 +- .../scaling/utils.py | 57 ++++++++++++------- 2 files changed, 36 insertions(+), 25 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index bb29d96bb..113974ff8 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -743,7 +743,6 @@ def transform_data_into_time_series_sequence(self, start_times=start_times, time_features=time_features, is_test_set=is_test_set, - dataset_with_future_features=dataset_with_future_features, **self.sequences_builder_kwargs) return sequence_datasets, train_tensors, test_tensors, sequence_lengths @@ -755,7 +754,6 @@ def make_sequences_datasets(X: Optional[pd.DataFrame], X_test: Optional[pd.DataFrame] = None, Y_test: Optional[pd.DataFrame] = None, is_test_set: bool = False, - dataset_with_future_features: bool = False, **sequences_kwargs: Optional[Dict]) -> Tuple[ List[TimeSeriesSequence], Tuple[Optional[pd.DataFrame], pd.DataFrame], @@ -825,7 +823,7 @@ def make_sequences_datasets(X: Optional[pd.DataFrame], train_tensors = (X, Y) # we could guarantee that Y_test has shape [len(seq) * n_prediction_steps, num_targets] - test_tensors = (None, Y_test.values) if Y_test is not None else None + test_tensors = (X_test, Y_test.values) if Y_test is not None else None return sequence_datasets, train_tensors, test_tensors diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index 8e5193b1e..0a43ffaa9 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -1,4 +1,4 @@ -from typing import Any, Union, Tuple +from typing import Any, Union, Tuple, List import numpy as np import pandas as pd @@ -16,25 +16,16 @@ def __init__(self, mode: str, self.dataset_is_small_preprocess = dataset_is_small_preprocess self.static_features = static_features - def fit(self, X: pd.DataFrame, y: Any = None) -> "TimeSeriesScaler": + def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Any = None) -> "TimeSeriesScaler": """ The transformer is transformed on the fly (for each batch) """ - static_features = [static_fea for static_fea in self.static_features if static_fea in X.columns] + if self.dataset_is_small_preprocess: + static_features = [static_fea for static_fea in self.static_features if static_fea in X.columns] + else: + static_features = [static_fea for static_fea in self.static_features if static_fea < X.shape[1]] self.static_features = static_features - return self - def transform(self, X: Union[pd.DataFrame, np.ndarray]) -> Tuple[np.ndarray, ...]: - 
""" - X = sklearn.utils.check_array( - X, - force_all_finite=True, - ensure_2d=False, - allow_nd=True, - accept_sparse=False, - accept_large_sparse=False - ) # type: np.ndarray - """ if self.mode == "standard": if self.dataset_is_small_preprocess: X_grouped = X.groupby(X.index) @@ -57,7 +48,6 @@ def transform(self, X: Union[pd.DataFrame, np.ndarray]) -> Tuple[np.ndarray, ... self.scale = np.where(self.scale == 0, self.loc, self.scale) self.scale[self.scale == 0] = 1. - return (X - self.loc) / self.scale elif self.mode == "min_max": if self.dataset_is_small_preprocess: @@ -84,8 +74,6 @@ def transform(self, X: Union[pd.DataFrame, np.ndarray]) -> Tuple[np.ndarray, ... self.scale = np.where(self.scale == 0., self.loc, self.scale) self.scale[self.scale == 0.0] = 1.0 - return (X - self.loc) / self.scale - elif self.mode == "max_abs": if self.dataset_is_small_preprocess: X_abs = X.transform("abs") @@ -99,8 +87,6 @@ def transform(self, X: Union[pd.DataFrame, np.ndarray]) -> Tuple[np.ndarray, ... self.loc = None self.scale = max_abs_ - return X / self.scale - elif self.mode == 'mean_abs': if self.dataset_is_small_preprocess: X_abs = X.transform("abs") @@ -116,12 +102,39 @@ def transform(self, X: Union[pd.DataFrame, np.ndarray]) -> Tuple[np.ndarray, ... self.scale[self.scale == 0] = 1 self.loc = None - return X / self.scale - elif self.mode == "none": self.loc = None self.scale = None + else: + raise ValueError(f"Unknown mode {self.mode} for time series scaler") + + return self + + def transform(self, X: Union[pd.DataFrame, np.ndarray]) -> Tuple[np.ndarray, ...]: + """ + X = sklearn.utils.check_array( + X, + force_all_finite=True, + ensure_2d=False, + allow_nd=True, + accept_sparse=False, + accept_large_sparse=False + ) # type: np.ndarray + """ + if self.mode == "standard": + return (X - self.loc) / self.scale + + elif self.mode == "min_max": + return (X - self.loc) / self.scale + + elif self.mode == "max_abs": + return X / self.scale + + elif self.mode == 'mean_abs': + return X / self.scale + + elif self.mode == "none": return X else: raise ValueError(f"Unknown mode {self.mode} for time series scaler") From 2ed1197844c459817d755dae3d540fc7c1a06ed9 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sat, 21 May 2022 18:33:30 +0200 Subject: [PATCH 279/347] allow transform for test dataloader --- autoPyTorch/datasets/time_series_dataset.py | 2 + .../scaling/utils.py | 1 - .../TimeSeriesEarlyPreProcessing.py | 2 +- .../time_series_forecasting_data_loader.py | 37 ++++- .../preprocessing/forecasting/test_scaling.py | 131 ++++++++++++------ 5 files changed, 129 insertions(+), 44 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 113974ff8..1bc445ca7 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -1,4 +1,5 @@ import os +import pdb from typing import Any, Dict, List, Optional, Tuple, Union, cast from numbers import Real import uuid @@ -840,6 +841,7 @@ def replace_data(self, ser_id = x[0] x_ser = x[1].transform(np.array).values seq.X = x_ser + if X_test is not None: seq.X_test = X_test_group.get_group(ser_id).transform(np.array).values seq.known_future_features_index = known_future_features_index diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index 0a43ffaa9..1e77db194 100644 --- 
a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -48,7 +48,6 @@ def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Any = None) -> "TimeSeriesS self.scale = np.where(self.scale == 0, self.loc, self.scale) self.scale[self.scale == 0] = 1. - elif self.mode == "min_max": if self.dataset_is_small_preprocess: X_grouped = X.groupby(X.index) diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py index 95377edef..5a4a88f8f 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py @@ -1,7 +1,6 @@ from typing import Any, Dict, Optional, Union, Tuple, List import numpy as np - import pandas as pd from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType @@ -51,6 +50,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X_train = X['backend'].load_datamanager().train_tensors[0] X['X_train'] = time_series_preprocess(dataset=X_train, transforms=transforms) + feature_names = X['dataset_properties']['feature_names'] numerical_columns = X['dataset_properties']['numerical_columns'] categorical_columns = X['dataset_properties']['categorical_columns'] diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index b6a46cc6a..40847fcae 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -2,12 +2,14 @@ import warnings from functools import partial + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, CategoricalHyperparameter from ConfigSpace.conditions import EqualsCondition import numpy as np import pandas as pd +from sklearn.compose import ColumnTransformer import torch from torch.utils.data.sampler import SubsetRandomSampler @@ -91,6 +93,9 @@ def __init__(self, self.dataset_columns = [] self.sampler_train = None + # Applied for get loader + self.feature_preprocessor: Optional[ColumnTransformer] = None + self.add_fit_requirements( [FitRequirement("known_future_features", (Tuple,), user_defined=True, dataset_property=True), FitRequirement("feature_shapes", (Dict,), user_defined=True, dataset_property=True), @@ -213,6 +218,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.train_transform = self.build_transform(X, mode='train') self.val_transform = self.build_transform(X, mode='val') self.test_transform = self.build_transform(X, mode='test') + self.feature_preprocessor = X['time_series_feature_transformer'].preprocessor datamanager.update_transform( self.train_transform, train=True, @@ -360,13 +366,39 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd sequence_lengths[seq_idx] = len(x_seq.X) x_all = pd.DataFrame(np.concatenate([x_seq.X for x_seq in X]), columns=self.dataset_columns) + series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) x_all.index = series_number - x_all = pd.DataFrame(self.test_transform(x_all)) + 
if self.dataset_small_preprocess: + self.feature_preprocessor = self.feature_preprocessor.fit(x_all) + x_all = self.feature_preprocessor.transform(x_all) + + x_all = pd.DataFrame(x_all) x_all.index = series_number + x_all = x_all.groupby(x_all.index) + if len(self.known_future_features_index) > 0: + sequence_lengths_test = [0] * num_sequences + for seq_idx, x_seq in enumerate(X): + sequence_lengths_test[seq_idx] = len(x_seq.X_test) + + x_all_test = pd.DataFrame(np.concatenate([x_seq.X_test for x_seq in X]), + columns=self.dataset_columns) + + series_number_test = np.arange(len(sequence_lengths_test)).repeat(sequence_lengths_test) + + x_all_test.index = series_number_test + + if self.dataset_small_preprocess: + x_all_test = self.feature_preprocessor.transform(x_all_test) + + x_all_test = pd.DataFrame(x_all_test) + x_all_test.index = series_number_test + + x_all_test = x_all_test.groupby(x_all_test.index) + for i, x_seq in enumerate(X): if not isinstance(x_seq, TimeSeriesSequence): raise NotImplementedError('Test Set must be a TimeSeriesSequence or a' @@ -378,6 +410,9 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd if self.dataset_small_preprocess and not self._is_uni_variant: x_seq.X = x_all.get_group(i).transform(np.array).values update_dict = {"known_future_features_index": self.known_future_features_index} + if len(self.known_future_features_index) > 0: + x_seq.X_test = x_all_test.get_group(i).transform(np.array).values + else: update_dict = {} update_dict.update(dict(freq=self.freq, diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py b/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py index 0c2ff1c0b..b4c85ba85 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py +++ b/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py @@ -28,6 +28,8 @@ def setUp(self) -> None: self.raw_data = [data_seq_1, data_seq_2] self.data = pd.DataFrame(np.concatenate([data_seq_1, data_seq_2]), columns=columns, index=[0] * 3 + [1] * 4) self.static_features = ('s',) + self.static_features_column = (1, ) + categorical_columns = list() numerical_columns = [0, 1, 2] @@ -57,6 +59,13 @@ def test_base_and_standard_scaler(self): column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']), remainder='passthrough') column_transformer = column_transformer.fit(X['X_train']) + transformer = column_transformer.named_transformers_['timeseriesscaler'] + + self.assertTrue(np.allclose(transformer.loc.values, np.asarray([[1.0, 1.428571, 3.00], + [0.0, 1.428571, 3.25]]))) + + self.assertTrue(np.allclose(transformer.scale.values, np.asarray([[1.0, 0.534522, 1.000000], + [1.0, 0.534522, 2.217356]]))) transformed = column_transformer.transform(self.data) self.assertTrue(np.allclose(transformed, np.asarray([[0., 1.06904497, 0.], @@ -67,31 +76,33 @@ def test_base_and_standard_scaler(self): [0., -0.80178373, 0.33824071], [0., -0.80178373, 1.24021595]]))) - transformer = column_transformer.named_transformers_['timeseriesscaler'] - self.assertTrue(np.allclose(transformer.loc.values, np.asarray([[1.0, 1.428571, 3.00], - [0.0, 1.428571, 3.25]]))) - - self.assertTrue(np.allclose(transformer.scale.values, np.asarray([[1.0, 0.534522, 1.000000], - [1.0, 0.534522, 2.217356]]))) # second column is static features, those it need to be the mean and std value across all sequences scaler.dataset_is_small_preprocess = False - scaler = scaler.fit(self.data) - 
transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) + scaler.static_features = self.static_features_column + scaler = scaler.fit(self.raw_data[0]) + + self.assertTrue(np.allclose(scaler.loc, np.asarray([[1., 2., 3]]))) + self.assertTrue(np.allclose(scaler.scale, np.asarray([[1., 2., 3.]]))) + + transformed_test = scaler.transform(self.raw_data[0]) self.assertIsInstance(transformed_test, np.ndarray) # should have the same value as the second part of transformed except for the static values - self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed[:, [0, -1]])) + self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed[:len(self.raw_data[0]), [0, -1]])) self.assertTrue(np.all(transformed_test[:, 1] == 0.)) - self.assertTrue(np.allclose(scaler.loc, np.asarray([[0., 1., 3.25]]))) - self.assertTrue(np.allclose(scaler.scale, np.asarray([[1., 1., 2.21735578]]))) - def test_min_max(self): scaler = TimeSeriesScaler(mode='min_max', static_features=self.static_features ) scaler = scaler.fit(self.data) + self.assertTrue(np.allclose(scaler.loc.values, np.asarray([[0, 1, 3], + [0, 1, 1]]))) + + self.assertTrue(np.allclose(scaler.scale.values, np.asarray([[2, 1, 1], + [1, 1, 5]]))) + transformed_data = scaler.transform(self.data).values self.assertTrue(np.allclose(transformed_data, np.asarray([[0.5, 1., 0.], [0., 1., 0.], @@ -100,19 +111,25 @@ def test_min_max(self): [0., 0., 0.2], [0., 0., 0.6], [0., 0., 1.]]))) - self.assertTrue(np.allclose(scaler.loc.values, np.asarray([[0, 1, 3], - [0, 1, 1]]))) - - self.assertTrue(np.allclose(scaler.scale.values, np.asarray([[2, 1, 1], - [1, 1, 5]]))) scaler.dataset_is_small_preprocess = False - transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) + scaler.static_features = self.static_features_column + scaler = scaler.fit(self.raw_data[0]) - self.assertTrue(np.allclose(transformed_data[:, [0, -1]], transformed_test[:, [0, -1]])) - self.assertTrue(np.all(transformed_test[:, 1] == 0.)) - self.assertTrue(np.allclose(scaler.loc, np.asarray([[0., 1., 1.]]))) - self.assertTrue(np.allclose(scaler.scale, np.asarray([[1., 1., 5.]]))) + self.assertTrue(np.allclose(scaler.loc, np.asarray([[0., 2., 3.]]))) + self.assertTrue(np.allclose(scaler.scale, np.asarray([[2., 2., 3.]]))) + + idx_start = 0 + for i, raw_data in enumerate(self.raw_data): + idx_end = idx_start + len(raw_data) + scaler = scaler.fit(raw_data) + + transformed_test = scaler.transform(self.raw_data[i]) + self.assertIsInstance(transformed_test, np.ndarray) + # should have the same value as the second part of transformed except for the static values + self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed_data[idx_start:idx_end, [0, -1]])) + self.assertTrue(np.all(transformed_test[:, 1] == 0.)) + idx_start = idx_end def test_max_abs_scaler(self): scaler = TimeSeriesScaler(mode='max_abs', @@ -120,6 +137,12 @@ def test_max_abs_scaler(self): ) scaler = scaler.fit(self.data) + + self.assertIsNone(scaler.loc) + + self.assertTrue(np.allclose(scaler.scale.values, np.asarray([[2, 2, 3], + [1, 2, 6]]))) + transformed_data = scaler.transform(self.data).values self.assertTrue(np.allclose(transformed_data, np.asarray([[0.5, 1., 1.], @@ -129,18 +152,24 @@ def test_max_abs_scaler(self): [0., 0.5, 0.33333333], [0., 0.5, 0.66666667], [0., 0.5, 1.]]))) - self.assertIsNone(scaler.loc) - - self.assertTrue(np.allclose(scaler.scale.values, np.asarray([[2, 2, 3], - [1, 2, 6]]))) scaler.dataset_is_small_preprocess 
= False - transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) - - self.assertTrue(np.allclose(transformed_data[:, [0, -1]], transformed_test[:, [0, -1]])) - self.assertTrue(np.all(transformed_test[:, 1] == 1.)) + scaler.static_features = self.static_features_column + scaler = scaler.fit(self.raw_data[0]) self.assertIsNone(scaler.loc) - self.assertTrue(np.allclose(scaler.scale, np.asarray([[1., 1., 6.]]))) + self.assertTrue(np.allclose(scaler.scale, np.asarray([[2., 2., 3.]]))) + + idx_start = 0 + for i, raw_data in enumerate(self.raw_data): + idx_end = idx_start + len(raw_data) + scaler = scaler.fit(raw_data) + + transformed_test = scaler.transform(self.raw_data[i]) + self.assertIsInstance(transformed_test, np.ndarray) + # should have the same value as the second part of transformed except for the static values + self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed_data[idx_start:idx_end, [0, -1]])) + self.assertTrue(np.all(transformed_test[:, 1] == 1.)) + idx_start = idx_end def test_mean_abs_scaler(self): scaler = TimeSeriesScaler(mode='mean_abs', @@ -162,12 +191,23 @@ def test_mean_abs_scaler(self): self.assertTrue(np.allclose(scaler.scale.values, np.asarray([[1., 1.5, 3.], [1., 1.5, 3.25]]))) scaler.dataset_is_small_preprocess = False - transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) + scaler.static_features = self.static_features_column + scaler = scaler.fit(self.raw_data[0]) - self.assertTrue(np.allclose(transformed_data[:, [0, -1]], transformed_test[:, [0, -1]])) - self.assertTrue(np.all(transformed_test[:, 1] == 1.)) self.assertIsNone(scaler.loc) - self.assertTrue(np.allclose(scaler.scale, np.asarray([[6., 1., 3.25]]))) + self.assertTrue(np.allclose(scaler.scale, np.asarray([[1., 2., 3.]]))) + + idx_start = 0 + for i, raw_data in enumerate(self.raw_data): + idx_end = idx_start + len(raw_data) + scaler = scaler.fit(raw_data) + + transformed_test = scaler.transform(self.raw_data[i]) + self.assertIsInstance(transformed_test, np.ndarray) + # should have the same value as the second part of transformed except for the static values + self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed_data[idx_start:idx_end, [0, -1]])) + self.assertTrue(np.all(transformed_test[:, 1] == 1.)) + idx_start = idx_end def test_no_scaler(self): scaler = TimeSeriesScaler(mode='none', @@ -182,16 +222,25 @@ def test_no_scaler(self): self.assertIsNone(scaler.scale) scaler.dataset_is_small_preprocess = False - transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) + scaler.static_features = self.static_features_column + scaler = scaler.fit(self.raw_data[0]) - self.assertTrue(np.allclose(transformed_data[:, [0, -1]], transformed_test[:, [0, -1]])) - self.assertTrue(np.allclose(transformed_test, np.concatenate(self.raw_data))) + idx_start = 0 + for i, raw_data in enumerate(self.raw_data): + idx_end = idx_start + len(raw_data) + scaler = scaler.fit(raw_data) - self.assertIsNone(scaler.loc) - self.assertIsNone(scaler.scale) + transformed_test = scaler.transform(self.raw_data[i]) + self.assertIsInstance(transformed_test, np.ndarray) + # should have the same value as the second part of transformed except for the static values + self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed_data[idx_start:idx_end, [0, -1]])) + + self.assertIsNone(scaler.loc) + self.assertIsNone(scaler.scale) + idx_start = idx_end with self.assertRaises(ValueError): scaler = 
TimeSeriesScaler(mode='random', static_features=self.static_features ) - _ = scaler.transform(self.data) + _ = scaler.fit(self.data) From 95eb7834904dbfde6dfa14bb524f85d703d783cc Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sat, 21 May 2022 18:41:59 +0200 Subject: [PATCH 280/347] maint dataloader --- .../data_loader/time_series_forecasting_data_loader.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 40847fcae..9726858e6 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -218,7 +218,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.train_transform = self.build_transform(X, mode='train') self.val_transform = self.build_transform(X, mode='val') self.test_transform = self.build_transform(X, mode='test') - self.feature_preprocessor = X['time_series_feature_transformer'].preprocessor + if 'time_series_feature_transformer' in X: + self.feature_preprocessor = X['time_series_feature_transformer'].preprocessor datamanager.update_transform( self.train_transform, train=True, @@ -370,7 +371,7 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) x_all.index = series_number - if self.dataset_small_preprocess: + if self.dataset_small_preprocess and self.feature_preprocessor is not None: self.feature_preprocessor = self.feature_preprocessor.fit(x_all) x_all = self.feature_preprocessor.transform(x_all) @@ -391,7 +392,7 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd x_all_test.index = series_number_test - if self.dataset_small_preprocess: + if self.dataset_small_preprocess and self.feature_preprocessor is not None: x_all_test = self.feature_preprocessor.transform(x_all_test) x_all_test = pd.DataFrame(x_all_test) From 8035221b04c3835cf8971d872c72dcd64198d2c3 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Sun, 22 May 2022 17:42:38 +0200 Subject: [PATCH 281/347] fix updates --- .../setup/network/forecasting_architecture.py | 6 +- .../forecasting_backbone/__init__.py | 7 +- .../time_series_forecasting_data_loader.py | 6 +- .../pipeline/time_series_forecasting.py | 95 ++++++++++--------- .../training/test_time_series_data_loader.py | 14 +++ .../test_time_series_forecasting_pipeline.py | 73 +++++++++++++- 6 files changed, 138 insertions(+), 63 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 4f84a028e..0447923d5 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -529,11 +529,11 @@ def pred_from_net_output(self, net_output): elif self.aggregation == 'median': return torch.median(samples, 0)[0] else: - raise ValueError(f'Unknown aggregation: {self.aggregation}') + raise NotImplementedError(f'Unknown aggregation: {self.aggregation}') else: - raise ValueError(f'Unknown forecast_strategy: {self.forecast_strategy}') + raise NotImplementedError(f'Unknown forecast_strategy: {self.forecast_strategy}') else: - raise 
ValueError(f'Unknown output_type: {self.output_type}') + raise NotImplementedError(f'Unknown output_type: {self.output_type}') def predict(self, past_targets: torch.Tensor, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index 1bcd1b59e..178408663 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -30,8 +30,8 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None ): super().__init__(dataset_properties, random_state) - self.include_components = None - self.exclude_components = None + self.include_components = {} + self.exclude_components = {} self.default_components = OrderedDict( {"flat_encoder": FlatForecastingEncoderChoice(dataset_properties=self.dataset_properties, @@ -88,7 +88,6 @@ def get_available_components( if include is not None: include_top = set() - self.include_components = {} for incl in include: if incl not in available_comp: for comp in available_comp.keys(): @@ -106,7 +105,6 @@ def get_available_components( raise ValueError(f"Trying to include unknown component: {include}") include = list(include_top) elif exclude is not None: - self.exclude_components = {} for excl in exclude: for comp in available_comp.keys(): if excl.startswith(comp): @@ -174,7 +172,6 @@ def get_hyperparameter_search_space( dataset_properties = {} cs = ConfigurationSpace() - # Compile a list of legal preprocessors for this problem available_encoders = self.get_available_components( dataset_properties=dataset_properties, diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 9726858e6..6573d4c2a 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -477,7 +477,7 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = {}, HyperparameterSearchSpace(hyperparameter='window_size', value_range=(20, 50), default_value=30), - num_batch_per_epoch: HyperparameterSearchSpace = + num_batches_per_epoch: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_batches_per_epoch", value_range=(30, 100), default_value=50), @@ -510,7 +510,7 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = {}, batch_size (int): batch size window_size (int): window size, (if activate) this value directly determines the window_size of the data loader - num_batch_per_epoch (int): how many batches are trained at each iteration + num_batches_per_epoch (int): how many batches are trained at each iteration sample_strategy(str): how samples are distributed. if it is LengthUnifrom, then every single data point has the same probability to be sampled, in which case longer sequence will occupy more samples. 
If it is SeqUniform, then every sequence has the same probability to be @@ -527,7 +527,7 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = {}, """ cs = ConfigurationSpace() add_hyperparameter(cs, batch_size, UniformIntegerHyperparameter) - add_hyperparameter(cs, num_batch_per_epoch, UniformIntegerHyperparameter) + add_hyperparameter(cs, num_batches_per_epoch, UniformIntegerHyperparameter) add_hyperparameter(cs, sample_strategy, CategoricalHyperparameter) if dataset_properties is None: diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index c03b05bea..1e1f97b42 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -159,56 +159,57 @@ def _get_hyperparameter_search_space(self, # Here we add custom code, like this with this # is not a valid configuration # Learned Entity Embedding is only valid when encoder is one hot encoder - if 'network_embedding' in self.named_steps.keys() and 'feature_encoding' in self.named_steps.keys(): + if 'network_embedding' in self.named_steps.keys(): embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices if 'LearnedEntityEmbedding' in embeddings: - feature_encodings = cs.get_hyperparameter('feature_encoding:__choice__').choices - default = cs.get_hyperparameter('network_embedding:__choice__').default_value - possible_default_embeddings = copy.copy(list(embeddings)) - del possible_default_embeddings[possible_default_embeddings.index(default)] - - for encoding in feature_encodings: - if encoding == 'OneHotEncoder': - continue - while True: - try: - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(cs.get_hyperparameter( - 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), - ForbiddenEqualsClause(cs.get_hyperparameter('feature_encoding:__choice__'), encoding) - )) - break - except ValueError: - # change the default and try again + if 'feature_encoding' in self.named_steps.keys(): + feature_encodings = cs.get_hyperparameter('feature_encoding:__choice__').choices + default = cs.get_hyperparameter('network_embedding:__choice__').default_value + possible_default_embeddings = copy.copy(list(embeddings)) + del possible_default_embeddings[possible_default_embeddings.index(default)] + + for encoding in feature_encodings: + if encoding == 'OneHotEncoder': + continue + while True: try: - default = possible_default_embeddings.pop() - except IndexError: - raise ValueError("Cannot find a legal default configuration") - cs.get_hyperparameter('network_embedding:__choice__').default_value = default - """ - # in this case we cannot deactivate the hps, we might need to think about this - if 'RegressionLoss' in hp_loss.choices: - forbidden_hp_regression_loss = ForbiddenEqualsClause(hp_loss, 'RegressionLoss') - for hp_dist in hp_distribution_children: - forbidden_hp_dist = ForbiddenEqualsClause(hp_dist, True) - forbidden_hp_dist = AndConjunction(forbidden_hp_dist, forbidden_hp_regression_loss) - forbidden_regression_losses_all.append(forbidden_hp_dist) - else: - for hp_dist in hp_distribution_children: - forbidden_hp_dist = ForbiddenEqualsClause(hp_dist, True) - forbidden_regression_losses_all.append(forbidden_hp_dist) - """ - - if 'network_embedding' in self.named_steps.keys(): - # NBEATS only works with NoEmbedding - if 'network_backbone:flat_encoder:__choice__' in cs: - hp_flat_encoder = cs.get_hyperparameter('network_backbone:flat_encoder:__choice__') - if 'NBEATSEncoder' in 
hp_flat_encoder.choices: - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(hp_flat_encoder, 'NBEATSEncoder'), - ForbiddenEqualsClause(cs.get_hyperparameter( - 'network_embedding:__choice__'), 'LearnedEntityEmbedding')) - ) + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), + ForbiddenEqualsClause( + cs.get_hyperparameter('feature_encoding:__choice__'), encoding + ))) + break + except ValueError: + # change the default and try again + try: + default = possible_default_embeddings.pop() + except IndexError: + raise ValueError("Cannot find a legal default configuration") + cs.get_hyperparameter('network_embedding:__choice__').default_value = default + """ + # in this case we cannot deactivate the hps, we might need to think about this + if 'RegressionLoss' in hp_loss.choices: + forbidden_hp_regression_loss = ForbiddenEqualsClause(hp_loss, 'RegressionLoss') + for hp_dist in hp_distribution_children: + forbidden_hp_dist = ForbiddenEqualsClause(hp_dist, True) + forbidden_hp_dist = AndConjunction(forbidden_hp_dist, + forbidden_hp_regression_loss) + forbidden_regression_losses_all.append(forbidden_hp_dist) + else: + for hp_dist in hp_distribution_children: + forbidden_hp_dist = ForbiddenEqualsClause(hp_dist, True) + forbidden_regression_losses_all.append(forbidden_hp_dist) + """ + + if 'network_backbone:flat_encoder:__choice__' in cs: + hp_flat_encoder = cs.get_hyperparameter('network_backbone:flat_encoder:__choice__') + if 'NBEATSEncoder' in hp_flat_encoder.choices: + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(hp_flat_encoder, 'NBEATSEncoder'), + ForbiddenEqualsClause(cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding')) + ) # dist_cls and auto_regressive are only activate if the network outputs distribution if 'loss' in self.named_steps.keys() and 'network_backbone' in self.named_steps.keys(): diff --git a/test/test_pipeline/components/training/test_time_series_data_loader.py b/test/test_pipeline/components/training/test_time_series_data_loader.py index e6cc2c473..5ac84c23b 100644 --- a/test/test_pipeline/components/training/test_time_series_data_loader.py +++ b/test/test_pipeline/components/training/test_time_series_data_loader.py @@ -345,6 +345,20 @@ def test_get_loader(self, loader_init_mock): self.assertTrue(seq.is_test_set) self.assertEqual(seq.freq, time_series_dataloader.freq) + class DummyEncoder: + def fit(self, data): + return self + + def transform(self, data: pd.DataFrame): + return np.concatenate([data.values, data.values], axis=-1) + + transform = DummyEncoder() + time_series_dataloader.feature_preprocessor = transform + _ = time_series_dataloader.get_loader(X=copy.deepcopy(x_test)) + test_set = loader_init_mock.call_args[0][0] + for seq_raw, seq in zip(x_test, test_set): + self.assertTrue(seq.X.shape[-1] == 2 * seq_raw.X.shape[-1]) + class TestTimeSeriesUtil(unittest.TestCase): def test_test_seq_length(self): diff --git a/test/test_pipeline/test_time_series_forecasting_pipeline.py b/test/test_pipeline/test_time_series_forecasting_pipeline.py index 7fedf48ec..3e34b71b7 100644 --- a/test/test_pipeline/test_time_series_forecasting_pipeline.py +++ b/test/test_pipeline/test_time_series_forecasting_pipeline.py @@ -11,6 +11,32 @@ def network_type(request): return request.param +@pytest.fixture(params=['LearnedEntityEmbedding', 'NoEmbedding']) +def embedding(request): + return request.param + + 
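# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patches): the embedding / feature_encoding
# fixtures around this point exercise the combinations that the forecasting
# pipeline's configuration space permits. The invalid pair (LearnedEntityEmbedding
# together with a non-one-hot encoder) is ruled out in the pipeline above via
# ConfigSpace forbidden clauses. A minimal, self-contained sketch of that
# pattern, with hypothetical hyperparameter values, might look like this:
from ConfigSpace import ConfigurationSpace
from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause
from ConfigSpace.hyperparameters import CategoricalHyperparameter

cs = ConfigurationSpace()
embedding_hp = CategoricalHyperparameter('network_embedding:__choice__',
                                         ['NoEmbedding', 'LearnedEntityEmbedding'])
encoding_hp = CategoricalHyperparameter('feature_encoding:__choice__',
                                        ['OneHotEncoder', 'NoEncoder'])
cs.add_hyperparameters([embedding_hp, encoding_hp])

# Sampled configurations will never contain this value combination.
cs.add_forbidden_clause(ForbiddenAndConjunction(
    ForbiddenEqualsClause(embedding_hp, 'LearnedEntityEmbedding'),
    ForbiddenEqualsClause(encoding_hp, 'NoEncoder'),
))
# ---------------------------------------------------------------------------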
+@pytest.fixture(params=['OneHotEncoder', 'NoEncoder']) +def feature_encoding(request): + return request.param + + +def generate_light_updates(updates: HyperparameterSearchSpaceUpdates): + updates.append(node_name='data_loader', + hyperparameter='window_size', + value_range=[3, 10], + default_value=5) + updates.append(node_name='data_loader', + hyperparameter='batch_size', + value_range=[2, 5], + default_value=4) + updates.append(node_name='data_loader', + hyperparameter="num_batches_per_epoch", + value_range=(3, 10), + default_value=5) + return updates + + class TestTimeSeriesForecastingPipeline: @pytest.mark.parametrize("fit_dictionary_forecasting", ["uni_variant_wo_missing", "uni_variant_w_missing", @@ -24,10 +50,14 @@ def test_fit_predict(self, fit_dictionary_forecasting, forecasting_budgets): include = {'network_embedding': ['LearnedEntityEmbedding']} else: include = None + updates = HyperparameterSearchSpaceUpdates() + updates = generate_light_updates(updates) pipeline = TimeSeriesForecastingPipeline(dataset_properties=dataset_properties, - include=include) + include=include, + search_space_updates=updates) step_names = pipeline.named_steps.keys() - step_names_multi_processing = ['impute', 'scaler', 'encoding', 'time_series_transformer', 'preprocessing'] + step_names_multi_processing = ['impute', 'scaler', 'feature_encoding', + 'time_series_transformer', 'preprocessing'] steps_multi_in_pipeline = [step_name_multi in step_names for step_name_multi in step_names_multi_processing] @@ -51,8 +81,11 @@ def test_fit_predict(self, fit_dictionary_forecasting, forecasting_budgets): ], indirect=True) def test_fit_budgets_types(self, fit_dictionary_forecasting, forecasting_budgets): dataset_properties = fit_dictionary_forecasting['dataset_properties'] + updates = HyperparameterSearchSpaceUpdates() + updates = generate_light_updates(updates) - pipeline = TimeSeriesForecastingPipeline(dataset_properties=dataset_properties) + pipeline = TimeSeriesForecastingPipeline(dataset_properties=dataset_properties, + search_space_updates=updates) fit_dict = copy.copy(fit_dictionary_forecasting) pipeline = pipeline.fit(fit_dict) datamanager = fit_dictionary_forecasting['backend'].load_datamanager() @@ -61,11 +94,41 @@ def test_fit_budgets_types(self, fit_dictionary_forecasting, forecasting_budgets assert list(predict.shape) == [len(test_sets) * dataset_properties['n_prediction_steps']] + @pytest.mark.parametrize("fit_dictionary_forecasting", [["multi_variant_wo_missing"]], indirect=True) + def test_network_encoding_variable_selection(self, fit_dictionary_forecasting, embedding, feature_encoding): + if embedding == 'LearnedEntityEmbedding' and feature_encoding == 'NoEncoder': + return + include = {'network_embedding': [embedding], + 'feature_encoding': [feature_encoding], + 'network_backbone': ['seq_encoder'] + } + updates = HyperparameterSearchSpaceUpdates() + updates = generate_light_updates(updates) + + updates.append(node_name='network_backbone', + hyperparameter='seq_encoder:num_blocks', + value_range=[1, 1], + default_value=1) + updates.append(node_name='network_backbone', + hyperparameter='seq_encoder:variable_selection', + value_range=[True, ], + default_value=True) + + dataset_properties = fit_dictionary_forecasting['dataset_properties'] + + pipeline = TimeSeriesForecastingPipeline(dataset_properties=dataset_properties, + include=include, + search_space_updates=updates) + fit_dict = copy.copy(fit_dictionary_forecasting) + # No error should be raised + _ = pipeline.fit(fit_dict) + 
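# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patches): the tests above shrink the
# search space with HyperparameterSearchSpaceUpdates before instantiating the
# pipeline. A minimal sketch of that usage pattern; `build_light_pipeline` is a
# hypothetical helper, and `dataset_properties` is assumed to come from a
# forecasting dataset as in the fixtures used by these tests:
from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline
from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates


def build_light_pipeline(dataset_properties):
    updates = HyperparameterSearchSpaceUpdates()
    # Keep the data loader cheap so that a single fit stays fast.
    updates.append(node_name='data_loader', hyperparameter='window_size',
                   value_range=[3, 10], default_value=5)
    updates.append(node_name='data_loader', hyperparameter='batch_size',
                   value_range=[2, 5], default_value=4)
    return TimeSeriesForecastingPipeline(dataset_properties=dataset_properties,
                                         search_space_updates=updates)
# ---------------------------------------------------------------------------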
@pytest.mark.parametrize("fit_dictionary_forecasting", ["multi_variant_w_missing"], indirect=True) def test_networks(self, fit_dictionary_forecasting, network_type): dataset_properties = fit_dictionary_forecasting['dataset_properties'] updates = HyperparameterSearchSpaceUpdates() + updates = generate_light_updates(updates) if network_type == 'NBEATSNet': include = {'network_backbone': ['flat_encoder:NBEATSEncoder'], @@ -104,8 +167,8 @@ def test_networks(self, fit_dictionary_forecasting, network_type): updates.append(node_name='network_backbone', hyperparameter='seq_encoder:block_1:MLPDecoder:auto_regressive', - value_range=[False, ], - default_value=False) + value_range=[True, ], + default_value=True) pipeline = TimeSeriesForecastingPipeline(dataset_properties=dataset_properties, include=include, From f3cb2dedefc3b322ac364557a2dec8019624c0d2 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 23 May 2022 21:14:08 +0200 Subject: [PATCH 282/347] fix dataset --- autoPyTorch/api/base_task.py | 3 +- autoPyTorch/api/time_series_forecasting.py | 4 +- autoPyTorch/datasets/time_series_dataset.py | 97 +++++++++++-------- autoPyTorch/evaluation/tae.py | 3 +- autoPyTorch/optimizer/smbo.py | 1 - autoPyTorch/optimizer/utils.py | 1 + .../forecasting_base_trainer.py | 2 + test/conftest.py | 7 ++ test/test_api/utils.py | 33 ++++++- .../test_time_series_datasets.py | 33 +++++-- .../test_forecasting_evaluators.py | 36 +++++++ 11 files changed, 162 insertions(+), 58 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 9f108cf71..9688259f9 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -260,7 +260,7 @@ def __init__( raise ValueError("Expected search space updates to be of instance" " HyperparameterSearchSpaceUpdates got {}".format(type(self.search_space_updates))) - self.time_series_forecasting = False + self.time_series_forecasting = task_type == 'time_series_forecasting' @abstractmethod def build_pipeline( @@ -1126,7 +1126,6 @@ def _search( self.search_space = self.get_search_space(dataset) - # Incorporate budget to pipeline config if budget_type not in ('epochs', 'runtime') and (budget_type in FORECASTING_BUDGET_TYPE and not self.time_series_forecasting): diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 6b6263554..c5d6f2fb3 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -1,5 +1,3 @@ -import os -import uuid from typing import Any, Callable, Dict, List, Optional, Union, Tuple, Mapping import numpy as np @@ -247,7 +245,7 @@ def search( y_test: Optional[Union[List, pd.DataFrame]] = None, n_prediction_steps: int = 1, freq: Optional[Union[str, int, List[int]]] = None, - start_times: List[pd.DatetimeIndex] = [], + start_times: Optional[List[pd.DatetimeIndex]] = None, dataset_name: Optional[str] = None, budget_type: str = 'epochs', min_budget: Union[int, str] = 5, diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 1bc445ca7..1fa7e28df 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -1,5 +1,4 @@ import os -import pdb from typing import Any, Dict, List, Optional, Tuple, Union, cast from numbers import Real import uuid @@ -93,6 +92,7 @@ def compute_time_features(start_time: pd.Timestamp, class TimeSeriesSequence(Dataset): + _is_test_set = False def __init__(self, X: Optional[np.ndarray], Y: np.ndarray, @@ -137,7 
+137,7 @@ def __init__(self, X_test = X_test[:, np.newaxis] self.X_test = X_test - self.Y_tet = Y_test + self.Y_test = Y_test self.time_feature_transform = time_feature_transform @@ -161,7 +161,8 @@ def __init__(self, self.transform_time_features = False self._cached_time_features: Optional[np.ndarray] = time_features - self._is_test_set = is_test_set + + self.future_observed_target = None self.is_test_set = is_test_set @property @@ -170,14 +171,13 @@ def is_test_set(self): @is_test_set.setter def is_test_set(self, value: bool): - if value != self._is_test_set and self.known_future_features_index: - if self.X_test is None: - raise ValueError("If future features are required, X_test must be given for" - " setting TimeSeriesSequence as test set!") - if value is True: - self.X = np.concatenate([self.X, self.X_test]) - else: - self.X = self.X[:-len(self.X_test)] + if value and value != self._is_test_set: + if self.known_future_features_index: + if self.X_test is None: + raise ValueError('When future features are known, X_test ' + 'for Time Series Sequences must be given!') + if self.Y_test is not None: + self.future_observed_target = ~np.isnan(self.Y_test) self._is_test_set = value def __getitem__(self, index: int, train: bool = True) \ @@ -202,9 +202,12 @@ def __getitem__(self, index: int, train: bool = True) \ past_features = self.X[:index + 1] if self.known_future_features_index: - future_features = self.X[ - index + 1: index + self.n_prediction_steps + 1, self.known_future_features_index - ] + if not self.is_test_set: + future_features = self.X[ + index + 1: index + self.n_prediction_steps + 1, self.known_future_features_index + ] + else: + future_features = self.X_test[:, self.known_future_features_index] else: future_features = None else: @@ -242,7 +245,13 @@ def __getitem__(self, index: int, train: bool = True) \ # In case of prediction, the targets are not provided targets = self.Y if self.is_test_set: - future_targets = None + if self.Y_test is not None: + future_targets = { + 'future_targets': torch.from_numpy(self.Y_test), + 'future_observed_targets': torch.from_numpy(self.future_observed_target) + } + else: + future_targets = None else: future_targets = targets[index + 1: index + self.n_prediction_steps + 1] future_targets = torch.from_numpy(future_targets) @@ -320,29 +329,49 @@ def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": if index < 0: index = self.__len__() + index if index >= self.__len__() - 1: - return copy.copy(self) + # TODO consider X_test? 
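+            # The branch below builds a validation copy of this sequence: it
+            # deep-copies the sequence, moves the last n_prediction_steps steps
+            # of X and Y into X_test and Y_test (together with the observed-target
+            # mask), and marks the copy as a test set so that the held-out horizon
+            # is returned as future targets.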
+ val_set = copy.deepcopy(self) + if val_set.X is not None: + val_set.X_test = val_set.X[-self.n_prediction_steps:] + val_set.X = val_set.X[:-self.n_prediction_steps] + val_set.Y_test = val_set.Y[-self.n_prediction_steps:] + val_set.Y = val_set.Y[:-self.n_prediction_steps] + val_set.future_observed_target = val_set.observed_target[-self.n_prediction_steps:] + val_set.observed_target = val_set.observed_target[:-self.n_prediction_steps] + val_set.is_test_set = True + + return val_set else: if self.X is not None: - X = self.X[:index + 1 + self.n_prediction_steps] + X = self.X[:index + 1] else: X = None + if self.known_future_features_index: + X_test = self.X[index + 1: index + 1 + self.n_prediction_steps] + else: + X_test = None if self._cached_time_features is None: cached_time_features = None else: cached_time_features = self._cached_time_features[:index + 1 + self.n_prediction_steps] - return TimeSeriesSequence(X=X, - Y=self.Y[:index + 1 + self.n_prediction_steps], - start_time=self.start_time, - freq=self.freq, - time_feature_transform=self.time_feature_transform, - train_transforms=self.train_transform, - val_transforms=self.val_transform, - n_prediction_steps=self.n_prediction_steps, - known_future_features_index=self.known_future_features_index, - sp=self.sp, - compute_mase_coefficient_value=False, - time_features=cached_time_features) + val_set = TimeSeriesSequence(X=X, + Y=self.Y[:index + 1], + X_test=X_test, + Y_test=self.Y[index + 1: index + 1 + self.n_prediction_steps], + start_time=self.start_time, + freq=self.freq, + time_feature_transform=self.time_feature_transform, + train_transforms=self.train_transform, + val_transforms=self.val_transform, + n_prediction_steps=self.n_prediction_steps, + known_future_features_index=self.known_future_features_index, + sp=self.sp, + compute_mase_coefficient_value=False, + time_features=cached_time_features, + is_test_set=True) + + return val_set def get_test_target(self, test_idx: int): if self.is_test_set: @@ -518,7 +547,6 @@ def __init__(self, future_feature_shapes: Tuple[int, int] = (self.seq_length_min, len(known_future_features)) self.encoder_can_be_auto_regressive = (self.input_shape[-1] == future_feature_shapes[-1]) - if len(self.train_tensors) == 2 and self.train_tensors[1] is not None: self.output_type: str = type_of_target(self.train_tensors[1][0].fillna(method="pad")) @@ -685,7 +713,7 @@ def transform_data_into_time_series_sequence(self, Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, Y_test: Optional[ Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, - is_test_set: bool = False) ->[ + is_test_set: bool = False) -> [ ]: """ @@ -714,8 +742,6 @@ def transform_data_into_time_series_sequence(self, training tensors """ dataset_with_future_features = X is not None and len(self.known_future_features) > 0 - if dataset_with_future_features and X_test is None: - raise ValueError('When constructing test sets and known future features exist, X_test must be given!') X, Y, sequence_lengths = self.validator.transform(X, Y) time_features = self.compute_time_features(start_times, sequence_lengths, @@ -1195,9 +1221,4 @@ def generate_test_seqs(self) -> List[TimeSeriesSequence]: test_sets = copy.deepcopy(self.datasets) for test_seq in test_sets: test_seq.is_test_set = True - if len(self.known_future_features) > 0: - if test_seq.X_test is None: - raise ValueError("If future features are required, X_test must be given!") - test_seq.X = np.concatenate([test_seq.X, test_seq.X_test]) return test_sets - diff --git 
a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index c58383b0d..705c6cb76 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -128,7 +128,7 @@ def __init__( logger_port: int = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - **eval_func_kwargs: Dict): + **eval_func_kwargs: Any): self.backend = backend @@ -151,7 +151,6 @@ def __init__( elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): eval_function = functools.partial(eval_train_function, **eval_func_kwargs) self.output_y_hat_optimization = False - self.worst_possible_result = cost_for_crash eval_function = functools.partial( diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 5341c9d6b..998b98f32 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -285,7 +285,6 @@ def __init__(self, self.initial_configurations = initial_configurations if len(initial_configurations) > 0 else None - def reset_data_manager(self) -> None: if self.datamanager is not None: del self.datamanager diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index 58d8e57e0..c2e753ba1 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -83,6 +83,7 @@ def read_forecasting_init_configurations(config_space: ConfigurationSpace, configuration = Configuration(config_space, configuration_dict) initial_configurations.append(configuration) except Exception as e: + continue warnings.warn(f"Failed to convert {configuration_dict} into" f" a Configuration with error {e}. " f"Therefore, it can't be used as an initial " diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 0c1fe145b..c48e004a0 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -176,6 +176,8 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: Dict[str, to if isinstance(self.model, ForecastingDeepARNet) and self.model.encoder_bijective_seq_output: if self.window_size > past_target.shape[1]: all_targets = torch.cat([past_target[:, 1:, ], future_targets_values], dim=1) + future_observed_targets = torch.cat([past_observed_targets[:, 1:, ], + future_observed_targets], dim=1) else: if self.window_size == 1: all_targets = future_targets_values diff --git a/test/conftest.py b/test/conftest.py index 9b08d506d..9ffe0c6ee 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -740,6 +740,7 @@ def get_forecasting_datamangaer(X, y, validator, with_y_test=True, forecast_hori else: X_test = None known_future_features = None + if with_y_test: y_test = [] for y_seq in y: @@ -863,6 +864,12 @@ def get_forecasting_datamanager(request): return datamanager +@pytest.fixture +def forecasting_toy_dataset(request): + x, y, _ = get_forecasting_data(request.param) + return x, y + + @pytest.fixture(params=['epochs']) def forecasting_budgets(request): return request.param diff --git a/test/test_api/utils.py b/test/test_api/utils.py index f8a11db88..0c209fac5 100644 --- a/test/test_api/utils.py +++ b/test/test_api/utils.py @@ -2,13 +2,15 @@ from smac.runhistory.runhistory import DataOrigin, RunHistory, RunKey, RunValue, StatusType -from autoPyTorch.constants import 
REGRESSION_TASKS +from autoPyTorch.constants import REGRESSION_TASKS, CLASSIFICATION_TASKS, FORECASTING_TASKS from autoPyTorch.evaluation.abstract_evaluator import ( DummyClassificationPipeline, DummyRegressionPipeline, + DummyTimeSeriesForecastingPipeline, fit_and_suppress_warnings ) from autoPyTorch.evaluation.train_evaluator import TrainEvaluator +from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator from autoPyTorch.pipeline.traditional_tabular_classification import TraditionalTabularClassificationPipeline @@ -33,8 +35,9 @@ def _fit_and_predict(self, pipeline, fold: int, train_indices, test_indices, add_pipeline_to_self ): - - if self.task_type in REGRESSION_TASKS: + if self.task_type in FORECASTING_TASKS: + pipeline = DummyTimeSeriesForecastingPipeline(config=1) + elif self.task_type in REGRESSION_TASKS: pipeline = DummyRegressionPipeline(config=1) else: pipeline = DummyClassificationPipeline(config=1) @@ -68,6 +71,16 @@ def _fit_and_predict(self, pipeline, fold: int, train_indices, return Y_train_pred, Y_opt_pred, Y_valid_pred, Y_test_pred +class DummyForecastingEvaluator(TimeSeriesForecastingTrainEvaluator): + def _fit_and_predict(self, pipeline, fold: int, train_indices, + test_indices, + add_pipeline_to_self + ): + return DummyTrainEvaluator._fit_and_predict(self, + pipeline, fold, train_indices, test_indices, + add_pipeline_to_self) + + # create closure for evaluating an algorithm def dummy_eval_train_function( backend, @@ -88,8 +101,17 @@ def dummy_eval_train_function( all_supported_metrics=True, search_space_updates=None, instance: str = None, + evaluator_class=None, + **evaluator_kwargs, ) -> None: - evaluator = DummyTrainEvaluator( + if evaluator_class is None: + evaluator_class = DummyTrainEvaluator + elif isinstance(evaluator_class, FORECASTING_TASKS): + evaluator_class = DummyForecastingEvaluator + import pdb + pdb.set_trace() + + evaluator = evaluator_class( backend=backend, queue=queue, metric=metric, @@ -106,7 +128,8 @@ def dummy_eval_train_function( logger_port=logger_port, all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, - search_space_updates=search_space_updates + search_space_updates=search_space_updates, + **evaluator_kwargs ) evaluator.fit_predict_and_loss() diff --git a/test/test_datasets/test_time_series_datasets.py b/test/test_datasets/test_time_series_datasets.py index b1062ed9e..95504d5fe 100644 --- a/test/test_datasets/test_time_series_datasets.py +++ b/test/test_datasets/test_time_series_datasets.py @@ -38,7 +38,7 @@ def setUp(self) -> None: self.y_test = rng.rand(self.n_prediction_steps, 1) self.time_feature_transform = [DayOfMonth(), ConstantTransform(10.0)] self.known_future_features_index = [0, 2] - self.seq_uni = TimeSeriesSequence(X=None, Y=self.y, Y_test=self.y_test, + self.seq_uni = TimeSeriesSequence(X=None, Y=self.y, n_prediction_steps=self.n_prediction_steps, time_feature_transform=self.time_feature_transform) self.seq_multi = TimeSeriesSequence(X=self.x_data, @@ -77,7 +77,7 @@ def test_sequence_uni_variant_base(self): self.assertTrue(self.seq_uni[-2][0]["past_targets"].size, self.data_length - self.n_prediction_steps - 2 + 1) - def test_get_val_seq_and_test_targets(self): + def test_uni_get_val_seq_and_test_targets(self): val_seq = self.seq_uni.get_val_seq_set(-1) self.assertEqual(len(val_seq), len(self.seq_uni)) @@ -92,6 +92,22 @@ def test_get_val_seq_and_test_targets(self): test_targets = self.seq_uni.get_test_target(5) self.assertTrue(np.all(self.y[5 + 1: 5 
+ 1 + self.n_prediction_steps] == test_targets)) + def test_multi_get_val_seq(self): + val_seq = self.seq_multi_with_future.get_val_seq_set(-1) + self.assertTrue(len(val_seq), len(self.seq_multi_with_future)) + + val_seq = self.seq_multi_with_future.get_val_seq_set(3) + self.assertTrue(np.array_equal(val_seq.X, self.seq_multi_with_future.X[:4])) + self.assertTrue(np.array_equal(val_seq.X_test, self.seq_multi_with_future.X[4:7])) + + val_seq = self.seq_multi_with_future.get_val_seq_set(len(self.seq_multi_with_future) - 1) + self.assertTrue(len(val_seq), len(self.seq_multi_with_future)) + + val_seq = self.seq_multi_with_future.get_val_seq_set(len(self.seq_multi_with_future) - 2) + + self.assertTrue(np.array_equal(val_seq.X, self.seq_multi_with_future.X[:6])) + self.assertTrue(np.array_equal(val_seq.X_test, self.seq_multi_with_future.X[6:9])) + def test_uni_get_update_time_features(self): self.seq_uni.update_attribute(transform_time_features=True) @@ -172,7 +188,7 @@ def test_multi_transform_features(self): def test_multi_to_test_set(self): self.seq_multi_with_future.is_test_set = True - self.assertEqual(len(self.seq_multi_with_future.X), len(self.x_data) + len(self.x_test_data)) + self.assertEqual(len(self.seq_multi_with_future.X), len(self.x_data)) data, _ = self.seq_multi_with_future[-1] self.assertTrue(np.allclose(data["past_features"].numpy(), self.x_data)) @@ -197,11 +213,13 @@ def test_transformation(self): def test_exception(self): seq_1 = TimeSeriesSequence(X=self.x_data, Y=self.y, X_test=None, - known_future_features_index=self.known_future_features_index) + known_future_features_index=self.known_future_features_index, + is_test_set=False) + with self.assertRaises(ValueError): seq_1.is_test_set = True - seq_2 = TimeSeriesSequence(X=self.x_data, Y=self.y, X_test=None, + seq_2 = TimeSeriesSequence(X=self.x_data, Y=self.y, X_test=self.x_test_data, is_test_set=True) with self.assertRaises(ValueError): @@ -288,6 +306,7 @@ def test_dataset_index(backend, fit_dictionary_forecasting): # test for validation indices val_indices = datamanager.splits[0][1] val_set = [datamanager.get_validation_set(val_idx) for val_idx in val_indices] + val_targets = np.concatenate([val_seq[-1][1]['future_targets'].numpy() for val_seq in val_set]) assert np.allclose(val_targets, datamanager.get_test_target(val_indices)) @@ -313,7 +332,7 @@ def test_update_dataset(backend, fit_dictionary_forecasting): new_test_seq = datamanager.generate_test_seqs() for seq_len, test_seq in zip(seq_lengths, new_test_seq): # seq_len is len(y) - n_prediction_steps, here we expand X_test with another n_prediction_steps - assert test_seq.X.shape[0] - seq_len == 2 * datamanager.n_prediction_steps + assert test_seq.X.shape[0] - seq_len == datamanager.n_prediction_steps @pytest.mark.parametrize("fit_dictionary_forecasting", ['multi_variant_wo_missing'], indirect=True) @@ -322,7 +341,7 @@ def test_test_tensors(backend, fit_dictionary_forecasting): test_tensors = datamanager.test_tensors forecast_horizon = datamanager.n_prediction_steps n_seq = len(datamanager.datasets) - assert test_tensors[0] is None + assert test_tensors[0].shape == (n_seq * forecast_horizon, datamanager.num_features) assert test_tensors[1].shape == (n_seq * forecast_horizon, datamanager.num_targets) datamanager2 = TimeSeriesForecastingDataset(X=None, Y=[[1, 2]]) diff --git a/test/test_evaluation/test_forecasting_evaluators.py b/test/test_evaluation/test_forecasting_evaluators.py index 2423d400f..fd55abd7c 100644 --- a/test/test_evaluation/test_forecasting_evaluators.py 
+++ b/test/test_evaluation/test_forecasting_evaluators.py @@ -41,6 +41,41 @@ def setUp(self): def tearDown(self): TestTrainEvaluator.tearDown(self) + @unittest.mock.patch('autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline') + def test_budget_type_choices(self, pipeline_mock): + D = get_forecasting_dataset() + n_prediction_steps = D.n_prediction_steps + pipeline_mock.predict.side_effect = \ + lambda X, batch_size=None: np.tile([0.], (len(X), n_prediction_steps)) + pipeline_mock.side_effect = lambda **kwargs: pipeline_mock + pipeline_mock.get_additional_run_info.return_value = None + + configuration = unittest.mock.Mock(spec=Configuration) + backend_api = create(self.tmp_dir, self.output_dir, prefix='autoPyTorch') + backend_api.load_datamanager = lambda: D + queue_ = multiprocessing.Queue() + + budget_value = 0.1 + + for budget_type in ['resolution', 'num_seq', 'num_sample_per_seq']: + evaluator = TimeSeriesForecastingTrainEvaluator(backend_api, + queue_, + configuration=configuration, + metric=mean_MASE_forecasting, budget=0, + pipeline_config={'budget_type': budget_type, + budget_type: 0.1}, + min_num_test_instances=100) + self.assertTrue('epochs' not in evaluator.fit_dictionary) + if budget_type == 'resolution': + self.assertTrue('sample_interval' in evaluator.fit_dictionary) + self.assertEqual(int(np.ceil(1.0 / budget_value)), evaluator.fit_dictionary['sample_interval']) + elif budget_type == 'num_seq': + self.assertTrue('fraction_seq' in evaluator.fit_dictionary) + self.assertEqual(budget_value, evaluator.fit_dictionary['fraction_seq']) + if budget_type == 'num_sample_per_seq': + self.assertTrue('fraction_samples_per_seq' in evaluator.fit_dictionary) + self.assertEqual(budget_value, evaluator.fit_dictionary['fraction_samples_per_seq']) + @unittest.mock.patch('autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline') def test_holdout(self, pipeline_mock): pipeline_mock.fit_dictionary = {'budget_type': 'epochs', 'epochs': 50} @@ -62,6 +97,7 @@ def test_holdout(self, pipeline_mock): metric=mean_MASE_forecasting, budget=0, pipeline_config={'budget_type': 'epochs', 'epochs': 50}, min_num_test_instances=100) + self.assertTrue('epochs' in evaluator.fit_dictionary) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) evaluator.file_output.return_value = (None, {}) From 0af1217cddb97a3a281457899da2dd58e56b348d Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 24 May 2022 11:49:23 +0200 Subject: [PATCH 283/347] tests on api, initial design on multi-variant --- autoPyTorch/api/base_task.py | 6 +- autoPyTorch/api/time_series_forecasting.py | 2 +- .../configs/forecasting_init_cfgs.json | 21 ++- autoPyTorch/datasets/time_series_dataset.py | 51 ++++-- autoPyTorch/evaluation/abstract_evaluator.py | 4 +- ...time_series_forecasting_train_evaluator.py | 64 +++++-- autoPyTorch/optimizer/smbo.py | 6 +- autoPyTorch/optimizer/utils.py | 12 +- .../time_series_forecasting_data_loader.py | 40 +++-- test/test_api/test_api.py | 166 ++++++++++++++++++ test/test_api/test_base_api.py | 41 +++++ .../test_time_series_datasets.py | 8 + test/test_evaluation/evaluation_util.py | 2 +- .../test_forecasting_evaluators.py | 66 ++++++- 14 files changed, 421 insertions(+), 68 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 9688259f9..e33d4e415 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1346,10 +1346,12 @@ def _get_fit_dictionary( ) -> Dict[str, Any]: X_test = 
dataset.test_tensors[0].copy() if dataset.test_tensors is not None else None y_test = dataset.test_tensors[1].copy() if dataset.test_tensors is not None else None + X_train = dataset.train_tensors[0].copy() if dataset.train_tensors[0] is not None else None + y_train = dataset.train_tensors[1].copy() X: Dict[str, Any] = dict({'dataset_properties': dataset_properties, 'backend': self._backend, - 'X_train': dataset.train_tensors[0].copy(), - 'y_train': dataset.train_tensors[1].copy(), + 'X_train': X_train, + 'y_train': y_train, 'X_test': X_test, 'y_test': y_test, 'train_indices': dataset.splits[split_id][0], diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index c5d6f2fb3..85058649a 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -472,7 +472,7 @@ def predict( is_test_set=True ) flattened_res = super(TimeSeriesForecastingTask, self).predict(X_test, batch_size, n_jobs) - if self.dataset.num_target == 1: + if self.dataset.num_targets == 1: forecasting = flattened_res.reshape([-1, self.dataset.n_prediction_steps]) else: forecasting = flattened_res.reshape([-1, self.dataset.n_prediction_steps, self.dataset.num_target]) diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index adb7c66ef..f423b912a 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -15,11 +15,22 @@ "optimizer:AdamOptimizer:beta1": 0.9, "optimizer:AdamOptimizer:beta2": 0.999, "network_init:__choice__": "XavierInit", - "network_init:XavierInit:bias_strategy": "Zero", + "network_init:XavierInit:bias_strategy": "Normal", "target_scaler:__choice__": "TargetMeanAbsScaler", "trainer:__choice__": "ForecastingStandardTrainer", "network_embedding:__choice__": "NoEmbedding" }, + "feature_preprocessing": { + "feature_encoding:__choice__": "OneHotEncoder", + "scaler:scaling_mode": "standard", + "network_embedding:__choice__": "NoEmbedding" + }, + "feature_imputer": { + "imputer:numerical_strategy": "ffill" + }, + "target_imputer": { + "target_imputer:numerical_strategy": "ffill" + }, "models": { "MLP": { "loss:__choice__": "DistributionLoss", @@ -50,7 +61,7 @@ "network_backbone:seq_encoder:use_temporal_fusion": false, "network_backbone:seq_encoder:variable_selection": false, "network_backbone:seq_encoder:block_1:__choice__": "RNNEncoder", - "network_backbone:seq_encoder:decoder_auto_regressive": true, + "network_backbone:seq_encoder:decoder_auto_regressive": false, "network_backbone:seq_encoder:block_1:RNNEncoder:cell_type": "lstm", "network_backbone:seq_encoder:block_1:RNNEncoder:num_layers": 2, "network_backbone:seq_encoder:block_1:RNNEncoder:hidden_size": 40, @@ -145,7 +156,6 @@ "network_backbone:seq_encoder:block_1:__choice__": "TransformerEncoder", "network_backbone:seq_encoder:block_1:TransformerEncoder:d_model_log": 5, "network_backbone:seq_encoder:block_1:TransformerEncoder:activation": "gelu", - "network_backbone:seq_encoder:block_1:TransformerEncoder:norm_first": true, "network_backbone:seq_encoder:block_1:TransformerEncoder:num_layers": 1, "network_backbone:seq_encoder:block_1:TransformerEncoder:decoder_type": "TransformerDecoder", "network_backbone:seq_encoder:block_1:TransformerEncoder:use_dropout": true, @@ -159,7 +169,6 @@ "network_backbone:seq_encoder:block_1:TransformerEncoder:layer_norm_eps_output": 1e-05, "network_backbone:seq_encoder:block_1:TransformerDecoder:activation": "gelu", 
"network_backbone:seq_encoder:block_1:TransformerDecoder:num_layers": 1, - "network_backbone:seq_encoder:block_1:TransformerDecoder:norm_first": true, "network_backbone:seq_encoder:block_1:TransformerDecoder:use_dropout": true, "network_backbone:seq_encoder:block_1:TransformerDecoder:use_positional_decoder": true, "network_backbone:seq_encoder:block_1:TransformerDecoder:dropout_positional_decoder": 0.1, @@ -171,6 +180,7 @@ "network_backbone:seq_encoder:block_1:TransformerDecoder:layer_norm_eps_output": 1e-05 }, "NBEATS-I": { + "target_scaler:__choice__": "TargetNoScaler", "data_loader:backcast": true, "data_loader:backcast_period": 2, "loss:__choice__": "RegressionLoss", @@ -199,7 +209,6 @@ "network_backbone:flat_encoder:NBEATSDecoder:dropout_i_2": 0.1 }, "NBEATS-G": { - "target_scaler:__choice__": "TargetNoScaler", "loss:__choice__": "RegressionLoss", "loss:RegressionLoss:loss_name": "mape", "network_backbone:__choice__": "flat_encoder", @@ -229,7 +238,7 @@ "network_backbone:seq_encoder:variable_selection": true, "network_backbone:seq_encoder:variable_selection_use_dropout": true, "network_backbone:seq_encoder:variable_selection_dropout_rate": 0.1, - "network_backbone:seq_encoder:share_single_variable_networks": false, + "network_backbone:seq_encoder:share_single_variable_networks": true, "network_backbone:seq_encoder:skip_connection_type": "gate_add_norm", "network_backbone:seq_encoder:grn_use_dropout": true, "network_backbone:seq_encoder:grn_dropout_rate": 0.1, diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 1fa7e28df..acd2c4846 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -93,6 +93,8 @@ def compute_time_features(start_time: pd.Timestamp, class TimeSeriesSequence(Dataset): _is_test_set = False + is_pre_processed = False + def __init__(self, X: Optional[np.ndarray], Y: np.ndarray, @@ -156,7 +158,7 @@ def __init__(self, self.mase_coefficient = compute_mase_coefficient(self.Y[:-n_prediction_steps], sp=self.sp) else: - self.mase_coefficient = 1.0 + self.mase_coefficient = np.asarray([1.0]) self.known_future_features_index = known_future_features_index self.transform_time_features = False @@ -282,6 +284,19 @@ def __getitem__(self, index: int, train: bool = True) \ def __len__(self) -> int: return self.Y.shape[0] if self.is_test_set else self.Y.shape[0] - self.n_prediction_steps + def get_target_values(self, index: int): + """ + Get the visible targets in the datasets without generating a tensor. 
This can be used to create a dummy pipeline + Args: + index: target index + + Returns: + y: the last visible target value + """ + if index < 0: + index = self.__len__() + index + return self.Y[index] + def cache_time_features(self, ): if self._cached_time_features is None: periods = self.Y.shape[0] @@ -871,6 +886,7 @@ def replace_data(self, if X_test is not None: seq.X_test = X_test_group.get_group(ser_id).transform(np.array).values seq.known_future_features_index = known_future_features_index + seq.is_pre_processed = True return self @@ -1052,25 +1068,26 @@ def get_split_strategy(sequence_lengths: List[int], n_repeats = int(np.ceil(100.0 / num_seqs)) else: n_repeats = int(np.round(minimal_seq_length / (50 * n_prediction_steps))) - else: - if n_repeats is None: - n_repeats = 1 - if resampling_strategy == CrossValTypes.time_series_cross_validation: - n_repeats = min(n_repeats, minimal_seq_length // (5 * n_prediction_steps * num_splits)) - elif resampling_strategy == CrossValTypes.time_series_ts_cross_validation: - seasonality_h_value = int(np.round( - (n_prediction_steps // int(freq_value) + 1) * freq_value) - ) - while minimal_seq_length // 5 < (num_splits - 1) * n_repeats * seasonality_h_value - n_prediction_steps: - n_repeats -= 1 - elif resampling_strategy == HoldoutValTypes.time_series_hold_out_validation: - n_repeats = min(n_repeats, minimal_seq_length // (5 * n_prediction_steps) - 1) + if n_repeats is None: + n_repeats = 1 + if resampling_strategy == CrossValTypes.time_series_cross_validation: + n_repeats = min(n_repeats, minimal_seq_length // (5 * n_prediction_steps * num_splits)) + elif resampling_strategy == CrossValTypes.time_series_ts_cross_validation: + seasonality_h_value = int(np.round( + (n_prediction_steps // int(freq_value) + 1) * freq_value) + ) + while minimal_seq_length // 5 < (num_splits - 1) * n_repeats * seasonality_h_value - n_prediction_steps: + n_repeats -= 1 - else: - n_repeats = 1 + elif resampling_strategy == HoldoutValTypes.time_series_hold_out_validation: + n_repeats = min(n_repeats, minimal_seq_length // (5 * n_prediction_steps) - 1) + + else: + n_repeats = 1 + + n_repeats = max(n_repeats, 1) - n_repeats = max(n_repeats, 1) if n_repeats is None: n_repeats = 1 diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 27d4ed7f4..b11d86f4b 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -327,9 +327,10 @@ def fit(self, X: Dict[str, Any], y: Any, y_train = subsampler(X['y_train'], X['train_indices']) return DummyClassifier.fit(self, np.ones((y_train.shape[0], 1)), y_train, sample_weight) + def _genreate_dummy_forecasting(self, X): if isinstance(X[0], TimeSeriesSequence): - X_tail = [x.Y[-1 - self.n_prediction_steps] for x in X] + X_tail = [x.get_target_values(-1) for x in X] else: X_tail = [x[-1] for x in X] return X_tail @@ -801,6 +802,7 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], additional_info (Dict): Additional run information, like train/test loss """ + self.duration = time.time() - self.starttime if file_output: diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 9a5225ecd..0e43c14db 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -1,3 +1,4 @@ +import copy import warnings from multiprocessing.queues import 
Queue from typing import Any, Dict, List, Optional, Tuple, Union, Sequence @@ -107,27 +108,32 @@ def fit_predict_and_loss(self) -> None: test_indices=test_split, add_pipeline_to_self=True) - mase_coefficient = self.generate_mase_coefficient_for_validation(test_split) + mase_coefficient_val = self.generate_mase_coefficient_for_validation(test_split) forecasting_kwargs = {'sp': self.seasonality, 'n_prediction_steps': self.n_prediction_steps, - 'mase_coefficient': mase_coefficient, } + forecasting_kwargs_val = copy.copy(forecasting_kwargs) + forecasting_kwargs_val['mase_coefficient'] = mase_coefficient_val + if self.y_test is not None: + mase_coefficient_test = self.generate_mase_coefficient_for_test_set() + forecasting_kwargs['mase_coefficient'] = mase_coefficient_test train_loss = None - loss = self._loss(self.Y_optimization, y_opt_pred, **forecasting_kwargs) + + loss = self._loss(self.Y_optimization, y_opt_pred, **forecasting_kwargs_val) additional_run_info = pipeline.get_additional_run_info() if hasattr( pipeline, 'get_additional_run_info') else {} status = StatusType.SUCCESS - - self.Y_optimization *= mase_coefficient + # self.Y_optimization and y_opt_pred need to be applied to construct ensembles. We simply scale them here + self.Y_optimization *= mase_coefficient_val self.finish_up( loss=loss, train_loss=train_loss, - opt_pred=y_opt_pred * mase_coefficient, + opt_pred=y_opt_pred * mase_coefficient_val, valid_pred=y_valid_pred, test_pred=y_test_pred, additional_run_info=additional_run_info, @@ -153,10 +159,17 @@ def fit_predict_and_loss(self) -> None: # weights for opt_losses. opt_fold_weights = [np.NaN] * self.num_folds - mase_coefficient_all = [] + mase_coefficient_val_all = [] for train_split, test_split in self.splits: mase_coefficient = self.generate_mase_coefficient_for_validation(test_split) - mase_coefficient_all.append(mase_coefficient) + mase_coefficient_val_all.append(mase_coefficient) + + forecasting_kwargs = {'sp': self.seasonality, + 'n_prediction_steps': self.n_prediction_steps} + + if self.y_test is not None: + mase_coefficient_test = self.generate_mase_coefficient_for_test_set() + forecasting_kwargs['mase_coefficient'] = mase_coefficient_test for i, (train_split, test_split) in enumerate(self.splits): if i > 0: @@ -181,16 +194,14 @@ def fit_predict_and_loss(self) -> None: # the average. train_fold_weights[i] = len(train_split) - forecasting_kwargs = {'mase_coefficient': mase_coefficient_all[i], - 'sp': self.seasonality, - 'n_prediction_steps': self.n_prediction_steps, - } + forecasting_kwargs_val = copy.copy(forecasting_kwargs) + forecasting_kwargs_val['mase_coefficient'] = mase_coefficient_val_all[i] # Compute validation loss of this fold and store it. optimization_loss = self._loss( self.Y_targets[i], opt_pred, - **forecasting_kwargs + **forecasting_kwargs_val ) opt_losses[i] = optimization_loss # number of optimization data points for this fold. 
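# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patches): the evaluator above stores the
# *inverse* of the MASE denominator per sequence (the "mase_coefficient") and
# multiplies both targets and predictions by it, so that a plain mean absolute
# error on the rescaled arrays equals the MASE of the original ones. A rough
# numpy sketch of that identity, assuming the seasonal-naive denominator used
# by sktime's mean_absolute_scaled_error:
import numpy as np


def mase_denominator(y_train: np.ndarray, sp: int = 1) -> float:
    # In-sample MAE of the seasonal-naive forecast y_t ~= y_{t-sp}.
    return float(np.mean(np.abs(y_train[sp:] - y_train[:-sp])))


rng = np.random.default_rng(0)
y_train = rng.normal(size=50)
y_true, y_pred = rng.normal(size=5), rng.normal(size=5)

coeff = 1.0 / mase_denominator(y_train, sp=1)           # the "mase_coefficient"
mase_direct = np.mean(np.abs(y_true - y_pred)) * coeff
mase_rescaled = np.mean(np.abs(y_true * coeff - y_pred * coeff))
assert np.isclose(mase_direct, mase_rescaled)
# ---------------------------------------------------------------------------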
@@ -219,10 +230,10 @@ def fit_predict_and_loss(self) -> None: Y_train_targets = self.Y_train_targets Y_optimization_preds = np.concatenate( - [Y_optimization_pred[i] * mase_coefficient_all[i] for i in range(self.num_folds) + [Y_optimization_pred[i] * mase_coefficient_val_all[i] for i in range(self.num_folds) if Y_optimization_pred[i] is not None]) Y_targets = np.concatenate([ - Y_targets[i] * mase_coefficient_all[i] for i in range(self.num_folds) + Y_targets[i] * mase_coefficient_val_all[i] for i in range(self.num_folds) if Y_targets[i] is not None ]) @@ -231,7 +242,7 @@ def fit_predict_and_loss(self) -> None: Y_valid_preds = None if self.y_test is not None: - Y_test_preds = np.array([Y_test_pred[i] * mase_coefficient_all[0] + Y_test_preds = np.array([Y_test_pred[i] * mase_coefficient_val_all[0] for i in range(self.num_folds) if Y_test_pred[i] is not None]) # Average the predictions of several pipelines @@ -281,6 +292,27 @@ def generate_mase_coefficient_for_validation(self, test_split: Sequence) -> np.n mase_coefficient = np.repeat(mase_coefficient, self.n_prediction_steps, axis=0) return mase_coefficient + def generate_mase_coefficient_for_test_set(self) -> np.ndarray: + """ + Compute the denominator for Mean Absolute Scaled Losses, + For detail, please check sktime.performance_metrics.forecasting._functions.mean_absolute_scaled_error + + Parameters: + ---------- + test_split: Sequence + test splits, consistent of int + Return: + ---------- + mase_coefficient: np.ndarray(self.num_sequence * self.n_prediction_steps) + inverse of the mase_denominator + """ + mase_coefficient = np.ones([len(self.datamanager.datasets), self.num_targets]) + if any(mase_loss in self.additional_metrics for mase_loss in MASE_LOSSES) or self.metric in MASE_LOSSES: + for seq_idx, test_idx in enumerate(self.datamanager.datasets): + mase_coefficient[seq_idx] = self.datamanager.datasets[seq_idx].mase_coefficient + mase_coefficient = np.repeat(mase_coefficient, self.n_prediction_steps, axis=0) + return mase_coefficient + def create_validation_sub_set(self, test_indices: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]: num_test_instances = len(test_indices) diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 998b98f32..8a7f7b857 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -272,10 +272,14 @@ def __init__(self, # if suggested_init_models is an empty list, and custom_init_setting_path is not provided, we # do not provide any initial configurations if suggested_init_models is None or suggested_init_models or custom_init_setting_path is not None: + datamanager = self.backend.load_datamanager() + dataset_properties = datamanager.get_dataset_properties([]) initial_configurations = read_forecasting_init_configurations( config_space=config_space, suggested_init_models=suggested_init_models, - custom_init_setting_path=custom_init_setting_path) + custom_init_setting_path=custom_init_setting_path, + dataset_properties=dataset_properties + ) # proxy-validation sets self.min_num_test_instances = kwargs.get('min_num_test_instances', None) else: diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index c2e753ba1..31a9b3e00 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -35,10 +35,14 @@ def read_return_initial_configurations( def read_forecasting_init_configurations(config_space: ConfigurationSpace, suggested_init_models: Optional[List[str]] = None, custom_init_setting_path: Optional[str] = None, + 
dataset_properties: Dict = {} ): forecasting_init_path = os.path.join(os.path.dirname(__file__), '../configs/forecasting_init_cfgs.json') initial_configurations_dict: List[Dict] = list() initial_configurations = [] + uni_variant = dataset_properties.get('uni_variant', True) + targets_have_missing_values = dataset_properties.get('targets_have_missing_values', False) + features_have_missing_values = dataset_properties.get('features_have_missing_values', False) if suggested_init_models or suggested_init_models is None: with open(forecasting_init_path, 'r') as f: @@ -63,6 +67,13 @@ def read_forecasting_init_configurations(config_space: ConfigurationSpace, cfg_tmp['data_loader:window_size'] = window_size cfg_tmp.update(model_cfg) + if not uni_variant: + cfg_tmp.update(forecasting_init_dict['feature_preprocessing']) + if features_have_missing_values: + cfg_tmp.update(forecasting_init_dict['feature_imputer']) + if targets_have_missing_values: + cfg_tmp.update(forecasting_init_dict['target_imputer']) + initial_configurations_dict.append(cfg_tmp) if custom_init_setting_path is not None: @@ -83,7 +94,6 @@ def read_forecasting_init_configurations(config_space: ConfigurationSpace, configuration = Configuration(config_space, configuration_dict) initial_configurations.append(configuration) except Exception as e: - continue warnings.warn(f"Failed to convert {configuration_dict} into" f" a Configuration with error {e}. " f"Therefore, it can't be used as an initial " diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 6573d4c2a..5e5beefdc 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -360,50 +360,58 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd if isinstance(X, TimeSeriesSequence): X = [X] if isinstance(X, List): - if self.dataset_small_preprocess and not self._is_uni_variant: - num_sequences = len(X) - sequence_lengths = [0] * num_sequences + num_sequences = len(X) + sequence_lengths = [0] * num_sequences + for seq_idx, x_seq in enumerate(X): + sequence_lengths[seq_idx] = len(x_seq.X) + series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) + + if len(self.known_future_features_index) > 0: + sequence_lengths_test = [0] * num_sequences for seq_idx, x_seq in enumerate(X): - sequence_lengths[seq_idx] = len(x_seq.X) + sequence_lengths_test[seq_idx] = len(x_seq.X_test) + series_number_test = np.arange(len(sequence_lengths_test)).repeat(sequence_lengths_test) + + if self.dataset_small_preprocess and not self._is_uni_variant and not X[0].is_pre_processed: x_all = pd.DataFrame(np.concatenate([x_seq.X for x_seq in X]), columns=self.dataset_columns) - series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) x_all.index = series_number if self.dataset_small_preprocess and self.feature_preprocessor is not None: self.feature_preprocessor = self.feature_preprocessor.fit(x_all) - x_all = self.feature_preprocessor.transform(x_all) + x_all = self.feature_preprocessor.transform(x_all.copy()) x_all = pd.DataFrame(x_all) x_all.index = series_number - x_all = x_all.groupby(x_all.index) - if len(self.known_future_features_index) > 0: - sequence_lengths_test = [0] * num_sequences - for seq_idx, x_seq in enumerate(X): - 
sequence_lengths_test[seq_idx] = len(x_seq.X_test) - x_all_test = pd.DataFrame(np.concatenate([x_seq.X_test for x_seq in X]), columns=self.dataset_columns) - series_number_test = np.arange(len(sequence_lengths_test)).repeat(sequence_lengths_test) - x_all_test.index = series_number_test if self.dataset_small_preprocess and self.feature_preprocessor is not None: - x_all_test = self.feature_preprocessor.transform(x_all_test) + x_all_test = self.feature_preprocessor.transform(x_all_test.copy()) x_all_test = pd.DataFrame(x_all_test) x_all_test.index = series_number_test - x_all_test = x_all_test.groupby(x_all_test.index) + else: + x_all = pd.DataFrame(np.concatenate([x_seq.X for x_seq in X])) + x_all.index = series_number + if len(self.known_future_features_index) > 0: + x_all_test = pd.DataFrame(np.concatenate([x_seq.X_test for x_seq in X])) + x_all_test.index = series_number_test + + x_all = x_all.groupby(x_all.index) + x_all_test = x_all_test.groupby(x_all_test.index) for i, x_seq in enumerate(X): if not isinstance(x_seq, TimeSeriesSequence): raise NotImplementedError('Test Set must be a TimeSeriesSequence or a' ' list of time series objects!') + x_seq.is_pre_processed = True if x_seq.freq != self.freq: # WE need to recompute the cached time features (However, this should not happen) x_seq._cached_time_features = None diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 4346ff2b6..1ec93185b 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -25,6 +25,7 @@ from autoPyTorch.api.tabular_classification import TabularClassificationTask from autoPyTorch.api.tabular_regression import TabularRegressionTask +from autoPyTorch.api.time_series_forecasting import TimeSeriesForecastingTask from autoPyTorch.datasets.base_dataset import BaseDataset from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, @@ -405,6 +406,169 @@ def test_tabular_regression(openml_name, resampling_strategy, backend, resamplin assert 'Estimator' in representation +@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_train_function', new=dummy_eval_train_function) +@pytest.mark.parametrize('forecasting_toy_dataset', ['multi_variant_wo_missing'], indirect=True) +@pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', + (#(HoldoutValTypes.time_series_hold_out_validation, None), + (CrossValTypes.time_series_cross_validation, {'num_splits': CV_NUM_SPLITS}), + )) +def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, backend, resampling_strategy_args): + forecast_horizon = 3 + freq = '1Y' + X, Y = forecasting_toy_dataset + + X_train = [] + X_test = [] + + for x in X: + if hasattr(x, 'iloc'): + X_train.append(x.iloc[:-forecast_horizon].copy()) + X_test.append(x.iloc[-forecast_horizon:].copy()) + else: + X_train.append(x[:-forecast_horizon].copy()) + X_test.append(x[-forecast_horizon:].copy()) + known_future_features = tuple(X[0].columns) if isinstance(X[0], pd.DataFrame) else \ + np.arange(X[0].shape[-1]).tolist() + + y_train = [] + y_test = [] + + for y in Y: + if hasattr(y, 'iloc'): + y_train.append(y.iloc[:-forecast_horizon].copy()) + y_test.append(y.iloc[-forecast_horizon:].copy()) + else: + y_train.append(y[:-forecast_horizon].copy()) + y_test.append(y[-forecast_horizon:].copy()) + + # Search for a good configuration + estimator = TimeSeriesForecastingTask( + backend=backend, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + seed=42, + ) + + with 
unittest.mock.patch.object(estimator, '_do_dummy_prediction', new=dummy_do_dummy_prediction): + estimator.search( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + optimize_metric='mean_MASE_forecasting', + n_prediction_steps=forecast_horizon, + freq=freq, + total_walltime_limit=50, + func_eval_time_limit_secs=20, + known_future_features=known_future_features, + ) + + # Internal dataset has expected settings + assert estimator.dataset.task_type == 'time_series_forecasting' + expected_num_splits = HOLDOUT_NUM_SPLITS if resampling_strategy ==\ + HoldoutValTypes.time_series_hold_out_validation\ + else CV_NUM_SPLITS + assert estimator.resampling_strategy == resampling_strategy + assert estimator.dataset.resampling_strategy == resampling_strategy + + assert len(estimator.dataset.splits) == expected_num_splits + + # Check for the created files + tmp_dir = estimator._backend.temporary_directory + loaded_datamanager = estimator._backend.load_datamanager() + assert len(loaded_datamanager.train_tensors) == len(estimator.dataset.train_tensors) + + expected_files = [ + 'smac3-output/run_42/configspace.json', + 'smac3-output/run_42/runhistory.json', + 'smac3-output/run_42/scenario.txt', + 'smac3-output/run_42/stats.json', + 'smac3-output/run_42/train_insts.txt', + 'smac3-output/run_42/trajectory.json', + '.autoPyTorch/datamanager.pkl', + '.autoPyTorch/ensemble_read_preds.pkl', + '.autoPyTorch/start_time_42', + '.autoPyTorch/ensemble_history.json', + '.autoPyTorch/ensemble_read_losses.pkl', + '.autoPyTorch/true_targets_ensemble.npy', + ] + for expected_file in expected_files: + assert os.path.exists(os.path.join(tmp_dir, expected_file)), expected_file + + # Check that smac was able to find proper models + succesful_runs = [run_value.status for run_value in estimator.run_history.data.values( + ) if 'SUCCESS' in str(run_value.status)] + assert len(succesful_runs) >= 1, [(k, v) for k, v in estimator.run_history.data.items()] + + # Search for an existing run key in disc. 
A individual model might have + # a timeout and hence was not written to disc + successful_num_run = None + SUCCESS = False + for i, (run_key, value) in enumerate(estimator.run_history.data.items()): + if 'SUCCESS' in str(value.status): + run_key_model_run_dir = estimator._backend.get_numrun_directory( + estimator.seed, run_key.config_id + 1, run_key.budget) + successful_num_run = run_key.config_id + 1 + if os.path.exists(run_key_model_run_dir): + # Runkey config id is different from the num_run + # more specifically num_run = config_id + 1(dummy) + SUCCESS = True + break + + assert SUCCESS, f"Successful run was not properly saved for num_run: {successful_num_run}" + + if resampling_strategy == HoldoutValTypes.time_series_hold_out_validation: + model_file = os.path.join(run_key_model_run_dir, + f"{estimator.seed}.{successful_num_run}.{run_key.budget}.model") + assert os.path.exists(model_file), model_file + model = estimator._backend.load_model_by_seed_and_id_and_budget( + estimator.seed, successful_num_run, run_key.budget) + elif resampling_strategy == CrossValTypes.time_series_cross_validation: + model_file = os.path.join( + run_key_model_run_dir, + f"{estimator.seed}.{successful_num_run}.{run_key.budget}.cv_model" + ) + assert os.path.exists(model_file), model_file + model = estimator._backend.load_cv_model_by_seed_and_id_and_budget( + estimator.seed, successful_num_run, run_key.budget) + assert isinstance(model, VotingRegressor) + assert len(model.estimators_) == CV_NUM_SPLITS + else: + pytest.fail(resampling_strategy) + + # Make sure that predictions on the test data are printed and make sense + test_prediction = os.path.join(run_key_model_run_dir, + estimator._backend.get_prediction_filename( + 'test', estimator.seed, successful_num_run, + run_key.budget)) + assert os.path.exists(test_prediction), test_prediction + assert np.shape(np.load(test_prediction, allow_pickle=True))[0] == forecast_horizon * np.shape(X_test)[0] + + # Also, for ensemble builder, the OOF predictions should be there and match + # the Ground truth that is also physically printed to disk + ensemble_prediction = os.path.join(run_key_model_run_dir, + estimator._backend.get_prediction_filename( + 'ensemble', + estimator.seed, successful_num_run, + run_key.budget)) + assert os.path.exists(ensemble_prediction), ensemble_prediction + assert np.shape(np.load(ensemble_prediction, allow_pickle=True))[0] == np.shape( + estimator._backend.load_targets_ensemble() + )[0] + + # Ensemble Builder produced an ensemble + estimator.ensemble_ is not None + + # There should be a weight for each element of the ensemble + assert len(estimator.ensemble_.identifiers_) == len(estimator.ensemble_.weights_) + + X_test = backend.load_datamanager().generate_test_seqs() + + y_pred = estimator.predict(X_test) + + assert np.shape(y_pred) == np.shape(y_test) + + @pytest.mark.parametrize('openml_id', ( 1590, # Adult to test NaN in categorical columns )) @@ -930,3 +1094,5 @@ def test_task_inference(ans, task_class, backend): estimator.get_dataset(X, y) else: estimator.get_dataset(X, y) + + diff --git a/test/test_api/test_base_api.py b/test/test_api/test_base_api.py index f487ad5ea..7a9a972f0 100644 --- a/test/test_api/test_base_api.py +++ b/test/test_api/test_base_api.py @@ -160,3 +160,44 @@ def test_no_resampling_error(backend): seed=42, ensemble_size=1 ) + + +@pytest.mark.parametrize("fit_dictionary_forecasting", ['uni_variant_wo_missing'], indirect=True) +@pytest.mark.parametrize( + "min_budget,max_budget,budget_type,expected", [ + (5, 75, 
'epochs', {'budget_type': 'epochs', 'epochs': 75}), + (0.01, 1.0, 'resolution', {'budget_type': 'resolution', 'resolution': 1.0}), + (0.01, 1.0, 'num_seq', {'budget_type': 'num_seq', 'num_seq': 1.0}), + (0.01, 1.0, 'num_sample_per_seq', {'budget_type': 'num_sample_per_seq', 'num_sample_per_seq': 1.0}), + ]) +def test_pipeline_get_budget(fit_dictionary_forecasting, min_budget, max_budget, budget_type, expected): + BaseTask.__abstractmethods__ = set() + estimator = BaseTask(task_type='time_series_forecasting', ensemble_size=0) + # Fixture pipeline config + default_pipeline_config = { + 'device': 'cpu', 'budget_type': 'epochs', 'epochs': 50, 'runtime': 3600, + 'torch_num_threads': 1, 'early_stopping': 20, 'use_tensorboard_logger': False, + 'metrics_during_training': True, 'optimize_metric': 'mean_MASE_forecasting' + } + default_pipeline_config.update(expected) + + # Create pre-requisites + dataset = fit_dictionary_forecasting['backend'].load_datamanager() + pipeline_fit = unittest.mock.Mock() + + smac = unittest.mock.Mock() + smac.solver.runhistory = RunHistory() + smac.solver.intensifier.traj_logger.trajectory = [] + smac.solver.tae_runner = unittest.mock.Mock(spec=SerialRunner) + smac.solver.tae_runner.budget_type = 'epochs' + with unittest.mock.patch('autoPyTorch.optimizer.smbo.get_smac_object') as smac_mock: + smac_mock.return_value = smac + estimator._search(optimize_metric='mean_MASE_forecasting', dataset=dataset, tae_func=pipeline_fit, + min_budget=min_budget, max_budget=max_budget, budget_type=budget_type, + enable_traditional_pipeline=False, + total_walltime_limit=20, func_eval_time_limit_secs=10, + memory_limit=8192, + load_models=False) + assert list(smac_mock.call_args)[1]['ta_kwargs']['pipeline_config'] == default_pipeline_config + assert list(smac_mock.call_args)[1]['max_budget'] == max_budget + assert list(smac_mock.call_args)[1]['initial_budget'] == min_budget diff --git a/test/test_datasets/test_time_series_datasets.py b/test/test_datasets/test_time_series_datasets.py index 95504d5fe..f130a9c23 100644 --- a/test/test_datasets/test_time_series_datasets.py +++ b/test/test_datasets/test_time_series_datasets.py @@ -202,6 +202,14 @@ def test_multi_to_test_set(self): seq_2 = self.seq_multi_with_future.get_val_seq_set(6) self.assertEqual(len(seq_2), 6 + 1) + def test_get_target_values(self): + last_visible_target = self.seq_uni.get_target_values(-1) + self.assertEqual(last_visible_target, self.seq_uni[-1][0]['past_targets'][-1].numpy()) + + self.seq_uni.is_test_set = True + last_visible_target = self.seq_uni.get_target_values(-1) + self.assertEqual(last_visible_target, self.seq_uni[-1][0]['past_targets'][-1].numpy()) + def test_transformation(self): self.seq_multi.update_transform(ZeroTransformer(), train=True) data, _ = self.seq_multi[-1] diff --git a/test/test_evaluation/evaluation_util.py b/test/test_evaluation/evaluation_util.py index 71c6844f5..afbe7648f 100644 --- a/test/test_evaluation/evaluation_util.py +++ b/test/test_evaluation/evaluation_util.py @@ -246,7 +246,7 @@ def get_500_classes_datamanager(resampling_strategy=HoldoutValTypes.holdout_vali def get_forecasting_dataset(n_seq=10, - n_prediction_steps=5, + n_prediction_steps=3, resampling_strategy=HoldoutValTypes.time_series_hold_out_validation): base_length = 50 X = [] diff --git a/test/test_evaluation/test_forecasting_evaluators.py b/test/test_evaluation/test_forecasting_evaluators.py index fd55abd7c..44b39e04b 100644 --- a/test/test_evaluation/test_forecasting_evaluators.py +++ 
b/test/test_evaluation/test_forecasting_evaluators.py @@ -8,7 +8,7 @@ from ConfigSpace import Configuration import numpy as np - +from smac.tae import StatusType from autoPyTorch.automl_common.common.utils.backend import create from autoPyTorch.datasets.resampling_strategy import CrossValTypes @@ -112,7 +112,7 @@ def test_holdout(self, pipeline_mock): self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(evaluator.file_output.call_count, 1) - self.assertEqual(result, 4592.0) + self.assertAlmostEqual(result, 4591.5, places=4) self.assertEqual(pipeline_mock.fit.call_count, 1) # As forecasting inference could be quite expensive, we only allow one opt prediction and test prediction self.assertEqual(pipeline_mock.predict.call_count, 2) @@ -164,7 +164,7 @@ def test_cv(self, pipeline_mock): self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) self.assertEqual(evaluator.file_output.call_count, 1) - self.assertAlmostEqual(result, 4587.208333333334) + self.assertAlmostEqual(result, 4590.06977, places=4) self.assertEqual(pipeline_mock.fit.call_count, 3) # 3 calls because of the 3 times validation evaluations, however, we only evaluate test target once self.assertEqual(pipeline_mock.predict.call_count, 4) @@ -183,7 +183,7 @@ def test_cv(self, pipeline_mock): @unittest.mock.patch('autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline') def test_proxy_val_set(self, pipeline_mock): pipeline_mock.fit_dictionary = {'budget_type': 'epochs', 'epochs': 0.1} - D = get_forecasting_dataset() + D = get_forecasting_dataset(n_prediction_steps=5) n_prediction_steps = D.n_prediction_steps pipeline_mock.predict.side_effect = \ lambda X, batch_size=None: np.tile([0.], (len(X), n_prediction_steps)) @@ -211,10 +211,11 @@ def test_proxy_val_set(self, pipeline_mock): self.assertEqual(len(rval), 1) result = rval[0]['loss'] - self.assertEqual(result, 925.2) + self.assertAlmostEqual(result, 925.2, places=4) res = evaluator.file_output.call_args[0][0].reshape(-1, n_prediction_steps, evaluator.num_targets) n_evaluated_pip_mock = 0 + val_split = D.splits[0][1] for i_seq, seq_output in enumerate(res): if i_seq % 3 == 0 and n_evaluated_pip_mock < 3: @@ -222,4 +223,57 @@ def test_proxy_val_set(self, pipeline_mock): assert np.all(seq_output == 0.) 
else: # predict with dummy predictor - assert np.all(seq_output == D.datasets[i_seq][-1][0]['past_targets'][-1].numpy()) + assert np.all(seq_output == D.get_validation_set(val_split[i_seq])[-1][0]['past_targets'][-1].numpy()) + + @unittest.mock.patch('autoPyTorch.pipeline.time_series_forecasting.TimeSeriesForecastingPipeline') + @unittest.mock.patch('multiprocessing.Queue', ) + def test_finish_up(self, pipeline_mock, queue_mock): + pipeline_mock.fit_dictionary = {'budget_type': 'epochs', 'epochs': 50} + + rs = np.random.RandomState(1) + D = get_forecasting_dataset(n_prediction_steps=3) + + n_prediction_steps = D.n_prediction_steps + + pipeline_mock.predict.side_effect = \ + lambda X, batch_size=None: np.tile([0.], (len(X), n_prediction_steps)) + + pipeline_mock.side_effect = lambda **kwargs: pipeline_mock + pipeline_mock.get_additional_run_info.return_value = None + + configuration = unittest.mock.Mock(spec=Configuration) + backend_api = create(self.tmp_dir, self.output_dir, prefix='autoPyTorch') + backend_api.load_datamanager = lambda: D + queue_ = multiprocessing.Queue() + + ae = TimeSeriesForecastingTrainEvaluator(backend_api, + queue_mock, + configuration=configuration, + metric=mean_MASE_forecasting, budget=0.3, + pipeline_config={'budget_type': 'epochs', 'epochs': 50}, + min_num_test_instances=1) + + val_splits = D.splits[0][1] + mase_val = ae.generate_mase_coefficient_for_validation(val_splits) + + ae.Y_optimization = rs.rand(len(val_splits) * n_prediction_steps, D.num_targets) * mase_val + predictions_ensemble = rs.rand(len(val_splits) * n_prediction_steps, D.num_targets) * mase_val + predictions_test = rs.rand(len(D.datasets) * n_prediction_steps, D.num_targets) + + metric_kwargs = {'sp': ae.seasonality, + 'n_prediction_steps': ae.n_prediction_steps, + 'mase_coefficient': ae.generate_mase_coefficient_for_test_set()} + + # NaNs in prediction ensemble + ae.finish_up( + loss={'mean_MASE_forecasting': 0.1}, + train_loss=None, + opt_pred=predictions_ensemble, + valid_pred=None, + test_pred=predictions_test, + additional_run_info=None, + file_output=True, + status=StatusType.SUCCESS, + **metric_kwargs + ) + self.assertTrue('test_loss' in queue_mock.put.call_args.args[0]['additional_run_info']) From c717faea8981ce43fd03521f479744f5d0404578 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 24 May 2022 12:23:02 +0200 Subject: [PATCH 284/347] maint --- autoPyTorch/configs/forecasting_init_cfgs.json | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index f423b912a..685ce35c2 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -84,7 +84,7 @@ "network_backbone:seq_encoder:use_temporal_fusion": false, "network_backbone:seq_encoder:variable_selection": false, "network_backbone:seq_encoder:block_1:__choice__": "RNNEncoder", - "network_backbone:seq_encoder:decoder_auto_regressive": true, + "network_backbone:seq_encoder:decoder_auto_regressive": false, "network_backbone:seq_encoder:block_1:RNNEncoder:cell_type": "gru", "network_backbone:seq_encoder:block_1:RNNEncoder:num_layers": 1, "network_backbone:seq_encoder:block_1:RNNEncoder:hidden_size": 50, @@ -108,7 +108,7 @@ "network_backbone:seq_encoder:use_temporal_fusion": false, "network_backbone:seq_encoder:variable_selection": false, "network_backbone:seq_encoder:block_1:__choice__": "TCNEncoder", - "network_backbone:seq_encoder:decoder_auto_regressive": true, + 
"network_backbone:seq_encoder:decoder_auto_regressive": false, "network_backbone:seq_encoder:block_1:TCNEncoder:use_dropout": false, "network_backbone:seq_encoder:block_1:TCNEncoder:num_blocks": 3, "network_backbone:seq_encoder:block_1:TCNEncoder:num_filters_1": 30, @@ -154,6 +154,7 @@ "network_backbone:seq_encoder:variable_selection": false, "network_backbone:seq_encoder:decoder_auto_regressive": true, "network_backbone:seq_encoder:block_1:__choice__": "TransformerEncoder", + "network_backbone:seq_encoder:block_1:TransformerEncoder:norm_first": true, "network_backbone:seq_encoder:block_1:TransformerEncoder:d_model_log": 5, "network_backbone:seq_encoder:block_1:TransformerEncoder:activation": "gelu", "network_backbone:seq_encoder:block_1:TransformerEncoder:num_layers": 1, @@ -167,6 +168,7 @@ "network_backbone:seq_encoder:block_1:TransformerEncoder:dropout": 0.1, "network_backbone:seq_encoder:block_1:TransformerEncoder:use_layer_norm_output": true, "network_backbone:seq_encoder:block_1:TransformerEncoder:layer_norm_eps_output": 1e-05, + "network_backbone:seq_encoder:block_1:TransformerDecoder:norm_first": true, "network_backbone:seq_encoder:block_1:TransformerDecoder:activation": "gelu", "network_backbone:seq_encoder:block_1:TransformerDecoder:num_layers": 1, "network_backbone:seq_encoder:block_1:TransformerDecoder:use_dropout": true, @@ -187,7 +189,7 @@ "loss:RegressionLoss:loss_name": "mase", "network_backbone:__choice__": "flat_encoder", "network_backbone:flat_encoder:__choice__": "NBEATSEncoder", - "network_backbone:flat_encoder:NBEATSDecoder:backcast_loss_ratio": 0.0, + "network_backbone:flat_encoder:NBEATSDecoder:backcast_loss_ration": 0.0, "network_backbone:flat_encoder:NBEATSDecoder:normalization": "NoNorm", "network_backbone:flat_encoder:NBEATSDecoder:activation": "relu", "network_backbone:flat_encoder:NBEATSDecoder:n_beats_type": "I", @@ -213,7 +215,7 @@ "loss:RegressionLoss:loss_name": "mape", "network_backbone:__choice__": "flat_encoder", "network_backbone:flat_encoder:__choice__": "NBEATSEncoder", - "network_backbone:flat_encoder:NBEATSDecoder:backcast_loss_ratio": 0.0, + "network_backbone:flat_encoder:NBEATSDecoder:backcast_loss_ration": 0.0, "network_backbone:flat_encoder:NBEATSDecoder:normalization": "NoNorm", "network_backbone:flat_encoder:NBEATSDecoder:activation": "relu", "network_backbone:flat_encoder:NBEATSDecoder:n_beats_type": "G", From 78d7a515ad2461c31be501c0ed022e963f0cc3af Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 24 May 2022 12:58:03 +0200 Subject: [PATCH 285/347] fix dataloader --- .../time_series_forecasting_data_loader.py | 70 ++++++++++--------- .../training/test_time_series_data_loader.py | 16 +++-- 2 files changed, 48 insertions(+), 38 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 5e5beefdc..acc3372a0 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -360,52 +360,54 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd if isinstance(X, TimeSeriesSequence): X = [X] if isinstance(X, List): - num_sequences = len(X) - sequence_lengths = [0] * num_sequences - for seq_idx, x_seq in enumerate(X): - sequence_lengths[seq_idx] = len(x_seq.X) - series_number = 
np.arange(len(sequence_lengths)).repeat(sequence_lengths) - - if len(self.known_future_features_index) > 0: - sequence_lengths_test = [0] * num_sequences + if self.dataset_small_preprocess and not self._is_uni_variant: + + num_sequences = len(X) + sequence_lengths = [0] * num_sequences for seq_idx, x_seq in enumerate(X): - sequence_lengths_test[seq_idx] = len(x_seq.X_test) - series_number_test = np.arange(len(sequence_lengths_test)).repeat(sequence_lengths_test) + sequence_lengths[seq_idx] = len(x_seq.X) + series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) - if self.dataset_small_preprocess and not self._is_uni_variant and not X[0].is_pre_processed: + if len(self.known_future_features_index) > 0: + sequence_lengths_test = [0] * num_sequences + for seq_idx, x_seq in enumerate(X): + sequence_lengths_test[seq_idx] = len(x_seq.X_test) + series_number_test = np.arange(len(sequence_lengths_test)).repeat(sequence_lengths_test) - x_all = pd.DataFrame(np.concatenate([x_seq.X for x_seq in X]), columns=self.dataset_columns) + if not X[0].is_pre_processed: - x_all.index = series_number + x_all = pd.DataFrame(np.concatenate([x_seq.X for x_seq in X]), columns=self.dataset_columns) - if self.dataset_small_preprocess and self.feature_preprocessor is not None: - self.feature_preprocessor = self.feature_preprocessor.fit(x_all) - x_all = self.feature_preprocessor.transform(x_all.copy()) + x_all.index = series_number - x_all = pd.DataFrame(x_all) - x_all.index = series_number + if self.dataset_small_preprocess and self.feature_preprocessor is not None: + self.feature_preprocessor = self.feature_preprocessor.fit(x_all) + x_all = self.feature_preprocessor.transform(x_all.copy()) - if len(self.known_future_features_index) > 0: - x_all_test = pd.DataFrame(np.concatenate([x_seq.X_test for x_seq in X]), - columns=self.dataset_columns) + x_all = pd.DataFrame(x_all) + x_all.index = series_number - x_all_test.index = series_number_test + if len(self.known_future_features_index) > 0: + x_all_test = pd.DataFrame(np.concatenate([x_seq.X_test for x_seq in X]), + columns=self.dataset_columns) - if self.dataset_small_preprocess and self.feature_preprocessor is not None: - x_all_test = self.feature_preprocessor.transform(x_all_test.copy()) + x_all_test.index = series_number_test - x_all_test = pd.DataFrame(x_all_test) - x_all_test.index = series_number_test + if self.dataset_small_preprocess and self.feature_preprocessor is not None: + x_all_test = self.feature_preprocessor.transform(x_all_test.copy()) - else: - x_all = pd.DataFrame(np.concatenate([x_seq.X for x_seq in X])) - x_all.index = series_number - if len(self.known_future_features_index) > 0: - x_all_test = pd.DataFrame(np.concatenate([x_seq.X_test for x_seq in X])) - x_all_test.index = series_number_test + x_all_test = pd.DataFrame(x_all_test) + x_all_test.index = series_number_test + + else: + x_all = pd.DataFrame(np.concatenate([x_seq.X for x_seq in X])) + x_all.index = series_number + if len(self.known_future_features_index) > 0: + x_all_test = pd.DataFrame(np.concatenate([x_seq.X_test for x_seq in X])) + x_all_test.index = series_number_test - x_all = x_all.groupby(x_all.index) - x_all_test = x_all_test.groupby(x_all_test.index) + x_all = x_all.groupby(x_all.index) + x_all_test = x_all_test.groupby(x_all_test.index) for i, x_seq in enumerate(X): if not isinstance(x_seq, TimeSeriesSequence): diff --git a/test/test_pipeline/components/training/test_time_series_data_loader.py 
b/test/test_pipeline/components/training/test_time_series_data_loader.py index 5ac84c23b..d109be5df 100644 --- a/test/test_pipeline/components/training/test_time_series_data_loader.py +++ b/test/test_pipeline/components/training/test_time_series_data_loader.py @@ -26,7 +26,7 @@ ) -class TestTimeSeriesForecastingDataSets(unittest.TestCase): +class TestTimeSeriesForecastingDataLoader(unittest.TestCase): def setUp(self) -> None: feature_names = ['f1'] feature_shapes = {'f1': 1} @@ -328,7 +328,7 @@ def test_get_loader(self, loader_init_mock): x_test = TimeSeriesSequence(X=np.array([1, 2, 3, 4, 5]), Y=np.array([1, 2, 3, 4, 5]), X_test=np.array([1, 2, 3])) - test_loader = time_series_dataloader.get_loader(X=x_test) + test_loader = time_series_dataloader.get_loader(X=copy.deepcopy(x_test)) self.assertIsInstance(test_loader, torch.utils.data.DataLoader) self.assertIsInstance(test_loader.dataset, TestSequenceDataset) test_set = loader_init_mock.call_args[0][0] @@ -336,7 +336,7 @@ def test_get_loader(self, loader_init_mock): self.assertEqual(len(test_set), 1) x_test = [x_test, x_test] - _ = time_series_dataloader.get_loader(X=x_test) + _ = time_series_dataloader.get_loader(X=copy.deepcopy(x_test)) test_set = loader_init_mock.call_args[0][0] self.assertEqual(len(test_set), len(x_test)) @@ -354,7 +354,15 @@ def transform(self, data: pd.DataFrame): transform = DummyEncoder() time_series_dataloader.feature_preprocessor = transform - _ = time_series_dataloader.get_loader(X=copy.deepcopy(x_test)) + x_test_copy = copy.deepcopy(x_test) + _ = time_series_dataloader.get_loader(X=x_test_copy) + + test_set = loader_init_mock.call_args[0][0] + for seq_raw, seq in zip(x_test, test_set): + self.assertTrue(seq.X.shape[-1] == 2 * seq_raw.X.shape[-1]) + + # ensure that we do not transform the dataset twice + _ = time_series_dataloader.get_loader(X=x_test_copy) test_set = loader_init_mock.call_args[0][0] for seq_raw, seq in zip(x_test, test_set): self.assertTrue(seq.X.shape[-1] == 2 * seq_raw.X.shape[-1]) From fa5cc757d77a88284db08de333f4dd83519c96fb Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 24 May 2022 13:31:54 +0200 Subject: [PATCH 286/347] move test with for loop to unittest.subtest --- .../test_base_components.py | 305 +++++++++--------- .../test_flat_backbones.py | 100 +++--- .../forecasting_networks/test_seq_encoder.py | 250 +++++++------- 3 files changed, 333 insertions(+), 322 deletions(-) diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py index e3162c13d..f77877f74 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py @@ -187,127 +187,136 @@ def test_base_encoder(self): variable_selection = hp_values[1] transform_time_features = hp_values[2] is_small_preprocess = hp_values[3] + with self.subTest(uni_variant=uni_variant, + variable_selection=variable_selection, + transform_time_features=transform_time_features, + is_small_preprocess=is_small_preprocess): + network_structure = NetworkStructure(variable_selection=variable_selection) - network_structure = NetworkStructure(variable_selection=variable_selection) + dataset_properties = copy.copy(self.dataset_properties) + fit_dictionary = copy.copy(self.fit_dictionary) - dataset_properties = copy.copy(self.dataset_properties) - fit_dictionary = 
copy.copy(self.fit_dictionary) + dataset_properties['is_small_preprocess'] = is_small_preprocess + dataset_properties['uni_variant'] = uni_variant - dataset_properties['is_small_preprocess'] = is_small_preprocess - dataset_properties['uni_variant'] = uni_variant + fit_dictionary['dataset_properties'] = self.dataset_properties + fit_dictionary['network_structure'] = network_structure + fit_dictionary['transform_time_features'] = transform_time_features + fit_dictionary['dataset_properties'] = dataset_properties - fit_dictionary['dataset_properties'] = self.dataset_properties - fit_dictionary['network_structure'] = network_structure - fit_dictionary['transform_time_features'] = transform_time_features - fit_dictionary['dataset_properties'] = dataset_properties + encoder_block_1 = copy.deepcopy(self.encoder) - encoder_block_1 = copy.deepcopy(self.encoder) + encoder_block_2 = copy.deepcopy(self.encoder) + encoder_block_2.block_number = 2 - encoder_block_2 = copy.deepcopy(self.encoder) - encoder_block_2.block_number = 2 + encoder_block_1 = encoder_block_1.fit(fit_dictionary) + fit_dictionary = encoder_block_1.transform(fit_dictionary) + network_encoder = fit_dictionary['network_encoder'] + self.assertIsInstance(network_encoder['block_1'], EncoderBlockInfo) + self.assertEqual(network_encoder['block_1'].encoder_output_shape, (1, 10)) - encoder_block_1 = encoder_block_1.fit(fit_dictionary) - fit_dictionary = encoder_block_1.transform(fit_dictionary) - network_encoder = fit_dictionary['network_encoder'] - self.assertIsInstance(network_encoder['block_1'], EncoderBlockInfo) - self.assertEqual(network_encoder['block_1'].encoder_output_shape, (1, 10)) - - if variable_selection: - self.assertEqual(network_encoder['block_1'].encoder_input_shape, (window_size, 10)) - else: - if uni_variant: - n_input_features = 0 + if variable_selection: + self.assertEqual(network_encoder['block_1'].encoder_input_shape, (window_size, 10)) else: - if is_small_preprocess: - n_input_features = 40 + if uni_variant: + n_input_features = 0 else: - n_input_features = 15 + if is_small_preprocess: + n_input_features = 40 + else: + n_input_features = 15 - if transform_time_features: - n_input_features += len(dataset_properties['time_feature_transform']) + if transform_time_features: + n_input_features += len(dataset_properties['time_feature_transform']) - n_input_features += dataset_properties['output_shape'][-1] - self.assertEqual(network_encoder['block_1'].encoder_input_shape, (window_size, - n_input_features)) + n_input_features += dataset_properties['output_shape'][-1] + self.assertEqual(network_encoder['block_1'].encoder_input_shape, (window_size, + n_input_features)) - encoder_block_2 = encoder_block_2.fit(fit_dictionary) - fit_dictionary = encoder_block_2.transform(fit_dictionary) + encoder_block_2 = encoder_block_2.fit(fit_dictionary) + fit_dictionary = encoder_block_2.transform(fit_dictionary) - network_encoder = fit_dictionary['network_encoder'] - self.assertIsInstance(network_encoder['block_2'], EncoderBlockInfo) - self.assertEqual(network_encoder['block_2'].encoder_output_shape, (1, 10)) - self.assertEqual(network_encoder['block_2'].encoder_input_shape, (1, 10)) + network_encoder = fit_dictionary['network_encoder'] + self.assertIsInstance(network_encoder['block_2'], EncoderBlockInfo) + self.assertEqual(network_encoder['block_2'].encoder_output_shape, (1, 10)) + self.assertEqual(network_encoder['block_2'].encoder_input_shape, (1, 10)) def test_base_decoder(self): n_prediction_steps = 
self.dataset_properties['n_prediction_steps'] for variable_selection in (True, False): - network_structure = NetworkStructure(variable_selection=variable_selection, num_blocks=2) - dataset_properties = copy.copy(self.dataset_properties) - fit_dictionary = copy.copy(self.fit_dictionary) - - fit_dictionary['network_structure'] = network_structure - fit_dictionary['dataset_properties'] = dataset_properties - - encoder_block_1 = copy.deepcopy(self.encoder) - encoder_block_2 = copy.deepcopy(self.encoder) - encoder_block_2.block_number = 2 - - encoder_block_1 = encoder_block_1.fit(fit_dictionary) - fit_dictionary = encoder_block_1.transform(fit_dictionary) - encoder_block_2 = encoder_block_2.fit(fit_dictionary) - fit_dictionary = encoder_block_2.transform(fit_dictionary) - - decoder1 = copy.deepcopy(self.decoder_w_local) - decoder1 = decoder1.fit(fit_dictionary) - self.assertEqual(decoder1.n_prediction_heads, n_prediction_steps) - fit_dictionary = decoder1.transform(fit_dictionary) - - network_decoder = fit_dictionary['network_decoder'] - self.assertIsInstance(network_decoder['block_1'], DecoderBlockInfo) - if variable_selection: - self.assertEqual(network_decoder['block_1'].decoder_input_shape, - (n_prediction_steps, 10)) # Pure variable selection - self.assertEqual(network_decoder['block_1'].decoder_output_shape, - (n_prediction_steps, 26)) # 10 (input features) + 16 (n_output_dims) - else: - self.assertEqual(network_decoder['block_1'].decoder_input_shape, - (n_prediction_steps, 52)) # 50 (input features) + 2 (time_transforms) - self.assertEqual(network_decoder['block_1'].decoder_output_shape, - (n_prediction_steps, 68)) # 52 (input features) + 16 (n_out_dims) - - for name, decoder in self.decoders.items(): - fit_dictionary_ = copy.deepcopy(fit_dictionary) - decoder2 = copy.deepcopy(decoder) - decoder2.block_number = 2 - decoder2 = decoder2.fit(fit_dictionary_) - fit_dictionary_ = decoder2.transform(fit_dictionary_) - self.assertTrue(decoder2.is_last_decoder) - if name == 'ar': - self.assertEqual(fit_dictionary_['n_prediction_heads'], 1) - else: - self.assertEqual(fit_dictionary_['n_prediction_heads'], n_prediction_steps) - n_prediction_heads = fit_dictionary_['n_prediction_heads'] - - network_decoder = fit_dictionary_['network_decoder']['block_2'] - self.assertIsInstance(network_decoder, DecoderBlockInfo) + with self.subTest(variable_selection=variable_selection): + network_structure = NetworkStructure(variable_selection=variable_selection, num_blocks=2) + dataset_properties = copy.copy(self.dataset_properties) + fit_dictionary = copy.copy(self.fit_dictionary) + + fit_dictionary['network_structure'] = network_structure + fit_dictionary['dataset_properties'] = dataset_properties + + encoder_block_1 = copy.deepcopy(self.encoder) + encoder_block_2 = copy.deepcopy(self.encoder) + encoder_block_2.block_number = 2 + + encoder_block_1 = encoder_block_1.fit(fit_dictionary) + fit_dictionary = encoder_block_1.transform(fit_dictionary) + encoder_block_2 = encoder_block_2.fit(fit_dictionary) + fit_dictionary = encoder_block_2.transform(fit_dictionary) + + decoder1 = copy.deepcopy(self.decoder_w_local) + decoder1 = decoder1.fit(fit_dictionary) + self.assertEqual(decoder1.n_prediction_heads, n_prediction_steps) + fit_dictionary = decoder1.transform(fit_dictionary) + + network_decoder = fit_dictionary['network_decoder'] + self.assertIsInstance(network_decoder['block_1'], DecoderBlockInfo) if variable_selection: - self.assertEqual(network_decoder.decoder_input_shape, (n_prediction_heads, 26)) - - if name == 
'non_ar_w_local': - self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 42)) # 26+16 - elif name == 'non_ar_wo_local': - self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 32)) # num_global - elif name == 'ar': - self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 32)) # 32 + self.assertEqual(network_decoder['block_1'].decoder_input_shape, + (n_prediction_steps, 10)) # Pure variable selection + self.assertEqual(network_decoder['block_1'].decoder_output_shape, + (n_prediction_steps, 26)) # 10 (input features) + 16 (n_output_dims) else: - self.assertEqual(network_decoder.decoder_input_shape, (n_prediction_heads, 68)) - - if name == 'non_ar_w_local': - self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 84)) # 26+16 - elif name == 'non_ar_wo_local': - self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 32)) # num_global - elif name == 'ar': - self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 32)) # 32 + self.assertEqual(network_decoder['block_1'].decoder_input_shape, + (n_prediction_steps, 52)) # 50 (input features) + 2 (time_transforms) + self.assertEqual(network_decoder['block_1'].decoder_output_shape, + (n_prediction_steps, 68)) # 52 (input features) + 16 (n_out_dims) + + for name, decoder in self.decoders.items(): + with self.subTest(decoder_name=name): + fit_dictionary_ = copy.deepcopy(fit_dictionary) + decoder2 = copy.deepcopy(decoder) + decoder2.block_number = 2 + decoder2 = decoder2.fit(fit_dictionary_) + fit_dictionary_ = decoder2.transform(fit_dictionary_) + self.assertTrue(decoder2.is_last_decoder) + if name == 'ar': + self.assertEqual(fit_dictionary_['n_prediction_heads'], 1) + else: + self.assertEqual(fit_dictionary_['n_prediction_heads'], n_prediction_steps) + n_prediction_heads = fit_dictionary_['n_prediction_heads'] + + network_decoder = fit_dictionary_['network_decoder']['block_2'] + self.assertIsInstance(network_decoder, DecoderBlockInfo) + if variable_selection: + self.assertEqual(network_decoder.decoder_input_shape, (n_prediction_heads, 26)) + + if name == 'non_ar_w_local': + # 26+16 + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 42)) + elif name == 'non_ar_wo_local': + # num_global + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 32)) + elif name == 'ar': + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 32)) # 32 + else: + self.assertEqual(network_decoder.decoder_input_shape, (n_prediction_heads, 68)) + + if name == 'non_ar_w_local': + # 26+16 + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 84)) + elif name == 'non_ar_wo_local': + # num_global + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 32)) + elif name == 'ar': + self.assertEqual(network_decoder.decoder_output_shape, (n_prediction_heads, 32)) # 32 def test_forecasting_heads(self): variable_selection = False @@ -334,50 +343,50 @@ def test_forecasting_heads(self): quantiles = [0.5, 0.1, 0.9] for name, decoder in self.decoders.items(): - fit_dictionary_ = copy.deepcopy(fit_dictionary) - decoder = decoder.fit(fit_dictionary_) - fit_dictionary_ = decoder.transform(fit_dictionary_) - - for net_output_type in ['regression', 'distribution', 'quantile']: - - def eval_heads_output(fit_dict): - head = ForecastingHead() - head = head.fit(fit_dict) - fit_dictionary_copy = head.transform(fit_dict) - - encoder = 
fit_dictionary_copy['network_encoder']['block_1'].encoder - decoder = fit_dictionary_copy['network_decoder']['block_1'].decoder - - head = fit_dictionary_copy['network_head'] - output = head(decoder(input_tensor_future, encoder(input_tensor, output_seq=False))) - if name != "ar": - if net_output_type == 'regression': - self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) - elif net_output_type == 'distribution': - self.assertListEqual(list(output.sample().shape), [10, n_prediction_steps, 1]) - elif net_output_type == 'quantile': - self.assertEqual(len(output), len(quantiles)) - for output_quantile in output: - self.assertListEqual(list(output_quantile.shape), [10, n_prediction_steps, 1]) - else: - if net_output_type == 'regression': - self.assertListEqual(list(output.shape), [10, 1, 1]) - elif net_output_type == 'distribution': - self.assertListEqual(list(output.sample().shape), [10, 1, 1]) + with self.subTest(decoder_name=name): + fit_dictionary_ = copy.deepcopy(fit_dictionary) + decoder = decoder.fit(fit_dictionary_) + fit_dictionary_ = decoder.transform(fit_dictionary_) + + for net_output_type in ['regression', 'distribution', 'quantile']: + def eval_heads_output(fit_dict): + head = ForecastingHead() + head = head.fit(fit_dict) + fit_dictionary_copy = head.transform(fit_dict) + + encoder = fit_dictionary_copy['network_encoder']['block_1'].encoder + decoder = fit_dictionary_copy['network_decoder']['block_1'].decoder + + head = fit_dictionary_copy['network_head'] + output = head(decoder(input_tensor_future, encoder(input_tensor, output_seq=False))) + if name != "ar": + if net_output_type == 'regression': + self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) + elif net_output_type == 'distribution': + self.assertListEqual(list(output.sample().shape), [10, n_prediction_steps, 1]) + elif net_output_type == 'quantile': + self.assertEqual(len(output), len(quantiles)) + for output_quantile in output: + self.assertListEqual(list(output_quantile.shape), [10, n_prediction_steps, 1]) + else: + if net_output_type == 'regression': + self.assertListEqual(list(output.shape), [10, 1, 1]) + elif net_output_type == 'distribution': + self.assertListEqual(list(output.sample().shape), [10, 1, 1]) + elif net_output_type == 'quantile': + self.assertEqual(len(output), len(quantiles)) + for output_quantile in output: + self.assertListEqual(list(output_quantile.shape), [10, 1, 1]) + with self.subTest(net_output_type=net_output_type): + fit_dictionary_copy = copy.deepcopy(fit_dictionary_) + fit_dictionary_copy['net_output_type'] = net_output_type + + if net_output_type == 'distribution': + for dist in ALL_DISTRIBUTIONS.keys(): + fit_dictionary_copy['dist_forecasting_strategy'] = DisForecastingStrategy(dist_cls=dist) + eval_heads_output(fit_dictionary_copy) elif net_output_type == 'quantile': - self.assertEqual(len(output), len(quantiles)) - for output_quantile in output: - self.assertListEqual(list(output_quantile.shape), [10, 1, 1]) - - fit_dictionary_copy = copy.deepcopy(fit_dictionary_) - fit_dictionary_copy['net_output_type'] = net_output_type - - if net_output_type == 'distribution': - for dist in ALL_DISTRIBUTIONS.keys(): - fit_dictionary_copy['dist_forecasting_strategy'] = DisForecastingStrategy(dist_cls=dist) - eval_heads_output(fit_dictionary_copy) - elif net_output_type == 'quantile': - fit_dictionary_copy['quantile_values'] = quantiles - eval_heads_output(fit_dictionary_copy) - else: - eval_heads_output(fit_dictionary_copy) + fit_dictionary_copy['quantile_values'] = 
quantiles + eval_heads_output(fit_dictionary_copy) + else: + eval_heads_output(fit_dictionary_copy) diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py index 446dda6bd..cfb0cda9f 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py @@ -93,29 +93,30 @@ def test_mlp_network(self): fit_dict = encoder.transform(fit_dict) for name, decoder in decoders.items(): - fit_dict_ = copy.copy(fit_dict) + with self.subTest(decoder_name=name): + fit_dict_ = copy.copy(fit_dict) - decoder = decoder.fit(fit_dict_) - fit_dict_ = decoder.transform(fit_dict_) + decoder = decoder.fit(fit_dict_) + fit_dict_ = decoder.transform(fit_dict_) - input_tensor = torch.randn([10, 20, 3 + fit_dict_['X_train'].shape[-1]]) - input_tensor_future = torch.randn([10, n_prediction_steps, 2 + fit_dict_['X_train'].shape[-1]]) + input_tensor = torch.randn([10, 20, 3 + fit_dict_['X_train'].shape[-1]]) + input_tensor_future = torch.randn([10, n_prediction_steps, 2 + fit_dict_['X_train'].shape[-1]]) - head = ForecastingHead() - head = head.fit(fit_dict_) - fit_dict_ = head.transform(fit_dict_) + head = ForecastingHead() + head = head.fit(fit_dict_) + fit_dict_ = head.transform(fit_dict_) - net_encoder = StackedEncoder(network_structure, False, - fit_dict_['network_encoder'], fit_dict_['network_decoder']) - net_decoder = StackedDecoder(network_structure, net_encoder.encoder, fit_dict_['network_encoder'], - fit_dict_['network_decoder']) + net_encoder = StackedEncoder(network_structure, False, + fit_dict_['network_encoder'], fit_dict_['network_decoder']) + net_decoder = StackedDecoder(network_structure, net_encoder.encoder, fit_dict_['network_encoder'], + fit_dict_['network_decoder']) - head = fit_dict_['network_head'] + head = fit_dict_['network_head'] - encoder2decoder, _ = net_encoder(input_tensor, [None]) - output = head(net_decoder(input_tensor_future, encoder2decoder)) + encoder2decoder, _ = net_encoder(input_tensor, [None]) + output = head(net_decoder(input_tensor_future, encoder2decoder)) - self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) + self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) def test_nbeats_network(self): n_prediction_steps = self.dataset_properties['n_prediction_steps'] @@ -186,36 +187,37 @@ def test_nbeats_network(self): fit_dict = encoder.transform(fit_dict) for decoder_idx, decoder in enumerate([nbeats_i, nbeats_g]): - fit_dict = copy.copy(fit_dict) - fit_dict_ = copy.copy(fit_dict) - - decoder = decoder.fit(fit_dict_) - fit_dict_ = decoder.transform(fit_dict_) - - input_tensor = torch.randn([10, 20, 1]) - - head = ForecastingHead() - head = head.fit(fit_dict_) - fit_dict_ = head.transform(fit_dict_) - - encoder_net = fit_dict_['network_encoder']['block_1'].encoder - decoder_net = fit_dict_['network_decoder']['block_1'].decoder - idx_tracker = 0 - if decoder_idx == 0: - # only check nbeats_i - for i_stack in range(1, 1 + nbeatsI_cfg['num_stacks_i']): - num_blocks = nbeatsI_cfg[f'num_blocks_i_{i_stack}'] - idx_end = idx_tracker + num_blocks - num_individual_models = len(set(decoder_net[idx_tracker:idx_end])) - if nbeatsI_cfg[f'weight_sharing_i_{i_stack}']: - self.assertEqual(num_individual_models, 1) - else: - self.assertEqual(num_individual_models, num_blocks) - idx_tracker = 
idx_end - - input_tensor = encoder_net(input_tensor, output_seq=False) - - for block in decoder_net: - backcast_block, forecast_block = block([None], input_tensor) - self.assertListEqual(list(backcast_block.shape), [10, window_size * 1]) - self.assertListEqual(list(forecast_block.shape), [10, n_prediction_steps * 1]) + with self.subTest(decoder_idx=decoder_idx): + fit_dict = copy.copy(fit_dict) + fit_dict_ = copy.copy(fit_dict) + + decoder = decoder.fit(fit_dict_) + fit_dict_ = decoder.transform(fit_dict_) + + input_tensor = torch.randn([10, 20, 1]) + + head = ForecastingHead() + head = head.fit(fit_dict_) + fit_dict_ = head.transform(fit_dict_) + + encoder_net = fit_dict_['network_encoder']['block_1'].encoder + decoder_net = fit_dict_['network_decoder']['block_1'].decoder + idx_tracker = 0 + if decoder_idx == 0: + # only check nbeats_i + for i_stack in range(1, 1 + nbeatsI_cfg['num_stacks_i']): + num_blocks = nbeatsI_cfg[f'num_blocks_i_{i_stack}'] + idx_end = idx_tracker + num_blocks + num_individual_models = len(set(decoder_net[idx_tracker:idx_end])) + if nbeatsI_cfg[f'weight_sharing_i_{i_stack}']: + self.assertEqual(num_individual_models, 1) + else: + self.assertEqual(num_individual_models, num_blocks) + idx_tracker = idx_end + + input_tensor = encoder_net(input_tensor, output_seq=False) + + for block in decoder_net: + backcast_block, forecast_block = block([None], input_tensor) + self.assertListEqual(list(backcast_block.shape), [10, window_size * 1]) + self.assertListEqual(list(forecast_block.shape), [10, n_prediction_steps * 1]) diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py index 2155b53da..e2ba3b615 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py @@ -73,141 +73,141 @@ def test_config_space(self): def test_deepar(self): for i, valid_encoder in enumerate(['RNNEncoder', 'TransformerEncoder', 'TCNEncoder', 'InceptionTimeEncoder']): - seq_encoder_choice = SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties) - update_ar = HyperparameterSearchSpaceUpdate(node_name="network_backbone", - hyperparameter='auto_regressive', - value_range=(True,), - default_value=True, ) - update_rnn_mlp = HyperparameterSearchSpaceUpdate(node_name="network_backbone", - hyperparameter='decoder_type', - value_range=('MLPDecoder',), - default_value='MLPDecoder', ) - update_transformer_mlp = HyperparameterSearchSpaceUpdate(node_name="network_backbone", - hyperparameter='decoder_type', - value_range=('MLPDecoder',), - default_value='MLPDecoder', ) - seq_encoder_choice._cs_updates = {"block_1:RNNEncoder:decoder_type": update_rnn_mlp, - "block_1:TransformerEncoder:decoder_type": update_transformer_mlp, - "block_1:MLPDecoder:auto_regressive": update_ar} + with self.subTest(valid_encoder=valid_encoder): + seq_encoder_choice = SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties) + update_ar = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='auto_regressive', + value_range=(True,), + default_value=True, ) + update_rnn_mlp = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='decoder_type', + value_range=('MLPDecoder',), + default_value='MLPDecoder', ) + update_transformer_mlp = HyperparameterSearchSpaceUpdate(node_name="network_backbone", 
+ hyperparameter='decoder_type', + value_range=('MLPDecoder',), + default_value='MLPDecoder', ) + seq_encoder_choice._cs_updates = {"block_1:RNNEncoder:decoder_type": update_rnn_mlp, + "block_1:TransformerEncoder:decoder_type": update_transformer_mlp, + "block_1:MLPDecoder:auto_regressive": update_ar} + + cs_seq = seq_encoder_choice.get_hyperparameter_search_space(dataset_properties=self.dataset_properties, + include=[valid_encoder]) + sample = cs_seq.get_default_configuration() - cs_seq = seq_encoder_choice.get_hyperparameter_search_space(dataset_properties=self.dataset_properties, - include=[valid_encoder]) - sample = cs_seq.get_default_configuration() + seq_encoder_choice.set_hyperparameters(sample) + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = self.dataset_properties - seq_encoder_choice.set_hyperparameters(sample) + encoder_choices = seq_encoder_choice.fit(fit_dict) + fit_dict = encoder_choices.transform(fit_dict) - fit_dict = copy.copy(self.fit_dictionary) - fit_dict['dataset_properties'] = self.dataset_properties + head = ForecastingHead() + head = head.fit(fit_dict) + fit_dict = head.transform(fit_dict) - encoder_choices = seq_encoder_choice.fit(fit_dict) - fit_dict = encoder_choices.transform(fit_dict) + net_encoder = StackedEncoder(fit_dict['network_structure'], False, + fit_dict['network_encoder'], fit_dict['network_decoder']) + net_decoder = StackedDecoder(fit_dict['network_structure'], net_encoder.encoder, + fit_dict['network_encoder'], + fit_dict['network_decoder']) - head = ForecastingHead() - head = head.fit(fit_dict) - fit_dict = head.transform(fit_dict) + head = fit_dict['network_head'] + if i < 2: + input_tensor = torch.randn([10, 20, 59]) # 53 + 6(lag values) + input_tensor_future = torch.randn([10, 1, 59]) + else: + input_tensor = torch.randn([10, 20, 53]) # no lag + input_tensor_future = torch.randn([10, 1, 53]) - net_encoder = StackedEncoder(fit_dict['network_structure'], False, - fit_dict['network_encoder'], fit_dict['network_decoder']) - net_decoder = StackedDecoder(fit_dict['network_structure'], net_encoder.encoder, - fit_dict['network_encoder'], - fit_dict['network_decoder']) + encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor, + additional_input=[None], + cache_intermediate_state=True, + ) + output = head(net_decoder(x_future=None, encoder_output=encoder2decoder)) + self.assertListEqual(list(output.shape), [10, 1, 1]) - head = fit_dict['network_head'] - if i < 2: - input_tensor = torch.randn([10, 20, 59]) # 53 + 6(lag values) - input_tensor_future = torch.randn([10, 1, 59]) - else: - input_tensor = torch.randn([10, 20, 53]) # no lag - input_tensor_future = torch.randn([10, 1, 53]) - - encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor, - additional_input=[None], - cache_intermediate_state=True, - ) - output = head(net_decoder(x_future=None, encoder_output=encoder2decoder)) - try: + encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor_future, + additional_input=[None], + output_seq=False, cache_intermediate_state=True, + incremental_update=True + ) + output = head(net_decoder(x_future=None, encoder_output=encoder2decoder)) self.assertListEqual(list(output.shape), [10, 1, 1]) - except Exception: - import pdb - pdb.set_trace() - - encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor_future, - additional_input=[None], - output_seq=False, cache_intermediate_state=True, - incremental_update=True - ) - output = head(net_decoder(x_future=None, 
encoder_output=encoder2decoder)) - self.assertListEqual(list(output.shape), [10, 1, 1]) def test_seq2seq(self): n_prediction_steps = self.dataset_properties['n_prediction_steps'] for i, valid_encoder in enumerate(['RNNEncoder', 'TransformerEncoder']): - seq_encoder_choice = SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties) + with self.subTest(valid_encoder=valid_encoder): + seq_encoder_choice = SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties) - update_rnn_rnn = HyperparameterSearchSpaceUpdate(node_name="network_backbone", - hyperparameter='decoder_type', - value_range=('RNNDecoder',), - default_value='RNNDecoder', ) - update_trans_trans = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + update_rnn_rnn = HyperparameterSearchSpaceUpdate(node_name="network_backbone", hyperparameter='decoder_type', - value_range=('TransformerDecoder',), - default_value='TransformerDecoder', ) - - seq_encoder_choice._cs_updates = {"block_1:RNNEncoder:decoder_type": update_rnn_rnn, - "block_1:TransformerEncoder:decoder_type": update_trans_trans} - decoder_auto_regressive = HyperparameterSearchSpace( - hyperparameter="decoder_auto_regressive", - value_range=(True,), - default_value=True, - ) - - cs_seq = seq_encoder_choice.get_hyperparameter_search_space(dataset_properties=self.dataset_properties, - decoder_auto_regressive=decoder_auto_regressive, - include=[valid_encoder]) - sample = cs_seq.get_default_configuration() - - seq_encoder_choice.set_hyperparameters(sample) - - fit_dict = copy.copy(self.fit_dictionary) - fit_dict['dataset_properties'] = self.dataset_properties - - encoder_choices = seq_encoder_choice.fit(fit_dict) - fit_dict = encoder_choices.transform(fit_dict) - - head = ForecastingHead() - head = head.fit(fit_dict) - fit_dict = head.transform(fit_dict) - - net_encoder = StackedEncoder(fit_dict['network_structure'], False, - fit_dict['network_encoder'], fit_dict['network_decoder']) - net_decoder = StackedDecoder(fit_dict['network_structure'], net_encoder.encoder, - fit_dict['network_encoder'], - fit_dict['network_decoder']) - - head = fit_dict['network_head'] - - input_tensor = torch.randn([10, 20, 59]) # 53 + 6(lag values) - input_tensor_future = torch.randn([10, n_prediction_steps, 59]) - - encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor, - additional_input=[None], - cache_intermediate_state=True, - ) - output = head(net_decoder(x_future=input_tensor_future, encoder_output=encoder2decoder)) - self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) - - net_encoder.eval() - net_decoder.eval() - input_tensor_future = torch.randn([10, 1, 59]) - - encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor_future, - additional_input=[None], - output_seq=False, cache_intermediate_state=True, - incremental_update=True - ) - output = head(net_decoder(x_future=input_tensor_future, encoder_output=encoder2decoder)) - self.assertListEqual(list(output.shape), [10, 1, 1]) + value_range=('RNNDecoder',), + default_value='RNNDecoder', ) + update_trans_trans = HyperparameterSearchSpaceUpdate(node_name="network_backbone", + hyperparameter='decoder_type', + value_range=('TransformerDecoder',), + default_value='TransformerDecoder', ) + + seq_encoder_choice._cs_updates = {"block_1:RNNEncoder:decoder_type": update_rnn_rnn, + "block_1:TransformerEncoder:decoder_type": update_trans_trans} + decoder_auto_regressive = HyperparameterSearchSpace( + hyperparameter="decoder_auto_regressive", + value_range=(True,), + 
default_value=True, + ) + + cs_seq = seq_encoder_choice.get_hyperparameter_search_space( + dataset_properties=self.dataset_properties, + decoder_auto_regressive=decoder_auto_regressive, + include=[valid_encoder] + ) + sample = cs_seq.get_default_configuration() + + seq_encoder_choice.set_hyperparameters(sample) + + fit_dict = copy.copy(self.fit_dictionary) + fit_dict['dataset_properties'] = self.dataset_properties + + encoder_choices = seq_encoder_choice.fit(fit_dict) + fit_dict = encoder_choices.transform(fit_dict) + + head = ForecastingHead() + head = head.fit(fit_dict) + fit_dict = head.transform(fit_dict) + + net_encoder = StackedEncoder(fit_dict['network_structure'], False, + fit_dict['network_encoder'], fit_dict['network_decoder']) + net_decoder = StackedDecoder(fit_dict['network_structure'], net_encoder.encoder, + fit_dict['network_encoder'], + fit_dict['network_decoder']) + + head = fit_dict['network_head'] + + input_tensor = torch.randn([10, 20, 59]) # 53 + 6(lag values) + input_tensor_future = torch.randn([10, n_prediction_steps, 59]) + + encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor, + additional_input=[None], + cache_intermediate_state=True, + ) + output = head(net_decoder(x_future=input_tensor_future, encoder_output=encoder2decoder)) + self.assertListEqual(list(output.shape), [10, n_prediction_steps, 1]) + + net_encoder.eval() + net_decoder.eval() + input_tensor_future = torch.randn([10, 1, 59]) + + encoder2decoder, encoder_output = net_encoder(encoder_input=input_tensor_future, + additional_input=[None], + output_seq=False, cache_intermediate_state=True, + incremental_update=True + ) + output = head(net_decoder(x_future=input_tensor_future, encoder_output=encoder2decoder)) + self.assertListEqual(list(output.shape), [10, 1, 1]) def test_seq_models(self): update = HyperparameterSearchSpaceUpdate(node_name="network_backbone", @@ -234,11 +234,11 @@ def test_seq_models(self): hp_decoder_auto_regressive = hp_values[2] hp_skip_connection = hp_values[3] hp_skip_connection_type = hp_values[4] - with self.subTest(hp_variable_selection=hp_values[0], - hp_use_temporal_fusion=hp_values[1], - hp_decoder_auto_regressive=hp_values[2], - hp_skip_connection=hp_values[3], - hp_skip_connection_type=hp_values[4]): + with self.subTest(hp_variable_selection=hp_variable_selection, + hp_use_temporal_fusion=hp_use_temporal_fusion, + hp_decoder_auto_regressive=hp_decoder_auto_regressive, + hp_skip_connection=hp_skip_connection, + hp_skip_connection_type=hp_skip_connection_type): variable_selection = HyperparameterSearchSpace('variable_selection', (hp_variable_selection,), hp_variable_selection) use_temporal_fusion = HyperparameterSearchSpace('use_temporal_fusion', From 2d2e039970334b0c9d7bbd10c7f11bf5aef99ca5 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 24 May 2022 14:30:23 +0200 Subject: [PATCH 287/347] flake 8 and update requirement --- autoPyTorch/api/base_task.py | 242 +++++++++--------- autoPyTorch/data/base_target_validator.py | 1 - .../data/time_series_feature_validator.py | 4 +- .../data/time_series_forecasting_validator.py | 3 - .../data/time_series_target_validator.py | 1 - autoPyTorch/datasets/time_series_dataset.py | 16 +- autoPyTorch/ensemble/ensemble_selection.py | 2 +- autoPyTorch/evaluation/abstract_evaluator.py | 1 - .../scaling/utils.py | 2 +- .../setup/network/forecasting_architecture.py | 3 +- .../network_backbone/base_network_backbone.py | 1 - .../forecasting_backbone/cells.py | 3 +- .../forecasting_decoder/NBEATSDecoder.py | 35 +-- 
.../forecasting_decoder/RNNDecoder.py | 10 +- .../forecasting_decoder/TransformerDecoder.py | 17 +- .../base_forecasting_decoder.py | 14 +- .../forecasting_decoder/components.py | 2 +- .../forecasting_encoder/__init__.py | 4 - .../base_forecasting_encoder.py | 12 +- .../forecasting_encoder/components.py | 6 +- .../flat_encoder/MLPEncoder.py | 5 +- .../flat_encoder/NBEATSEncoder.py | 10 +- .../seq_encoder/InceptionTimeEncoder.py | 5 +- .../seq_encoder/TCNEncoder.py | 3 +- .../seq_encoder/__init__.py | 12 +- .../other_components/TemporalFusion.py | 14 +- .../setup/network_backbone/utils.py | 4 +- .../forecasting_network_head/distribution.py | 4 +- .../forecasting_head.py | 15 +- .../training/data_loader/base_data_loader.py | 1 - .../time_series_forecasting_data_loader.py | 3 +- .../pipeline/time_series_forecasting.py | 11 +- setup.py | 1 + test/test_api/test_api.py | 7 +- test/test_api/test_base_api.py | 2 +- test/test_api/utils.py | 2 +- .../test_time_series_feature_validator.py | 7 - .../test_resampling_strategies.py | 1 - .../test_time_series_datasets.py | 8 +- test/test_evaluation/evaluation_util.py | 2 - .../test_forecasting_evaluators.py | 1 - .../preprocessing/forecasting/base.py | 1 - .../forecasting/test_encoder_choice.py | 3 +- .../forecasting/test_encoders.py | 6 +- .../preprocessing/forecasting/test_scaling.py | 4 +- .../test_time_series_transformer.py | 18 +- .../test_base_components.py | 1 - .../test_forecasting_architecture.py | 12 +- .../forecasting_networks/test_seq_encoder.py | 2 +- .../test_forecasting_target_scaling.py | 2 +- .../test_forecasting_training_losses.py | 1 - .../test_pipeline/components/training/base.py | 1 - test/test_pipeline/test_metrics.py | 10 +- 53 files changed, 249 insertions(+), 309 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index e33d4e415..86b28ed7e 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -81,7 +81,7 @@ def _pipeline_predict(pipeline: BasePipeline, batch_size: int, logger: PicklableClientLogger, task: int, - forecasting_task: bool=False) -> np.ndarray: + forecasting_task: bool = False) -> np.ndarray: @typing.no_type_check def send_warnings_to_log( message, category, filename, lineno, file=None, line=None): @@ -168,25 +168,25 @@ class BaseTask(ABC): """ def __init__( - self, - seed: int = 1, - n_jobs: int = 1, - n_threads: int = 1, - logging_config: Optional[Dict] = None, - ensemble_size: int = 50, - ensemble_nbest: int = 50, - max_models_on_disc: int = 50, - temporary_directory: Optional[str] = None, - output_directory: Optional[str] = None, - delete_tmp_folder_after_terminate: bool = True, - delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict[str, Any]] = None, - exclude_components: Optional[Dict[str, Any]] = None, - backend: Optional[Backend] = None, - resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, - resampling_strategy_args: Optional[Dict[str, Any]] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - task_type: Optional[str] = None + self, + seed: int = 1, + n_jobs: int = 1, + n_threads: int = 1, + logging_config: Optional[Dict] = None, + ensemble_size: int = 50, + ensemble_nbest: int = 50, + max_models_on_disc: int = 50, + temporary_directory: Optional[str] = None, + output_directory: Optional[str] = None, + delete_tmp_folder_after_terminate: bool = True, + delete_output_folder_after_terminate: bool = True, + include_components: Optional[Dict[str, Any]] = None, + 
exclude_components: Optional[Dict[str, Any]] = None, + backend: Optional[Backend] = None, + resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + task_type: Optional[str] = None ) -> None: if isinstance(resampling_strategy, NoResamplingStrategyTypes) and ensemble_size != 0: @@ -264,11 +264,11 @@ def __init__( @abstractmethod def build_pipeline( - self, - dataset_properties: Dict[str, BaseDatasetPropertiesType], - include_components: Optional[Dict[str, Any]] = None, - exclude_components: Optional[Dict[str, Any]] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ) -> BasePipeline: """ Build pipeline according to current task @@ -301,15 +301,15 @@ def build_pipeline( @abstractmethod def _get_dataset_input_validator( - self, - X_train: Union[List, pd.DataFrame, np.ndarray], - y_train: Union[List, pd.DataFrame, np.ndarray], - X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - resampling_strategy: Optional[ResamplingStrategies] = None, - resampling_strategy_args: Optional[Dict[str, Any]] = None, - dataset_name: Optional[str] = None, - dataset_compression: Optional[DatasetCompressionSpec] = None, + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + dataset_compression: Optional[DatasetCompressionSpec] = None, ) -> Tuple[BaseDataset, BaseInputValidator]: """ Returns an object of a child class of `BaseDataset` and @@ -347,15 +347,15 @@ def _get_dataset_input_validator( raise NotImplementedError def get_dataset( - self, - X_train: Union[List, pd.DataFrame, np.ndarray], - y_train: Union[List, pd.DataFrame, np.ndarray], - X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - resampling_strategy: Optional[ResamplingStrategies] = None, - resampling_strategy_args: Optional[Dict[str, Any]] = None, - dataset_name: Optional[str] = None, - dataset_compression: Optional[DatasetCompressionSpec] = None, + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + dataset_compression: Optional[DatasetCompressionSpec] = None, ) -> BaseDataset: """ Returns an object of a child class of `BaseDataset` according to the current task. 
@@ -619,9 +619,9 @@ def _close_dask_client(self) -> None: None """ if ( - hasattr(self, '_is_dask_client_internally_created') - and self._is_dask_client_internally_created - and self._dask_client + hasattr(self, '_is_dask_client_internally_created') + and self._is_dask_client_internally_created + and self._dask_client ): self._dask_client.shutdown() self._dask_client.close() @@ -914,26 +914,26 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: return def _search( - self, - optimize_metric: str, - dataset: BaseDataset, - budget_type: str = 'epochs', - min_budget: Union[int, float] = 5, - max_budget: Union[int, float] = 50, - total_walltime_limit: int = 100, - func_eval_time_limit_secs: Optional[int] = None, - enable_traditional_pipeline: bool = True, - memory_limit: Optional[int] = 4096, - smac_scenario_args: Optional[Dict[str, Any]] = None, - get_smac_object_callback: Optional[Callable] = None, - tae_func: Optional[Callable] = None, - all_supported_metrics: bool = True, - precision: int = 32, - disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, - load_models: bool = True, - portfolio_selection: Optional[str] = None, - dask_client: Optional[dask.distributed.Client] = None, - **kwargs: Dict[str, Any] + self, + optimize_metric: str, + dataset: BaseDataset, + budget_type: str = 'epochs', + min_budget: Union[int, float] = 5, + max_budget: Union[int, float] = 50, + total_walltime_limit: int = 100, + func_eval_time_limit_secs: Optional[int] = None, + enable_traditional_pipeline: bool = True, + memory_limit: Optional[int] = 4096, + smac_scenario_args: Optional[Dict[str, Any]] = None, + get_smac_object_callback: Optional[Callable] = None, + tae_func: Optional[Callable] = None, + all_supported_metrics: bool = True, + precision: int = 32, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + load_models: bool = True, + portfolio_selection: Optional[str] = None, + dask_client: Optional[dask.distributed.Client] = None, + **kwargs: Dict[str, Any] ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -1097,8 +1097,8 @@ def _search( self._all_supported_metrics = all_supported_metrics self._disable_file_output = disable_file_output if disable_file_output is not None else [] if ( - DisableFileOutputParameters.y_optimization in self._disable_file_output - and self.ensemble_size > 1 + DisableFileOutputParameters.y_optimization in self._disable_file_output + and self.ensemble_size > 1 ): self._logger.warning(f"No ensemble will be created when {DisableFileOutputParameters.y_optimization}" f" is in disable_file_output") @@ -1339,10 +1339,10 @@ def _search( return self def _get_fit_dictionary( - self, - dataset_properties: Dict[str, BaseDatasetPropertiesType], - dataset: BaseDataset, - split_id: int = 0 + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + dataset: BaseDataset, + split_id: int = 0 ) -> Dict[str, Any]: X_test = dataset.test_tensors[0].copy() if dataset.test_tensors is not None else None y_test = dataset.test_tensors[1].copy() if dataset.test_tensors is not None else None @@ -1367,9 +1367,9 @@ def _get_fit_dictionary( return X def refit( - self, - dataset: BaseDataset, - split_id: int = 0 + self, + dataset: BaseDataset, + split_id: int = 0 ) -> "BaseTask": """ Refit all models found with fit to new data. 
@@ -1435,28 +1435,28 @@ def refit( return self def fit_pipeline( - self, - configuration: Configuration, - *, - dataset: Optional[BaseDataset] = None, - X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - dataset_name: Optional[str] = None, - resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes]] = None, - resampling_strategy_args: Optional[Dict[str, Any]] = None, - run_time_limit_secs: int = 60, - memory_limit: Optional[int] = None, - eval_metric: Optional[str] = None, - all_supported_metrics: bool = False, - budget_type: Optional[str] = None, - include_components: Optional[Dict[str, Any]] = None, - exclude_components: Optional[Dict[str, Any]] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - budget: Optional[float] = None, - pipeline_options: Optional[Dict] = None, - disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + self, + configuration: Configuration, + *, + dataset: Optional[BaseDataset] = None, + X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + dataset_name: Optional[str] = None, + resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes]] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + run_time_limit_secs: int = 60, + memory_limit: Optional[int] = None, + eval_metric: Optional[str] = None, + all_supported_metrics: bool = False, + budget_type: Optional[str] = None, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + budget: Optional[float] = None, + pipeline_options: Optional[Dict] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, ) -> Tuple[Optional[BasePipeline], RunInfo, RunValue, BaseDataset]: """ Fit a pipeline on the given task for the budget. @@ -1571,8 +1571,8 @@ def fit_pipeline( if dataset is None: if ( - X_train is not None - and y_train is not None + X_train is not None + and y_train is not None ): raise ValueError("No dataset provided, must provide X_train, y_train tensors") dataset = self.get_dataset(X_train=X_train, @@ -1685,12 +1685,12 @@ def fit_pipeline( return fitted_pipeline, run_info, run_value, dataset def _get_fitted_pipeline( - self, - dataset_name: str, - pipeline_idx: int, - run_info: RunInfo, - run_value: RunValue, - disable_file_output: List[Union[str, DisableFileOutputParameters]] + self, + dataset_name: str, + pipeline_idx: int, + run_info: RunInfo, + run_value: RunValue, + disable_file_output: List[Union[str, DisableFileOutputParameters]] ) -> Optional[BasePipeline]: if self._logger is None: @@ -1716,10 +1716,10 @@ def _get_fitted_pipeline( ) def predict( - self, - X_test: np.ndarray, - batch_size: Optional[int] = None, - n_jobs: int = 1 + self, + X_test: np.ndarray, + batch_size: Optional[int] = None, + n_jobs: int = 1 ) -> np.ndarray: """Generate the estimator predictions. Generate the predictions based on the given examples from the test set. 
@@ -1771,9 +1771,9 @@ def predict( return predictions def score( - self, - y_pred: np.ndarray, - y_test: Union[np.ndarray, pd.DataFrame] + self, + y_pred: np.ndarray, + y_test: Union[np.ndarray, pd.DataFrame] ) -> Dict[str, float]: """Calculate the score on the test set. Calculate the evaluation measure on the test set. @@ -1818,8 +1818,8 @@ def __del__(self) -> None: self._backend.context.delete_directories(force=False) def get_incumbent_results( - self, - include_traditional: bool = False + self, + include_traditional: bool = False ) -> Tuple[Configuration, Dict[str, Union[int, str, float]]]: """ Get Incumbent config and the corresponding results @@ -1920,13 +1920,13 @@ def sprint_statistics(self) -> str: ) def plot_perf_over_time( - self, - metric_name: str, - ax: Optional[plt.Axes] = None, - plot_setting_params: PlotSettingParams = PlotSettingParams(), - color_label_settings: ColorLabelSettings = ColorLabelSettings(), - *args: Any, - **kwargs: Any + self, + metric_name: str, + ax: Optional[plt.Axes] = None, + plot_setting_params: PlotSettingParams = PlotSettingParams(), + color_label_settings: ColorLabelSettings = ColorLabelSettings(), + *args: Any, + **kwargs: Any ) -> None: """ Visualize the performance over time using matplotlib. diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py index 7de67316b..9943d5c55 100644 --- a/autoPyTorch/data/base_target_validator.py +++ b/autoPyTorch/data/base_target_validator.py @@ -187,4 +187,3 @@ def is_single_column_target(self) -> bool: @property def allow_missing_values(self) -> bool: return False - diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index 920917b01..0e4854138 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -1,11 +1,11 @@ import logging -from typing import Optional, Union, Tuple, List +from typing import Optional, Union, List import pandas as pd import numpy as np from scipy.sparse import issparse from sklearn.base import BaseEstimator -from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator, SupportedFeatTypes +from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator from autoPyTorch.utils.logging_ import PicklableClientLogger diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 7be693e8a..ca142de25 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -5,13 +5,10 @@ from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError -from scipy import sparse - from autoPyTorch.data.utils import DatasetCompressionSpec from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator from autoPyTorch.data.time_series_target_validator import TimeSeriesTargetValidator -from autoPyTorch.data.base_feature_validator import SupportedFeatTypes class TimeSeriesForecastingInputValidator(TabularInputValidator): diff --git a/autoPyTorch/data/time_series_target_validator.py b/autoPyTorch/data/time_series_target_validator.py index 2e5ae137c..ed017e5fc 100644 --- a/autoPyTorch/data/time_series_target_validator.py +++ b/autoPyTorch/data/time_series_target_validator.py @@ -67,4 +67,3 @@ def transform(self, @property def allow_missing_values(self) -> bool: return True - 
diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index acd2c4846..4f3aeca9f 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -205,9 +205,8 @@ def __getitem__(self, index: int, train: bool = True) \ if self.known_future_features_index: if not self.is_test_set: - future_features = self.X[ - index + 1: index + self.n_prediction_steps + 1, self.known_future_features_index - ] + future_features = self.X[index + 1: index + self.n_prediction_steps + 1, + self.known_future_features_index] else: future_features = self.X_test[:, self.known_future_features_index] else: @@ -523,12 +522,11 @@ def __init__(self, self.normalize_y = normalize_y - sequence_datasets, train_tensors, test_tensors, sequence_lengths = self.transform_data_into_time_series_sequence( - X, Y, - start_times=self.start_times, - X_test=X_test, - Y_test=Y_test, ) - + training_sets = self.transform_data_into_time_series_sequence(X, Y, + start_times=self.start_times, + X_test=X_test, + Y_test=Y_test, ) + sequence_datasets, train_tensors, test_tensors, sequence_lengths = training_sets Y = train_tensors[1] ConcatDataset.__init__(self, datasets=sequence_datasets) diff --git a/autoPyTorch/ensemble/ensemble_selection.py b/autoPyTorch/ensemble/ensemble_selection.py index 442f9c017..6843e28f6 100644 --- a/autoPyTorch/ensemble/ensemble_selection.py +++ b/autoPyTorch/ensemble/ensemble_selection.py @@ -1,5 +1,5 @@ from collections import Counter -from typing import Any, Dict, List, Tuple, Union, Optional +from typing import Any, Dict, List, Tuple, Union import numpy as np diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index b11d86f4b..939dd2307 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -327,7 +327,6 @@ def fit(self, X: Dict[str, Any], y: Any, y_train = subsampler(X['y_train'], X['train_indices']) return DummyClassifier.fit(self, np.ones((y_train.shape[0], 1)), y_train, sample_weight) - def _genreate_dummy_forecasting(self, X): if isinstance(X[0], TimeSeriesSequence): X_tail = [x.get_target_values(-1) for x in X] diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index 1e77db194..afcca204c 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -1,4 +1,4 @@ -from typing import Any, Union, Tuple, List +from typing import Any, Union, Tuple import numpy as np import pandas as pd diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 0447923d5..d117d9b2f 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -495,7 +495,6 @@ def forward(self, decoder_output = self.decoder(x_future=x_future, encoder_output=encoder2decoder, pos_idx=(x_past.shape[1], x_past.shape[1] + self.n_prediction_steps)) - if self.has_temporal_fusion: decoder_output = self.temporal_fusion(encoder_output=encoder_output, decoder_output=decoder_output, @@ -791,7 +790,7 @@ def forward(self, past_observed_targets=past_observed_targets, 
decoder_length=idx_pred + 1, static_embedding=repeated_x_static, - )[:, -1:,] + )[:, -1:, ] net_output = self.head(decoder_output) samples = net_output.sample().cpu() diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index c415217da..6145ca91a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -122,4 +122,3 @@ def get_name(cls) -> str: str: Name of the backbone """ return str(cls.get_properties()["shortname"]) - diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 2b8d20521..00955ecc9 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -260,7 +260,7 @@ def __init__(self, if time_feature_names: for name in time_feature_names: - feature_names2tensor_idx[name] = [idx_tracker, idx_tracker+1] + feature_names2tensor_idx[name] = [idx_tracker, idx_tracker + 1] future_feature_name2tensor_idx[name] = [idx_tracker_future, idx_tracker_future + 1] idx_tracker += 1 idx_tracker_future += 1 @@ -370,7 +370,6 @@ def __init__(self, if network_encoder['block_1'].encoder_properties.has_hidden_states: n_hidden_states = network_encoder['block_1'].n_hidden_states - static_context_initial_hidden = [GatedResidualNetwork(input_size=self.hidden_size, hidden_size=self.hidden_size, output_size=self.hidden_size, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 8824199c8..5331c8994 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -13,12 +13,13 @@ from autoPyTorch.pipeline.components.setup.network_head.utils import _activations from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter -from autoPyTorch.pipeline.components.setup.network_backbone.\ +from autoPyTorch.pipeline.components.setup.network_backbone. 
\ forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder, DecoderProperties from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( DecoderNetwork ) + class NBEATSBLock(DecoderNetwork): def __init__(self, n_in_features: int, @@ -74,7 +75,8 @@ def _add_layer(self, layers: List[nn.Module], in_features: int) -> None: if self.use_dropout: layers.append(nn.Dropout(self.dropout_rate)) - def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor, pos_idx: Optional[Tuple[int]] = None): + def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor, + pos_idx: Optional[Tuple[int]] = None): if self.backcast_head is None and self.forecast_head is None: # used to compute head dimensions return self.backbone(encoder_output) @@ -141,21 +143,20 @@ def _build_decoder(self, else: raise ValueError(f"Unsupported stack_type {stack_type}") - stacks[stack_idx - 1].append(NBEATSBLock(in_features, - stack_idx=stack_idx, - stack_type=stack_type, - num_blocks=self.config['num_blocks_i_%d' % stack_idx], - num_layers=self.config['num_layers_i_%d' % stack_idx], - width=self.config['width_i_%d' % stack_idx], - normalization=self.config['normalization'], - activation=self.config['activation'], - weight_sharing=self.config[f'weight_sharing_i_%d' % - stack_idx], - expansion_coefficient_length=ecl, - use_dropout=self.config['use_dropout_i'], - dropout_rate=self.config.get('dropout_i_%d' % - stack_idx, None), - )) + stacks[stack_idx - 1].append(NBEATSBLock( + in_features, + stack_idx=stack_idx, + stack_type=stack_type, + num_blocks=self.config['num_blocks_i_%d' % stack_idx], + num_layers=self.config['num_layers_i_%d' % stack_idx], + width=self.config['width_i_%d' % stack_idx], + normalization=self.config['normalization'], + activation=self.config['activation'], + weight_sharing=self.config['weight_sharing_i_%d' % stack_idx], + expansion_coefficient_length=ecl, + use_dropout=self.config['use_dropout_i'], + dropout_rate=self.config.get('dropout_i_%d' % stack_idx, None), + )) else: raise ValueError(f"Unsupported n_beats_type: {n_beats_type}") return stacks, stacks[-1][-1].width diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index b35a0114c..46644663f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -4,7 +4,6 @@ from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import Constant - import torch from torch import nn @@ -12,11 +11,8 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_backbone.\ - forecasting_backbone.forecasting_decoder.base_forecasting_decoder import ( - BaseForecastingDecoder, - DecoderProperties -) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. 
\ + base_forecasting_decoder import BaseForecastingDecoder, DecoderProperties from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( DecoderNetwork ) @@ -31,7 +27,7 @@ def __init__(self, num_layers: int, cell_type: str, dropout: float, - lagged_value: Optional[Union[List, np.ndarray]]=None): + lagged_value: Optional[Union[List, np.ndarray]] = None): super().__init__() if cell_type == 'lstm': cell = nn.LSTM diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index ce10f051d..404a8312c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -16,16 +16,16 @@ from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.utils.common import add_hyperparameter -from autoPyTorch.pipeline.components.setup.network_backbone. \ - forecasting_backbone.forecasting_decoder.base_forecasting_decoder import ( - BaseForecastingDecoder, - DecoderProperties -) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ + base_forecasting_decoder import BaseForecastingDecoder, DecoderProperties + from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( DecoderNetwork ) -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ - PositionalEncoding, build_transformer_layers +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( + PositionalEncoding, + build_transformer_layers +) from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter, FitRequirement @@ -69,7 +69,8 @@ def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor, pos_idx: if self.use_positional_decoder: output = self.pos_encoding(output, pos_idx) if self.training: - output = self.transformer_decoder_layers(output, encoder_output, tgt_mask=self.tgt_mask.to(encoder_output.device)) + output = self.transformer_decoder_layers(output, encoder_output, + tgt_mask=self.tgt_mask.to(encoder_output.device)) else: output = self.transformer_decoder_layers(output, encoder_output) return output diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 0eb8d8670..932bd4f98 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -1,8 +1,7 @@ -from abc import abstractmethod, ABC -from typing import Any, Dict, Iterable, Tuple, List, Optional, NamedTuple +from abc import abstractmethod +from typing import Any, Dict, Iterable, Tuple, List, Optional from collections import OrderedDict -import torch from torch import nn from autoPyTorch.utils.common import FitRequirement @@ -140,13 +139,13 @@ def transform(self, X: Dict[str, 
Any]) -> Dict[str, Any]: decoder_output_shape=(self.n_prediction_heads, self.n_decoder_output_features) ) if self.is_last_decoder: - X.update({f'network_decoder': network_decoder, + X.update({'network_decoder': network_decoder, 'n_prediction_heads': self.n_prediction_heads, 'n_decoder_output_features': self.n_decoder_output_features, 'auto_regressive': self.auto_regressive}) else: - X.update({f'network_decoder': network_decoder, - f'n_decoder_output_features': self.n_decoder_output_features, + X.update({'network_decoder': network_decoder, + 'n_decoder_output_features': self.n_decoder_output_features, }) return X @@ -176,7 +175,7 @@ def _build_decoder(self, encoder_output_shape: Tuple[int, ...], future_variable_input: Tuple[int, ...], n_prediction_heads: int, - dataset_properties:Dict) -> Tuple[nn.Module, int]: + dataset_properties: Dict) -> Tuple[nn.Module, int]: """ Builds the head module and returns it @@ -204,4 +203,3 @@ def get_name(cls) -> str: str: Name of the head """ return str(cls.get_properties()["shortname"]) - diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py index fad87e403..28f924595 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Union, Tuple, List, NamedTuple +from typing import Optional, Tuple, NamedTuple import torch from torch import nn diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index 32c774cf7..fb73c88c5 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -379,7 +379,3 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchComponent: def transform(self, X: Dict) -> Dict: assert self.pipeline is not None, "Cannot call transform before the object is initialized" return self.pipeline.transform(X) - - @property - def _defaults_network(self): - return ['MLPEncoder'] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index 00175b353..9fc4451ce 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -1,9 +1,6 @@ import numpy as np from collections import OrderedDict -import pandas as pd -from scipy.sparse import csr_matrix - import torchvision from autoPyTorch.utils.common import FitRequirement from torch import nn @@ -45,7 +42,7 @@ def _required_fit_arguments(self) -> List[FitRequirement]: FitRequirement('uni_variant', (bool,), user_defined=False, dataset_property=True), FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), 
FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), - FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False), + FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False), FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), FitRequirement('time_feature_transform', (Iterable,), user_defined=False, dataset_property=True), FitRequirement('network_embedding', (nn.Module, ), user_defined=False, dataset_property=False), @@ -87,8 +84,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: if variable_selection: in_features = self.n_encoder_output_feature() elif self.encoder_properties().lagged_input and hasattr(self, 'lagged_value'): - in_features = len(self.lagged_value) * output_shape[-1] + \ - input_shape[-1] + in_features = len(self.lagged_value) * output_shape[-1] + input_shape[-1] else: in_features = output_shape[-1] + input_shape[-1] @@ -133,7 +129,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: encoder_output_shape=self.encoder_output_shape, n_hidden_states=self.n_hidden_states()) - X.update({f'network_encoder': network_encoder}) + X.update({'network_encoder': network_encoder}) return X @abstractmethod @@ -143,9 +139,7 @@ def build_encoder(self, Builds the backbone module and returns it Args: - targets_shape (Tuple[int, ...]): shape of target input_shape (Tuple[int, ...]): input feature shape - static_feature_shape (int): static feature shape. Returns: nn.Module: backbone module diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py index 9e7f67461..2023a4e96 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py @@ -2,7 +2,7 @@ import torch from torch import nn -from typing import Dict, Optional, Tuple, List, NamedTuple +from typing import Tuple, NamedTuple class EncoderProperties(NamedTuple): @@ -63,6 +63,4 @@ class EncoderOutputForm(Enum): NoOutput = 0 HiddenStates = 1 # RNN -> RNN Sequence = 2 # Transformer -> Transformer - SequenceLast = 3 #RNN/TCN/Transformer -> MLP - - + SequenceLast = 3 # RNN/TCN/Transformer -> MLP diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py index f2575463c..c88c0b8d1 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py @@ -7,9 +7,8 @@ from ConfigSpace.hyperparameters import CategoricalHyperparameter from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( - BaseForecastingEncoder, EncoderProperties -) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + 
base_forecasting_encoder import BaseForecastingEncoder, EncoderProperties from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( EncoderNetwork ) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py index 6fea17b70..43b3225b7 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py @@ -4,15 +4,14 @@ from ConfigSpace import ConfigurationSpace -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( - BaseForecastingEncoder, EncoderProperties -) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + base_forecasting_encoder import BaseForecastingEncoder, EncoderProperties from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder.MLPEncoder import \ - TimeSeriesMLP +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder.\ + MLPEncoder import TimeSeriesMLP class NBEATSEncoder(BaseForecastingEncoder): @@ -44,7 +43,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.check_requirements(X, y) self.window_size = X["window_size"] - input_shape = X["dataset_properties"]['input_shape'] # n-BEATS only requires targets as its input # TODO add support for multi-variant output_shape = X["dataset_properties"]['output_shape'] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py index aaa41feb0..7a5585fa3 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py @@ -9,9 +9,8 @@ from torch import nn from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.base_forecasting_encoder import ( - BaseForecastingEncoder, -) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + base_forecasting_encoder import BaseForecastingEncoder from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py index 
6c907f138..446245208 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py @@ -126,7 +126,7 @@ class TCNEncoder(BaseForecastingEncoder): def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: num_channels = [self.config["num_filters_1"]] kernel_size = [self.config["kernel_size_1"]] - dropout = self.config[f"dropout"] if self.config["use_dropout"] else 0.0 + dropout = self.config["dropout"] if self.config["use_dropout"] else 0.0 for i in range(2, self.config["num_blocks"] + 1): num_channels.append(self.config[f"num_filters_{i}"]) kernel_size.append(self.config[f"kernel_size_{i}"]) @@ -195,7 +195,6 @@ def get_hyperparameter_search_space( use_dropout = get_hyperparameter(use_dropout, CategoricalHyperparameter) cs.add_hyperparameter(use_dropout) - dropout_hp = get_hyperparameter(dropout, UniformFloatHyperparameter) cs.add_hyperparameter(dropout_hp) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index 803d42231..88355a113 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -5,6 +5,7 @@ import inspect from ConfigSpace.hyperparameters import ( + Hyperparameter, Constant, CategoricalHyperparameter, UniformFloatHyperparameter, @@ -35,8 +36,8 @@ base_forecasting_decoder import BaseForecastingDecoder from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ ForecastingNetworkStructure -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.other_components.TemporalFusion import \ - TemporalFusion +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.other_components. 
\ + TemporalFusion import TemporalFusion directory = os.path.split(__file__)[0] _encoders = find_components(__package__, @@ -401,7 +402,7 @@ def get_hyperparameter_search_space( # parent_hyperparameter=parent_hyperparameter ) - hps = cs.get_hyperparameters() # type: List[CSH.Hyperparameter] + hps = cs.get_hyperparameters() # type: List[Hyperparameter] conditions_to_add = [] for hp in hps: # TODO consider if this will raise any unexpected behavior @@ -477,7 +478,7 @@ def get_hyperparameter_search_space( forbidden_decoder_ar, ForbiddenEqualsClause(hp_decoder_type, decoder) ) - ) + ) else: if add_forbidden_for_non_ar_recurrent_decoder: forbiddens_decoder_auto_regressive.append( @@ -495,7 +496,6 @@ def get_hyperparameter_search_space( if conds_decoder_ar: cs.add_condition(OrConjunction(*conds_decoder_ar)) - use_temporal_fusion = get_hyperparameter(use_temporal_fusion, CategoricalHyperparameter) cs.add_hyperparameter(use_temporal_fusion) if True in use_temporal_fusion.choices: @@ -694,7 +694,7 @@ def set_hyperparameters(self, new_params[param] = value temporal_fusion = TemporalFusion(self.random_state, **new_params) - pipeline_steps.extend([(f'temporal_fusion', temporal_fusion)]) + pipeline_steps.extend([('temporal_fusion', temporal_fusion)]) self.pipeline = Pipeline(pipeline_steps) self.choice = self.encoder_choice[0] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py index 9eaebc6b8..afbfd4c88 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py @@ -1,19 +1,19 @@ import numpy as np from ConfigSpace import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformFloatHyperparameter, \ +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, UniformIntegerHyperparameter +) from ConfigSpace.conditions import EqualsCondition from autoPyTorch.utils.common import FitRequirement -from typing import Any, Dict, Iterable, Optional, Tuple, List, Union, NamedTuple +from typing import Any, Dict, Iterable, Optional, List -from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( - EncoderProperties, EncoderBlockInfo, EncoderNetwork -) + from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import TemporalFusionLayer from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter @@ -30,7 +30,7 @@ def __init__(self, attention_d_model_log: int = 4, use_dropout: bool = False, dropout_rate: Optional[float] = None, ): - autoPyTorchComponent.__init__(self) + autoPyTorchComponent.__init__(self, random_state=random_state) self.add_fit_requirements( self._required_fit_requirements ) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py 
b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index b7d4ba2af..0539df422 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -23,8 +23,8 @@ def get_output_shape(network: torch.nn.Module, input_shape: Tuple[int, ...], has Can and should be overridden by subclasses that know the output shape without running a dummy forward pass. :param input_shape: shape of the input - :param has_hidden_states: bool, if the network backbone contains a hidden_states. if yes, the network will return a Tuple, - we will then only consider the first item + :param has_hidden_states: bool, if the network backbone contains a hidden_states. if yes, + the network will return a Tuple, we will then only consider the first item :return: output_shape """ placeholder = torch.randn((2, *input_shape), dtype=torch.float) diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py index 8c97d0f5a..5eab2d1d5 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py @@ -29,7 +29,6 @@ Beta, Distribution, Gamma, - NegativeBinomial, Normal, Poisson, StudentT, @@ -185,7 +184,7 @@ class PoissonOutput(ProjectionLayer): def arg_dims(self) -> Dict[str, int]: return {"rate": 1} - def domain_map(self, rate: torch.Tensor) -> Tuple[torch.Tensor,]: + def domain_map(self, rate: torch.Tensor) -> Tuple[torch.Tensor]: rate_pos = F.softplus(rate).clone() return rate_pos.squeeze(-1), @@ -208,7 +207,6 @@ class DisForecastingStrategy(NamedTuple): num_samples: int = 100 aggregation: str = "mean" - # TODO find components that are compatible with beta, gamma and poisson distribution! 
# TODO consider how to implement NegativeBinomialOutput without scale information diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 17584f9bf..618018ee5 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -4,13 +4,9 @@ import torch from torch import nn from ConfigSpace import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformFloatHyperparameter, \ - UniformIntegerHyperparameter -from ConfigSpace.conditions import EqualsCondition from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( DecoderBlockInfo ) @@ -19,9 +15,6 @@ from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import \ ALL_DISTRIBUTIONS, DisForecastingStrategy from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.NBEATS_head import build_NBEATS_network -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter - -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import TemporalFusionLayer class QuantileHead(nn.Module): @@ -58,7 +51,7 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement('network_decoder', (Dict,), user_defined=False, dataset_property=False), FitRequirement('n_prediction_heads', (int,), user_defined=False, dataset_property=False), FitRequirement('output_shape', (Iterable,), user_defined=True, dataset_property=True), - FitRequirement('net_output_type', (str, ), user_defined=False, dataset_property=False), + FitRequirement('net_output_type', (str,), user_defined=False, dataset_property=False), FitRequirement('n_prediction_steps', (int,), user_defined=False, dataset_property=True) ] @@ -168,10 +161,10 @@ def build_head(self, output_shape: Tuple[int, ...], auto_regressive: bool = False, decoder_has_local_layer: bool = True, - net_output_type: str="distribution", + net_output_type: str = "distribution", dist_cls: Optional[str] = None, n_prediction_heads: int = 1, - num_quantiles:int = 3, + num_quantiles: int = 3, ) -> nn.Module: """ Builds the head module and returns it @@ -184,7 +177,7 @@ def build_head(self, net_output_type (str): network output type dist_cls (Optional[str]): output distribution, only works if required_net_out_put_type is 'distribution' n_prediction_heads (Dict): additional paramter for initializing architectures. 
How many heads to predict - num_quantile (int): number of quantile losses + num_quantiles (int): number of quantile losses Returns: nn.Module: head module diff --git a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py index dfa9f42a2..b1ee41bb2 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py @@ -109,7 +109,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: train_dataset = datamanager.get_dataset(split_id=X['split_id'], train=True) - self.train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=min(self.batch_size, len(train_dataset)), diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index acc3372a0..60bd84b86 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Union, Tuple, List +from typing import Any, Dict, Optional, Union, Tuple, List, Callable import warnings from functools import partial @@ -12,7 +12,6 @@ from sklearn.compose import ColumnTransformer import torch -from torch.utils.data.sampler import SubsetRandomSampler import torchvision diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 1e1f97b42..fd6e15a4b 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -193,7 +193,7 @@ def _get_hyperparameter_search_space(self, forbidden_hp_regression_loss = ForbiddenEqualsClause(hp_loss, 'RegressionLoss') for hp_dist in hp_distribution_children: forbidden_hp_dist = ForbiddenEqualsClause(hp_dist, True) - forbidden_hp_dist = AndConjunction(forbidden_hp_dist, + forbidden_hp_dist = AndConjunction(forbidden_hp_dist, forbidden_hp_regression_loss) forbidden_regression_losses_all.append(forbidden_hp_dist) else: @@ -257,11 +257,14 @@ def _get_hyperparameter_search_space(self, if 'MLPEncoder' in network_flat_encoder_hp.choices: forbidden = ['MLPEncoder'] - forbidden_deepAREncoder = [forbid for forbid in forbidden if forbid in network_flat_encoder_hp.choices] + forbidden_deepAREncoder = [ + forbid for forbid in forbidden if forbid in network_flat_encoder_hp.choices + ] for hp_ar in hp_deepAR: if True in hp_ar.choices: forbidden_hp_ar = ForbiddenEqualsClause(hp_ar, ar_forbidden) - forbidden_hp_mlpencoder = ForbiddenInClause(network_flat_encoder_hp, forbidden_deepAREncoder) + forbidden_hp_mlpencoder = ForbiddenInClause(network_flat_encoder_hp, + forbidden_deepAREncoder) forbidden_hp_ar_mlp = ForbiddenAndConjunction(forbidden_hp_ar, forbidden_hp_mlpencoder) forbidden_losses_all.append(forbidden_hp_ar_mlp) @@ -339,7 +342,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L steps.extend([("impute", TimeSeriesFeatureImputer(random_state=self.random_state)), ("scaler", BaseScaler(random_state=self.random_state)), ('feature_encoding', TimeSeriesEncoderChoice(default_dataset_properties, - random_state=self.random_state)), + random_state=self.random_state)), ("time_series_transformer", 
TimeSeriesFeatureTransformer(random_state=self.random_state)), ("preprocessing", TimeSeriesEarlyPreprocessing(random_state=self.random_state)), ]) diff --git a/setup.py b/setup.py index e1e3d47e2..b9df7f21a 100755 --- a/setup.py +++ b/setup.py @@ -58,6 +58,7 @@ "pre-commit", "pytest-cov", 'pytest-forked', + 'pytest-subtests', "codecov", "pep8", "mypy", diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 1ec93185b..bf8ab2eb0 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -409,7 +409,7 @@ def test_tabular_regression(openml_name, resampling_strategy, backend, resamplin @unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_train_function', new=dummy_eval_train_function) @pytest.mark.parametrize('forecasting_toy_dataset', ['multi_variant_wo_missing'], indirect=True) @pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', - (#(HoldoutValTypes.time_series_hold_out_validation, None), + ((HoldoutValTypes.time_series_hold_out_validation, None), (CrossValTypes.time_series_cross_validation, {'num_splits': CV_NUM_SPLITS}), )) def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, backend, resampling_strategy_args): @@ -465,8 +465,7 @@ def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, b # Internal dataset has expected settings assert estimator.dataset.task_type == 'time_series_forecasting' - expected_num_splits = HOLDOUT_NUM_SPLITS if resampling_strategy ==\ - HoldoutValTypes.time_series_hold_out_validation\ + expected_num_splits = HOLDOUT_NUM_SPLITS if resampling_strategy == HoldoutValTypes.time_series_hold_out_validation \ else CV_NUM_SPLITS assert estimator.resampling_strategy == resampling_strategy assert estimator.dataset.resampling_strategy == resampling_strategy @@ -1094,5 +1093,3 @@ def test_task_inference(ans, task_class, backend): estimator.get_dataset(X, y) else: estimator.get_dataset(X, y) - - diff --git a/test/test_api/test_base_api.py b/test/test_api/test_base_api.py index 7a9a972f0..bb8f9c061 100644 --- a/test/test_api/test_base_api.py +++ b/test/test_api/test_base_api.py @@ -170,7 +170,7 @@ def test_no_resampling_error(backend): (0.01, 1.0, 'num_seq', {'budget_type': 'num_seq', 'num_seq': 1.0}), (0.01, 1.0, 'num_sample_per_seq', {'budget_type': 'num_sample_per_seq', 'num_sample_per_seq': 1.0}), ]) -def test_pipeline_get_budget(fit_dictionary_forecasting, min_budget, max_budget, budget_type, expected): +def test_pipeline_get_budget_forecasting(fit_dictionary_forecasting, min_budget, max_budget, budget_type, expected): BaseTask.__abstractmethods__ = set() estimator = BaseTask(task_type='time_series_forecasting', ensemble_size=0) # Fixture pipeline config diff --git a/test/test_api/utils.py b/test/test_api/utils.py index 0c209fac5..beff5a2c9 100644 --- a/test/test_api/utils.py +++ b/test/test_api/utils.py @@ -2,7 +2,7 @@ from smac.runhistory.runhistory import DataOrigin, RunHistory, RunKey, RunValue, StatusType -from autoPyTorch.constants import REGRESSION_TASKS, CLASSIFICATION_TASKS, FORECASTING_TASKS +from autoPyTorch.constants import REGRESSION_TASKS, FORECASTING_TASKS from autoPyTorch.evaluation.abstract_evaluator import ( DummyClassificationPipeline, DummyRegressionPipeline, diff --git a/test/test_data/test_time_series_feature_validator.py b/test/test_data/test_time_series_feature_validator.py index 6513ff41f..ce219a310 100644 --- a/test/test_data/test_time_series_feature_validator.py +++ b/test/test_data/test_time_series_feature_validator.py @@ -1,16 +1,9 
@@ -import copy -import functools - import numpy as np import pandas as pd import pytest -from scipy import sparse - -import sklearn.datasets -import sklearn.model_selection from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator diff --git a/test/test_datasets/test_resampling_strategies.py b/test/test_datasets/test_resampling_strategies.py index 2228db0dd..1f046afb5 100644 --- a/test/test_datasets/test_resampling_strategies.py +++ b/test/test_datasets/test_resampling_strategies.py @@ -121,4 +121,3 @@ def eval_ts_sea_cv(num_splits, n_prediction_steps, n_repeats, freq_value): # We cannot do a split, thus the two splits are the same assert np.all(sp2[1][1] == sp2[0][1]) - diff --git a/test/test_datasets/test_time_series_datasets.py b/test/test_datasets/test_time_series_datasets.py index f130a9c23..4b1a6bea9 100644 --- a/test/test_datasets/test_time_series_datasets.py +++ b/test/test_datasets/test_time_series_datasets.py @@ -238,9 +238,9 @@ def test_exception(self): @pytest.mark.parametrize("fit_dictionary_forecasting", ['uni_variant_wo_missing', - 'uni_variant_w_missing', - 'multi_variant_wo_missing', - 'uni_variant_w_missing'], indirect=True) + 'uni_variant_w_missing', + 'multi_variant_wo_missing', + 'uni_variant_w_missing'], indirect=True) def test_dataset_properties(backend, fit_dictionary_forecasting): # The fixture creates a datamanager by itself datamanager: TimeSeriesForecastingDataset = backend.load_datamanager() @@ -298,7 +298,7 @@ def test_target_normalization(): assert np.allclose(dataset.y_mean.values, np.vstack([np.mean(y) for y in Y])) assert np.allclose(dataset.y_std.values, np.vstack([np.std(y, ddof=1) for y in Y])) assert np.allclose(dataset.train_tensors[1].values.flatten(), - np.hstack([(y - np.mean(y))/np.std(y, ddof=1) for y in Y])) + np.hstack([(y - np.mean(y)) / np.std(y, ddof=1) for y in Y])) @pytest.mark.parametrize("fit_dictionary_forecasting", ['uni_variant_wo_missing'], indirect=True) diff --git a/test/test_evaluation/evaluation_util.py b/test/test_evaluation/evaluation_util.py index afbe7648f..c44782d06 100644 --- a/test/test_evaluation/evaluation_util.py +++ b/test/test_evaluation/evaluation_util.py @@ -1,7 +1,6 @@ import functools import traceback import unittest -import pytest import numpy as np from numpy.linalg import LinAlgError @@ -279,4 +278,3 @@ def get_dataset_getters(): get_abalone_datamanager, get_regression_datamanager, get_forecasting_dataset] - diff --git a/test/test_evaluation/test_forecasting_evaluators.py b/test/test_evaluation/test_forecasting_evaluators.py index 44b39e04b..ba25c0740 100644 --- a/test/test_evaluation/test_forecasting_evaluators.py +++ b/test/test_evaluation/test_forecasting_evaluators.py @@ -244,7 +244,6 @@ def test_finish_up(self, pipeline_mock, queue_mock): configuration = unittest.mock.Mock(spec=Configuration) backend_api = create(self.tmp_dir, self.output_dir, prefix='autoPyTorch') backend_api.load_datamanager = lambda: D - queue_ = multiprocessing.Queue() ae = TimeSeriesForecastingTrainEvaluator(backend_api, queue_mock, diff --git a/test/test_pipeline/components/preprocessing/forecasting/base.py b/test/test_pipeline/components/preprocessing/forecasting/base.py index 78e115959..dd7936c98 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/base.py +++ b/test/test_pipeline/components/preprocessing/forecasting/base.py @@ -46,4 +46,3 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], ]) return steps - diff --git 
a/test/test_pipeline/components/preprocessing/forecasting/test_encoder_choice.py b/test/test_pipeline/components/preprocessing/forecasting/test_encoder_choice.py index 424b8898e..b0015a7fd 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/test_encoder_choice.py +++ b/test/test_pipeline/components/preprocessing/forecasting/test_encoder_choice.py @@ -1,4 +1,3 @@ -import copy import unittest from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding import ( @@ -22,4 +21,4 @@ def test_get_set_config_space(self): if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_encoders.py b/test/test_pipeline/components/preprocessing/forecasting/test_encoders.py index e071c08e2..e5134b890 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/test_encoders.py +++ b/test/test_pipeline/components/preprocessing/forecasting/test_encoders.py @@ -68,8 +68,8 @@ def test_one_hot_encoder_no_unknown(self): idx_cat = 0 for i, fea_name in enumerate(dataset_properties['feature_names']): if i in dataset_properties['categorical_columns']: - self.assertEqual( dataset_properties['feature_shapes'][fea_name], - len(dataset_properties['categories'][idx_cat])) + self.assertEqual(dataset_properties['feature_shapes'][fea_name], + len(dataset_properties['categories'][idx_cat])) idx_cat += 1 else: assert dataset_properties['feature_shapes'][fea_name] == 1 @@ -91,4 +91,4 @@ def test_none_encoder(self): dataset_properties = X['dataset_properties'] for i, fea_name in enumerate(dataset_properties['feature_names']): - self.assertEqual( dataset_properties['feature_shapes'][fea_name], 1) \ No newline at end of file + self.assertEqual(dataset_properties['feature_shapes'][fea_name], 1) diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py b/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py index b4c85ba85..d1bb355ca 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py +++ b/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py @@ -2,7 +2,6 @@ import numpy as np import pandas as pd -from numpy.testing import assert_allclose from sklearn.base import BaseEstimator from sklearn.compose import make_column_transformer @@ -76,8 +75,7 @@ def test_base_and_standard_scaler(self): [0., -0.80178373, 0.33824071], [0., -0.80178373, 1.24021595]]))) - - # second column is static features, those it need to be the mean and std value across all sequences + # second column is static features. 
It needs to be the mean and std value across all sequences scaler.dataset_is_small_preprocess = False scaler.static_features = self.static_features_column scaler = scaler.fit(self.raw_data[0]) diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py b/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py index 386258c22..842c994ea 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py +++ b/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from scipy.sparse import csr_matrix - from sklearn.compose import ColumnTransformer from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( @@ -14,12 +12,12 @@ @pytest.mark.parametrize("fit_dictionary_forecasting", ['uni_variant_wo_missing', - 'uni_variant_w_missing', - 'multi_variant_wo_missing', - 'multi_variant_w_missing', - 'multi_variant_w_missing_only_cat', - 'multi_variant_w_missing_only_num', - ], indirect=True) + 'uni_variant_w_missing', + 'multi_variant_wo_missing', + 'multi_variant_w_missing', + 'multi_variant_w_missing_only_cat', + 'multi_variant_w_missing_only_num', + ], indirect=True) def test_time_series_preprocess(fit_dictionary_forecasting): pipeline = ForecastingPipeline(dataset_properties=fit_dictionary_forecasting['dataset_properties']) pipeline = pipeline.fit(fit_dictionary_forecasting) @@ -56,7 +54,9 @@ def test_time_series_preprocess(fit_dictionary_forecasting): # Make sure no columns are unintentionally dropped after preprocessing if len(fit_dictionary_forecasting['dataset_properties']["numerical_columns"]) == 0: - categorical_pipeline = time_series_feature_transformer.preprocessor.named_transformers_['categorical_pipeline'] + categorical_pipeline = time_series_feature_transformer.preprocessor.named_transformers_[ + 'categorical_pipeline' + ] categorical_data = categorical_pipeline.transform(X['X_train']) assert features.shape[1] == categorical_data.shape[1] elif len(fit_dictionary_forecasting['dataset_properties']["categorical_columns"]) == 0: diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py index f77877f74..237b77c3f 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py @@ -167,7 +167,6 @@ def test_encoder_choices(self): cs_seq = encoder_choices.get_hyperparameter_search_space(dataset_properties) self.assertListEqual(list(cs_seq.get_hyperparameter('__choice__').choices), ['seq_encoder']) - encoder_choices = ForecastingNetworkChoice(dataset_properties) update_rnn_decoder_type = HyperparameterSearchSpaceUpdate( node_name="network_backbone", diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py index 6f6935f64..2e8ef547e 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py @@ -9,7 +9,6 @@ ) from 
autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetStandardScaler import TargetStandardScaler -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetNoScaler import TargetNoScaler from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone import ForecastingNetworkChoice @@ -100,8 +99,8 @@ def test_network_forward(self, return if network_type == 'NBEATSNet': # NBEATS only needs one pass - if not (embedding == 'NoEmbedding' and net_output_type == 'regression' and - not variable_selection and not with_static_features and network_encoder == 'RNNEncoder' + if not (embedding == 'NoEmbedding' and net_output_type == 'regression' + and not variable_selection and not with_static_features and network_encoder == 'RNNEncoder' and not uni_variant_data): return if uni_variant_data: @@ -125,9 +124,10 @@ def test_network_forward(self, fit_dictionary['target_scaler'] = TargetStandardScaler().fit(fit_dictionary) if net_output_type.startswith("distribution"): - fit_dictionary['dist_forecasting_strategy'] = DisForecastingStrategy(list(ALL_DISTRIBUTIONS.keys())[0], - forecast_strategy= - net_output_type.split("_")[1]) + fit_dictionary['dist_forecasting_strategy'] = DisForecastingStrategy( + list(ALL_DISTRIBUTIONS.keys())[0], + forecast_strategy=net_output_type.split("_")[1] + ) net_output_type = net_output_type.split("_")[0] elif net_output_type == 'quantile': fit_dictionary['quantile_values'] = [0.5, 0.1, 0.9] diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py index e2ba3b615..496a88829 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py @@ -60,7 +60,7 @@ def test_config_space(self): self.assertTrue('network_decoder' in fit_dict) self.assertEqual(len(fit_dict['network_decoder']), num_blocks) - #test error: + # test error: dataset_properties = copy.copy(self.dataset_properties) dataset_properties.update({'feature_shapes': {}, 'feature_names': tuple(), diff --git a/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py b/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py index 898891d00..3998f518a 100644 --- a/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py +++ b/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py @@ -200,7 +200,7 @@ def test_target_standard_scalar(self): self.assertTrue(torch.equal(transformed_past_target[2], torch.zeros([10, 1]))) self.assertTrue(torch.equal(transformed_future_targets[0], torch.ones([10, 1]) * 10)) - self.assertTrue(torch.allclose(transformed_future_targets[1], torch.ones([10, 1]) * 12.0618, atol=1e-4)) + self.assertTrue(torch.allclose(transformed_future_targets[1], torch.ones([10, 1]) * 12.0618, atol=1e-4)) self.assertTrue(torch.equal(transformed_future_targets[2], torch.ones([10, 1]) * 6.)) self.assertTrue( diff --git a/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py b/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py index 3fc1f491a..664f1e802 100644 --- a/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py +++ 
b/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py @@ -96,4 +96,3 @@ def test_regression_loss(self): train_loss = fit_dictionary['loss'] self.assertEqual(train_loss, loss_type) - diff --git a/test/test_pipeline/components/training/base.py b/test/test_pipeline/components/training/base.py index 63ca438dc..71f871765 100644 --- a/test/test_pipeline/components/training/base.py +++ b/test/test_pipeline/components/training/base.py @@ -1,4 +1,3 @@ -from typing import Any, Optional, Dict, List, Tuple import logging from sklearn.datasets import make_classification, make_regression diff --git a/test/test_pipeline/test_metrics.py b/test/test_pipeline/test_metrics.py index b3b3cd386..0772239c5 100644 --- a/test/test_pipeline/test_metrics.py +++ b/test/test_pipeline/test_metrics.py @@ -18,8 +18,7 @@ from autoPyTorch.metrics import (accuracy, balanced_accuracy, mean_squared_error, - compute_mase_coefficient, - median_MAPE_forecasting) + compute_mase_coefficient) from autoPyTorch.pipeline.components.training.metrics.base import ( _PredictMetric, _ThresholdMetric, @@ -56,7 +55,7 @@ def test_get_no_name_regression(output_type): @pytest.mark.parametrize('output_type', ['continuous', 'continuous-multioutput']) -def test_get_no_name_regression(output_type): +def test_get_no_name_forecasting(output_type): dataset_properties = {'task_type': 'time_series_forecasting', 'output_type': output_type} metrics = get_metrics(dataset_properties) @@ -216,7 +215,9 @@ def test_forecastingcomputation(): n_prediction_steps = 5 n_targets = 2 - y_true = np.expand_dims([np.arange(n_prediction_steps) + i * 10 for i in range(n_seq)], -1).repeat(n_targets, axis=-1) + y_true = np.expand_dims( + [np.arange(n_prediction_steps) + i * 10 for i in range(n_seq)], -1 + ).repeat(n_targets, axis=-1) y_pred = y_true + 1 score_mean = scorer_mean(y_true=y_true, y_pred=y_pred, sp=1, n_prediction_steps=n_prediction_steps) score_median = scorer_median(y_true=y_true, y_pred=y_pred, sp=1, n_prediction_steps=n_prediction_steps) @@ -267,7 +268,6 @@ def test_sign_flip(): assert score == pytest.approx(-1.0) - def test_classification_only_metric(): y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) y_pred = \ From a1c79300fb2813165c8e4dff64c8c3f894c64e6e Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 24 May 2022 22:10:43 +0200 Subject: [PATCH 288/347] mypy --- .../setup/network_backbone/MLPBackbone.py | 2 +- .../setup/network_backbone/__init__.py | 2 +- .../forecasting_backbone/__init__.py | 6 +-- .../forecasting_backbone/cells.py | 45 +++++++++--------- .../forecasting_backbone/components_util.py | 12 ++--- .../forecasting_decoder/MLPDecoder.py | 6 +-- .../forecasting_decoder/NBEATSDecoder.py | 13 +++-- .../forecasting_decoder/RNNDecoder.py | 7 +-- .../forecasting_decoder/TransformerDecoder.py | 14 ++++-- .../base_forecasting_decoder.py | 10 ++-- .../forecasting_decoder/components.py | 5 +- .../forecasting_encoder/__init__.py | 21 +++++---- .../base_forecasting_encoder.py | 6 ++- .../forecasting_encoder/components.py | 2 +- .../flat_encoder/MLPEncoder.py | 9 ++-- .../flat_encoder/NBEATSEncoder.py | 8 ++-- .../seq_encoder/InceptionTimeEncoder.py | 6 +-- .../seq_encoder/RNNEncoder.py | 6 ++- .../seq_encoder/TCNEncoder.py | 6 +-- .../seq_encoder/TransformerEncoder.py | 16 +++---- .../seq_encoder/__init__.py | 33 ++++++------- .../other_components/TemporalFusion.py | 5 +- .../forecasting_network_head/NBEATS_head.py | 13 +++-- .../forecasting_network_head/distribution.py | 47 ++++++++++--------- .../forecasting_head.py | 
37 ++++++++------- .../pipeline/components/training/losses.py | 4 +- 26 files changed, 185 insertions(+), 156 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py index 46f3f913d..f2ed459c3 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/MLPBackbone.py @@ -31,7 +31,7 @@ def build_backbone(self, input_shape: Tuple[int, ...]) -> nn.Module: in_features = input_shape[0] return self._build_backbone(in_features) - def _build_backbone(self, in_features: int, ): + def _build_backbone(self, in_features: int, ) -> nn.Module: layers: List[nn.Module] = list() self._add_layer(layers, in_features, self.config['num_units_1'], 1) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py index 4a9e360fe..59f7aed60 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py @@ -188,7 +188,7 @@ def get_hyperparameter_search_space( return cs @property - def _defaults_network(self): + def _defaults_network(self) -> List[str]: return [ 'ShapedMLPBackbone', 'MLPBackbone', diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index 178408663..20fce642e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -30,8 +30,8 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None ): super().__init__(dataset_properties, random_state) - self.include_components = {} - self.exclude_components = {} + self.include_components: Dict[str, List[str]] = {} + self.exclude_components: Dict[str, List[str]] = {} self.default_components = OrderedDict( {"flat_encoder": FlatForecastingEncoderChoice(dataset_properties=self.dataset_properties, @@ -285,7 +285,7 @@ def _apply_search_space_update(self, hyperparameter_search_space_update: Hyperpa self.get_components()[sub_module_name]._apply_search_space_update(update_sub_module) @property - def _defaults_network(self): + def _defaults_network(self) -> List[str]: return ['flat_network', 'seq_network'] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 00955ecc9..21f9c6c2e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, List, Tuple, Union +from typing import Any, Dict, Optional, List, Tuple, Union, Set import torch from torch import nn @@ -95,14 +95,14 @@ def forward(self, decoder_output: torch.Tensor, past_observed_targets: torch.BoolTensor, decoder_length: int, - static_embedding: Optional[torch.Tensor] = None): + static_embedding: Optional[torch.Tensor] = None) -> torch.Tensor: """ Args: encoder_output: the output of the last layer of encoder network decoder_output: the output of the last layer of decoder network past_observed_targets: observed values in the past decoder_length: 
length of decoder network - static_embedding: output of static variable selection network (if applible) + static_embedding: output of static variable selection network (if available) """ if self.decoder_proj_layer is not None: @@ -147,15 +147,15 @@ def forward(self, return output @property - def device(self): + def device(self) -> torch.device: return self._device @device.setter - def device(self, device: torch.device): + def device(self, device: torch.device) -> None: self.to(device) self._device = device - def get_attention_mask(self, past_observed_targets: torch.BoolTensor, decoder_length: int): + def get_attention_mask(self, past_observed_targets: torch.BoolTensor, decoder_length: int) -> torch.Tensor: """ https://github.com/jdb78/pytorch-forecasting/blob/master/pytorch_forecasting/models/ temporal_fusion_transformer/__init__.py @@ -193,11 +193,11 @@ def __init__(self, dataset_properties: Dict[str, Any], network_encoder: Dict[str, EncoderBlockInfo], auto_regressive: bool = False, - feature_names: Tuple[str] = (), - known_future_features: Tuple[str] = tuple(), + feature_names: Union[Tuple[str], Tuple[()]] = (), + known_future_features: Union[Tuple[str], Tuple[()]] = (), feature_shapes: Dict[str, int] = {}, - static_features: Tuple[Union[str, int]] = (), - time_feature_names: Tuple[str] = (), + static_features: Union[Tuple[Union[str, int]], Tuple[()]] = (), + time_feature_names: Union[Tuple[str], Tuple[()]] = (), ): """ Variable Selector. This models follows the implementation from @@ -232,13 +232,13 @@ def __init__(self, idx_tracker = 0 idx_tracker_future = 0 - static_features = set(static_features) + static_features = set(static_features) # type: ignore[assignment] static_features_input_size = {} # static_features should always be known beforehand - known_future_features = tuple(known_future_features) - feature_names = tuple(feature_names) - time_feature_names = tuple(time_feature_names) + known_future_features = tuple(known_future_features) # type: ignore[assignment] + feature_names = tuple(feature_names) # type: ignore[assignment] + time_feature_names = tuple(time_feature_names) # type: ignore[assignment] if feature_names: for name in feature_names: @@ -273,7 +273,7 @@ def __init__(self, placeholder_features = 'placeholder_features' i = 0 - self.placeholder_features = [] + self.placeholder_features: List[str] = [] while placeholder_features in feature_names or placeholder_features in self.placeholder_features: i += 1 placeholder_features = f'placeholder_features_{i}' @@ -287,8 +287,8 @@ def __init__(self, decoder_input_sizes[name] = self.hidden_size self.placeholder_features.append(placeholder_features) - feature_names = time_feature_names + feature_names - known_future_features = time_feature_names + known_future_features + feature_names = time_feature_names + feature_names # type: ignore[assignment] + known_future_features = time_feature_names + known_future_features # type: ignore[assignment] self.feature_names = feature_names self.feature_names2tensor_idx = feature_names2tensor_idx @@ -381,11 +381,11 @@ def __init__(self, self.cached_static_embedding = None @property - def device(self): + def device(self) -> torch.device: return self._device @device.setter - def device(self, device: torch.device): + def device(self, device: torch.device) -> None: self.to(device) self._device = device @@ -398,7 +398,7 @@ def forward(self, batch_size: int = 0, cache_static_contex: bool = False, use_cached_static_contex: bool = False, - ): + ) -> Tuple[Optional[torch.Tensor], 
Optional[torch.Tensor], torch.Tensor, Optional[torch.Tensor]]: if x_past is None and x_future is None: raise ValueError('Either past input or future inputs need to be given!') if length_past == 0 and length_future == 0: @@ -409,6 +409,7 @@ def forward(self, if len(self.static_input_sizes) > 0: static_embedding, _ = self.static_variable_selection(x_static) else: + assert x_future is not None and x_past is not None model_dtype = next(iter(x_past.values())).dtype if length_past > 0 else next( iter(x_future.values())).dtype static_embedding = torch.zeros( @@ -585,7 +586,7 @@ def __init__(self, ): super().__init__() self.num_blocks = network_structure.num_blocks - self.first_block = None + self.first_block = -1 self.skip_connection = network_structure.skip_connection self.decoder_has_hidden_states = [] @@ -593,7 +594,7 @@ def __init__(self, for i in range(1, self.num_blocks + 1): block_id = f'block_{i}' if block_id in decoder_info: - self.first_block = i if self.first_block is None else self.first_block + self.first_block = i if self.first_block == -1 else self.first_block decoder[block_id] = decoder_info[block_id].decoder if decoder_info[block_id].decoder_properties.has_hidden_states: self.decoder_has_hidden_states.append(True) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py index d4482910c..671aecabe 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py @@ -37,7 +37,7 @@ def __init__(self, self.skip_connection = skip_connection self.skip_connection_type = skip_connection_type self.grn_dropout_rate = grn_dropout_rate - self.network_structure = None + self.network_structure: Optional[NetworkStructure] = None def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.network_structure = NetworkStructure(num_blocks=self.num_blocks, @@ -68,14 +68,14 @@ def __init__(self, input_size: int, skip_size: int): self.fc = nn.Linear(skip_size, input_size) self.norm = nn.LayerNorm(input_size) - def forward(self, input: torch.Tensor, skip: torch.Tensor): + def forward(self, input: torch.Tensor, skip: torch.Tensor) -> torch.Tensor: if hasattr(self, 'fc'): return self.norm(input + self.fc(skip)) else: return self.norm(input) -def build_transformer_layers(d_model: int, config: Dict[str, Any], layer_type='encoder'): +def build_transformer_layers(d_model: int, config: Dict[str, Any], layer_type: str = 'encoder') -> nn.Module: nhead = 2 ** config['n_head_log'] dim_feedforward = 2 ** config['d_feed_forward_log'] dropout = config.get('dropout', 0.0) @@ -114,7 +114,7 @@ class PositionalEncoding(nn.Module): >>> pos_encoder = PositionalEncoding(d_model) """ - def __init__(self, d_model, dropout=0.1, max_len=5000): + def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000): super(PositionalEncoding, self).__init__() self.dropout = nn.Dropout(p=dropout) @@ -126,7 +126,7 @@ def __init__(self, d_model, dropout=0.1, max_len=5000): pe = pe.unsqueeze(0) self.register_buffer('pe', pe) - def forward(self, x, pos_idx: Optional[Tuple[int]] = None): + def forward(self, x: torch.Tensor, pos_idx: Optional[Tuple[int]] = None) -> torch.Tensor: r"""Inputs of forward function Args: x: the sequence fed to the positional encoder model (required). 
@@ -141,5 +141,5 @@ def forward(self, x, pos_idx: Optional[Tuple[int]] = None): if pos_idx is None: x = x + self.pe[:, :x.size(1), :] else: - x = x + self.pe[:, pos_idx[0]: pos_idx[1], :] + x = x + self.pe[:, pos_idx[0]: pos_idx[1], :] # type: ignore return self.dropout(x) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index be3493014..7c9bbb131 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional, Tuple, Union, Any +from typing import Dict, Optional, Tuple, Union, Any, List import numpy as np import torch @@ -31,7 +31,7 @@ def __init__(self, self.auto_regressive = auto_regressive def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor, - pos_idx: Optional[Tuple[int]] = None): + pos_idx: Optional[Tuple[int]] = None) ->torch.Tensor: if not self.auto_regressive: if len(encoder_output.shape) == 3: encoder_output = encoder_output.squeeze(1) @@ -103,7 +103,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: return super().transform(X) @property - def fitted_encoder(self): + def fitted_encoder(self) -> List[str]: return ['RNNEncoder', 'TCNEncoder', 'MLEncoder', 'NBEATSEncoder'] @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 5331c8994..fe6026fbb 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -57,7 +57,7 @@ def __init__(self, self.backcast_head = None self.forecast_head = None - def build_backbone(self): + def build_backbone(self) -> List[nn.Module]: layers: List[nn.Module] = list() n_in_features = self.n_in_features for _ in range(self.num_layers): @@ -76,7 +76,7 @@ def _add_layer(self, layers: List[nn.Module], in_features: int) -> None: layers.append(nn.Dropout(self.dropout_rate)) def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor, - pos_idx: Optional[Tuple[int]] = None): + pos_idx: Optional[Tuple[int]] = None) -> Union[torch.Module, Tuple[torch.Module, torch.Module]]: if self.backcast_head is None and self.forecast_head is None: # used to compute head dimensions return self.backbone(encoder_output) @@ -91,7 +91,6 @@ class NBEATSDecoder(BaseForecastingDecoder): _fixed_seq_length = True window_size = 1 fill_lower_resolution_seq = False - fill_kwargs = {} @staticmethod def decoder_properties() -> DecoderProperties: @@ -101,11 +100,11 @@ def _build_decoder(self, encoder_output_shape: Tuple[int, ...], future_variable_input: Tuple[int, ...], n_prediction_heads: int, - dataset_properties: Dict) -> Tuple[nn.Module, int]: + dataset_properties: Dict) -> Tuple[List[List[NBEATSBLock]], int]: in_features = encoder_output_shape[-1] n_beats_type = self.config['n_beats_type'] if n_beats_type == 'G': - stacks = [[] for _ in range(self.config['num_stacks_g'])] + stacks: List[List[NBEATSBLock]] = [[] for _ in 
range(self.config['num_stacks_g'])] for stack_idx in range(1, self.config['num_stacks_g'] + 1): for block_idx in range(self.config['num_blocks_g']): if self.config['weight_sharing_g'] and block_idx > 0: @@ -127,7 +126,7 @@ def _build_decoder(self, )) elif n_beats_type == 'I': - stacks = [[] for _ in range(self.config['num_stacks_i'])] + stacks: List[List[NBEATSBLock]] = [[] for _ in range(self.config['num_stacks_i'])] # type:ignore for stack_idx in range(1, self.config['num_stacks_i'] + 1): for block_idx in range(self.config['num_blocks_i_%d' % stack_idx]): if self.config['weight_sharing_i_%d' % stack_idx] and block_idx > 0: @@ -173,7 +172,7 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT } @property - def fitted_encoder(self): + def fitted_encoder(self) -> List[str]: return ['NBEATSEncoder'] def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index 46644663f..f63edaa64 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -57,10 +57,10 @@ class ForecastingRNNDecoder(BaseForecastingDecoder): Standard searchable RNN decoder for time series data, only works when the encoder is """ - def __init__(self, **kwargs: Dict): + def __init__(self, **kwargs: Any): super().__init__(**kwargs) # RNN is naturally auto-regressive. However, we will not consider it as a decoder for deep AR model - self.rnn_kwargs = None + self.rnn_kwargs:Optional[Dict] = None self.lagged_value = [1, 2, 3, 4, 5, 6, 7] @property @@ -74,6 +74,7 @@ def _build_decoder(self, future_variable_input: Tuple[int, ...], n_prediction_heads: int, dataset_properties: Dict) -> Tuple[nn.Module, int]: + assert self.rnn_kwargs is not None # RNN decoder only allows RNN encoder, these parameters need to exists. 
hidden_size = self.rnn_kwargs['hidden_size'] num_layers = self.rnn_kwargs['num_layers'] @@ -89,7 +90,7 @@ def _build_decoder(self, return decoder, hidden_size @property - def fitted_encoder(self): + def fitted_encoder(self) -> List[str]: return ['RNNEncoder'] @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index 404a8312c..9654e5071 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -64,7 +64,10 @@ def __init__(self, norm=norm) self.tgt_mask = nn.Transformer.generate_square_subsequent_mask(n_prediction_steps) - def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor, pos_idx: Optional[Tuple[int]] = None): + def forward(self, + x_future: torch.Tensor, + encoder_output: torch.Tensor, + pos_idx: Optional[Tuple[int]] = None) -> torch.Tensor: output = self.input_layer(x_future) if self.use_positional_decoder: output = self.pos_encoding(output, pos_idx) @@ -77,10 +80,10 @@ def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor, pos_idx: class ForecastingTransformerDecoder(BaseForecastingDecoder): - def __init__(self, **kwargs: Dict): + def __init__(self, **kwargs: Any): super().__init__(**kwargs) # RNN is naturally auto-regressive. However, we will not consider it as a decoder for deep AR model - self.transformer_encoder_kwargs = None + self.transformer_encoder_kwargs: Optional[dict] = None self.lagged_value = [1, 2, 3, 4, 5, 6, 7] def _build_decoder(self, @@ -88,6 +91,7 @@ def _build_decoder(self, future_variable_input: Tuple[int, ...], n_prediction_heads: int, dataset_properties: Dict) -> Tuple[nn.Module, int]: + assert self.transformer_encoder_kwargs is not None d_model = 2 ** self.transformer_encoder_kwargs['d_model_log'] transformer_decoder_layers = build_transformer_layers(d_model=d_model, config=self.config, layer_type='decoder') n_prediction_steps = dataset_properties['n_prediction_steps'] @@ -113,7 +117,7 @@ def _required_fit_requirements(self) -> List[FitRequirement]: return fit_requirement @staticmethod - def decoder_properties(): + def decoder_properties() -> DecoderProperties: return DecoderProperties(recurrent=True, lagged_input=True, mask_on_future_target=True) @@ -136,7 +140,7 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT } @property - def fitted_encoder(self): + def fitted_encoder(self) -> List[str]: return ['TransformerEncoder'] @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 932bd4f98..9f25ea2dc 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -22,17 +22,17 @@ class BaseForecastingDecoder(autoPyTorchComponent): def __init__(self, block_number: int = 1, auto_regressive: bool = False, - **kwargs: Dict[str, Any]): + **kwargs: Any): 
super().__init__() self.block_number = block_number self.add_fit_requirements(self._required_fit_requirements) self.auto_regressive = auto_regressive self.config = kwargs self.decoder: Optional[nn.Module] = None - self.n_decoder_output_features = None - self.decoder_input_shape = None + self.n_decoder_output_features: Optional[int] = None + self.decoder_input_shape: Optional[Tuple[int, ...]] = None self.n_prediction_heads = 1 - self.is_last_decoder = False + self.is_last_decoder: Optional[bool] = False @property def _required_fit_requirements(self) -> List[FitRequirement]: @@ -132,6 +132,8 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: # 'n_prediction_heads' and 'n_decoder_output_features' are only applied to the head such that they could be # overwritten by the following decoders network_decoder = X.get('network_decoder', OrderedDict()) + assert self.decoder_input_shape is not None + assert self.n_decoder_output_features is not None network_decoder[f'block_{self.block_number}'] = DecoderBlockInfo( decoder=self.decoder, decoder_properties=self.decoder_properties(), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py index 28f924595..33ed93f69 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py @@ -21,7 +21,9 @@ class DecoderBlockInfo(NamedTuple): class DecoderNetwork(nn.Module): - def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor, pos_idx: Optional[Tuple[int]] = None): + def forward(self, x_future: torch.Tensor, + encoder_output: torch.Tensor, + pos_idx: Optional[Tuple[int]] = None) -> torch.Tensor: """ Base forecasting Decoder Network, its output needs to be a 3-d Tensor: @@ -29,6 +31,7 @@ def forward(self, x_future: torch.Tensor, encoder_output: torch.Tensor, pos_idx: Args: x_future: torch.Tensor(B, L_future, N_out), the future features encoder_output: torch.Tensor(B, L_encoder, N), output of the encoder network, or the hidden states + pos_idx: positional index, indicating the position of the forecasted tensor, used for transformer Returns: net_output: torch.Tensor with shape either (B, L_future, N) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index fb73c88c5..8b9cff22b 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -1,7 +1,7 @@ import os import warnings from collections import OrderedDict -from typing import Dict, Optional, List, Any, Type +from typing import Dict, Optional, List, Any, Type, Callable from abc import abstractmethod from sklearn.pipeline import Pipeline @@ -25,6 +25,8 @@ ) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import \ decoders, decoder_addons +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ + base_forecasting_decoder import BaseForecastingDecoder directory = os.path.split(__file__)[0] 
_encoders = find_components(__package__, @@ -41,12 +43,11 @@ class AbstractForecastingEncoderChoice(autoPyTorchChoice): """ def __init__(self, - - **kwargs, + **kwargs: Any, ): super().__init__(**kwargs) self.pipeline = None - self.decoder_choice = None + self.decoder_choice: Optional[List[BaseForecastingDecoder]] = None @abstractmethod def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: @@ -61,14 +62,14 @@ def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: """ raise NotImplementedError - def get_decoder_components(self) -> Dict[str, autoPyTorchComponent]: + def get_decoder_components(self) -> Dict[str, Type[autoPyTorchComponent]]: components = OrderedDict() components.update(decoders) components.update(decoder_addons.components) return components @property - def additional_components(self): + def additional_components(self) -> List[Callable]: # This function is deigned to add additional components rather than the components in __choice__ return [self.get_decoder_components] @@ -209,8 +210,8 @@ def get_hyperparameter_search_space( ) cs.add_hyperparameter(hp_encoder) - decoder2encoder = {key: [] for key in available_decoders.keys()} - encoder2decoder = {} + decoder2encoder: Dict[str, List[str]] = {key: [] for key in available_decoders.keys()} + encoder2decoder: Dict[str, List[str]] = {} for encoder_name in hp_encoder.choices: updates = self._get_search_space_updates(prefix=encoder_name) config_space = available_encoders[encoder_name].get_hyperparameter_search_space(dataset_properties, @@ -351,7 +352,7 @@ def set_hyperparameters(self, self.new_params = new_params self.choice = self.get_components()[choice](**new_params) - self.decoder_choice = decoder_components[decoder_type](**decoder_params) + self.decoder_choice = decoder_components[decoder_type](**decoder_params) # type: ignore[index] self.pipeline = Pipeline([('net_structure', ForecastingNetworkStructure()), ('encoder', self.choice), @@ -359,7 +360,7 @@ def set_hyperparameters(self, return self @property - def _defaults_network(self): + def _defaults_network(self) -> List[str]: return ['MLPEncoder', 'RNNEncoder', 'NBEATSEncoder'] def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchComponent: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index 9fc4451ce..b4c3490b0 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -103,13 +103,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: has_hidden_states = self.encoder_properties().has_hidden_states self.encoder_output_shape = get_output_shape(self.encoder, input_shape, has_hidden_states) - if self.n_encoder_output_feature() != self.encoder_output_shape[-1]: + if self.n_encoder_output_feature() != self.encoder_output_shape[-1]: # type: ignore raise ValueError(f'n_encoder_output_feature ({ self.n_encoder_output_feature()}) ' f'must equal to the output dimension f({self.encoder_output_shape})') return self @staticmethod - def allowed_decoders(): + def allowed_decoders() -> List[str]: raise NotImplementedError @abstractmethod @@ -123,6 +123,8 @@ def n_hidden_states(self) -> int: def transform(self, X: Dict[str, 
Any]) -> Dict[str, Any]: X['dataset_properties'].update({'input_shape': self.input_shape}) network_encoder = X.get('network_encoder', OrderedDict()) + assert self.input_shape is not None + assert self.encoder_output_shape is not None network_encoder[f'block_{self.block_number}'] = EncoderBlockInfo(encoder=self.encoder, encoder_properties=self.encoder_properties(), encoder_input_shape=self.input_shape, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py index 2023a4e96..9fba83862 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py @@ -24,7 +24,7 @@ class EncoderBlockInfo(NamedTuple): class EncoderNetwork(nn.Module): def forward(self, x: torch.Tensor, - output_seq: bool = False): + output_seq: bool = False) -> torch.Tensor: """ Base forecasting network, its output needs to be a 2-d or 3-d Tensor: When the decoder is an auto-regressive model, then it needs to output a 3-d Tensor, in which case, output_seq diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py index c88c0b8d1..20843d8cb 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py @@ -29,13 +29,12 @@ def __init__(self, window_size (int): T fill_lower_resolution_seq: if sequence with lower resolution needs to be filled with 0 (for multi-fidelity problems with resolution as fidelity) - fill_kwargs: filling information """ super().__init__() self.window_size = window_size self.network = network - def forward(self, x: torch.Tensor, output_seq: bool = False): + def forward(self, x: torch.Tensor, output_seq: bool = False) -> torch.Tensor: """ Args: @@ -64,7 +63,7 @@ def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: return x -class MLPEncoder(BaseForecastingEncoder, MLPBackbone): +class MLPEncoder(BaseForecastingEncoder, MLPBackbone): # type:ignore[misc] _fixed_seq_length = True window_size = 1 @@ -73,7 +72,7 @@ def encoder_properties() -> EncoderProperties: return EncoderProperties(bijective_seq_output=False, fixed_input_seq_length=True) @staticmethod - def allowed_decoders(): + def allowed_decoders() -> List[str]: """ decoder that is compatible with the encoder """ @@ -132,7 +131,7 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT } @staticmethod - def get_hyperparameter_search_space( + def get_hyperparameter_search_space( # type: ignore dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_groups: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_groups", value_range=(1, 5), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py index 43b3225b7..9d034674b 100644 
--- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py @@ -27,7 +27,7 @@ def encoder_properties() -> EncoderProperties: return EncoderProperties(fixed_input_seq_length=True) @staticmethod - def allowed_decoders(): + def allowed_decoders() -> List[str]: """ decoder that is compatible with the encoder """ @@ -51,13 +51,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: input_shape=output_shape, ) - self.input_shape = [self.window_size, output_shape[-1]] + self.input_shape = [self.window_size, output_shape[-1]] # type: ignore[assignment] has_hidden_states = self.encoder_properties().has_hidden_states - self.encoder_output_shape = get_output_shape(self.encoder, self.input_shape, has_hidden_states) + self.encoder_output_shape = get_output_shape(self.encoder, self.input_shape, has_hidden_states) # type: ignore return self - def n_encoder_output_feature(self): + def n_encoder_output_feature(self) -> None: # type: ignore # THIS function should never be called!!! raise NotImplementedError diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py index 7a5585fa3..094f26ab8 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, Optional, Tuple, List from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -114,7 +114,7 @@ def __init__(self, n_inputs = block.get_n_outputs() self.receptive_field = receptive_field - def forward(self, x: torch.Tensor, output_seq=False) -> torch.Tensor: + def forward(self, x: torch.Tensor, output_seq: bool = False) -> torch.Tensor: # swap sequence and feature dimensions for use with convolutional nets x = x.transpose(1, 2).contiguous() res = x @@ -151,7 +151,7 @@ def n_encoder_output_feature(self) -> int: return self.config['num_filters'] * 4 @staticmethod - def allowed_decoders(): + def allowed_decoders() -> List[str]: """ decoder that is compatible with the encoder """ diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py index 4779f2283..1fd73507d 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py @@ -80,7 +80,7 @@ class RNNEncoder(BaseForecastingEncoder): """ _fixed_seq_length = False - def __init__(self, **kwargs: Dict): + def __init__(self, **kwargs: Any): super().__init__(**kwargs) self.lagged_value = [1, 2, 3, 4, 5, 6, 7] @@ -100,9 +100,11 @@ def n_hidden_states(self) -> int: return 2 elif self.config['cell_type'] == 'gru': return 1 + else: + 
raise NotImplementedError @staticmethod - def allowed_decoders(): + def allowed_decoders() -> List[str]: """ decoder that is compatible with the encoder """ diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py index 446245208..666b89478 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py @@ -11,7 +11,7 @@ import torch from torch import nn from torch.nn.utils import weight_norm -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. \ base_forecasting_encoder import BaseForecastingEncoder from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( EncoderNetwork @@ -103,7 +103,7 @@ def __init__(self, num_inputs: int, num_channels: List[int], kernel_size: List[i self.receptive_field = receptive_field self.network = nn.Sequential(*layers) - def forward(self, x: torch.Tensor, output_seq=False) -> torch.Tensor: + def forward(self, x: torch.Tensor, output_seq: bool = False) -> torch.Tensor: # swap sequence and feature dimensions for use with convolutional nets x = x.transpose(1, 2).contiguous() x = self.network(x) @@ -144,7 +144,7 @@ def n_encoder_output_feature(self) -> int: return self.config[f"num_filters_{num_blocks}"] @staticmethod - def allowed_decoders(): + def allowed_decoders() -> List[str]: """ decoder that is compatible with the encoder """ diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index fa28c70c2..f1c817411 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -27,7 +27,7 @@ def __init__(self, in_features: int, d_model: int, num_layers: int, - transformer_encoder_layers: [nn.Module], + transformer_encoder_layers: nn.Module, use_positional_encoder: bool, use_layer_norm_output: bool, dropout_pe: float = 0.0, @@ -39,12 +39,12 @@ def __init__(self, else: self.lagged_value = lagged_value if in_features != d_model: - self.input_layer = [nn.Linear(in_features, d_model, bias=False)] + input_layer = [nn.Linear(in_features, d_model, bias=False)] else: - self.input_layer = [] + input_layer = [] if use_positional_encoder: - self.input_layer.append(PositionalEncoding(d_model, dropout_pe)) - self.input_layer = nn.Sequential(*self.input_layer) + input_layer.append(PositionalEncoding(d_model, dropout_pe)) + self.input_layer = nn.Sequential(*input_layer) self.use_layer_norm_output = use_layer_norm_output if use_layer_norm_output: @@ -77,7 +77,7 @@ class TransformerEncoder(BaseForecastingEncoder): """ _fixed_seq_length = False - def __init__(self, **kwargs: Dict): + def __init__(self, **kwargs: Any): super().__init__(**kwargs) 
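# --- Illustrative sketch (not part of the patch) ------------------------------
# Both convolutional encoders above (InceptionTime and TCN) start their
# ``forward`` with ``x.transpose(1, 2)``: batches arrive as [B, T, F] but
# ``nn.Conv1d`` expects channels first, i.e. [B, F, T]. A small self-contained
# example with made-up sizes:
import torch
from torch import nn

x = torch.randn(4, 24, 3)                                   # [B, T, F]
conv = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, padding=1)
out = conv(x.transpose(1, 2).contiguous())                  # -> [B, 8, T]
out = out.transpose(1, 2)                                   # back to [B, T, 8]
assert out.shape == (4, 24, 8)
# ------------------------------------------------------------------------------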
self.lagged_value = [1, 2, 3, 4, 5, 6, 7] @@ -102,14 +102,14 @@ def n_encoder_output_feature(self) -> int: return 2 ** self.config['d_model_log'] @staticmethod - def allowed_decoders(): + def allowed_decoders() -> List[str]: """ decoder that is compatible with the encoder """ return ['MLPDecoder', 'TransformerDecoder'] @staticmethod - def encoder_properties(): + def encoder_properties() -> EncoderProperties: return EncoderProperties(lagged_input=True, causality=False) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index 88355a113..b4ca02302 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -4,6 +4,7 @@ from sklearn.pipeline import Pipeline import inspect +import ConfigSpace as CS from ConfigSpace.hyperparameters import ( Hyperparameter, Constant, @@ -70,7 +71,7 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: components.update(_addons.components) return components - def get_hyperparameter_search_space( + def get_hyperparameter_search_space( # type: ignore self, dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", @@ -227,12 +228,12 @@ def get_hyperparameter_search_space( # Compile a list of legal preprocessors for this problem available_encoders: Dict[str, BaseForecastingEncoder] = self.get_available_components( dataset_properties=dataset_properties, - include=include, exclude=exclude) + include=include, exclude=exclude) # type:ignore[assignment] available_decoders: Dict[str, BaseForecastingDecoder] = self.get_available_components( dataset_properties=dataset_properties, include=None, exclude=exclude, - components=self.get_decoder_components()) + components=self.get_decoder_components()) # type:ignore[assignment] if len(available_encoders) == 0: raise ValueError("No Encoder found") @@ -262,7 +263,7 @@ def get_hyperparameter_search_space( # this is judged by add_forbidden_for_non_ar_recurrent_decoder if True in decoder_auto_regressive.choices: - forbidden_decoder_ar = ForbiddenEqualsClause(decoder_auto_regressive, True) + forbidden_decoder_ar: Optional[ForbiddenEqualsClause] = ForbiddenEqualsClause(decoder_auto_regressive, True) else: forbidden_decoder_ar = None @@ -272,7 +273,7 @@ def get_hyperparameter_search_space( add_forbidden_for_non_ar_recurrent_decoder = True if len(decoder_auto_regressive.choices) == 1 and True in decoder_auto_regressive.choices: - conds_decoder_ar = None + conds_decoder_ar: Optional[List[CS.conditions.ConditionComponent]] = None else: conds_decoder_ar = [] @@ -318,7 +319,7 @@ def get_hyperparameter_search_space( GreaterThanCondition(hp_encoder, num_blocks, i - 1) ) - decoder2encoder = {key: [] for key in available_decoders.keys()} + decoder2encoder: Dict[str, List[str]] = {key: [] for key in available_decoders.keys()} encoder2decoder = {} for encoder_name in hp_encoder.choices: updates = self._get_search_space_updates(prefix=block_prefix + encoder_name) @@ -393,8 +394,8 @@ def get_hyperparameter_search_space( encoders_with_multi_decoder.append(encoder) else: encoder_with_single_decoder.append(encoder) - 
encoders_with_multi_decoder = set(encoders_with_multi_decoder) - encoder_with_single_decoder = set(encoder_with_single_decoder) + encoders_with_multi_decoder = set(encoders_with_multi_decoder) # type:ignore[assignment] + encoder_with_single_decoder = set(encoder_with_single_decoder) # type:ignore[assignment] cs.add_configuration_space( block_prefix + decoder_name, @@ -431,7 +432,7 @@ def get_hyperparameter_search_space( cs.add_conditions(conditions_to_add) if conds_decoder_ar is not None or forbidden_decoder_ar is not None: - forbiddens_ar_non_recurrent = [] + forbiddens_ar_non_recurrent: List[CS.forbidden.AbstractForbiddenClause] = [] for encoder in hp_encoder.choices: if len(encoder2decoder[encoder]) == 1: if available_decoders[encoder2decoder[encoder][0]].decoder_properties().recurrent: @@ -510,7 +511,7 @@ def get_hyperparameter_search_space( ) for encoder_name, encoder in available_encoders.items(): - encoder_is_casual = encoder.encoder_properties() + encoder_is_casual = encoder.encoder_properties() # type: ignore if not encoder_is_casual: # we do not allow non-casual encoder to appear in the lower layer of the network. e.g, if we have an # encoder with 3 blocks, then non_casual encoder is only allowed to appear in the third layer @@ -529,9 +530,9 @@ def get_hyperparameter_search_space( cs.add_forbidden_clauses(forbiddens_decoder_auto_regressive) if self.deepAR_decoder_name in available_decoders: - deep_ar_hp = ':'.join([self.deepAR_decoder_prefix, self.deepAR_decoder_name, 'auto_regressive']) - if deep_ar_hp in cs: - deep_ar_hp = cs.get_hyperparameter(deep_ar_hp) + deep_ar_hp_name = ':'.join([self.deepAR_decoder_prefix, self.deepAR_decoder_name, 'auto_regressive']) + if deep_ar_hp_name in cs: + deep_ar_hp = cs.get_hyperparameter(deep_ar_hp_name) if True in deep_ar_hp.choices: forbidden_deep_ar = ForbiddenEqualsClause(deep_ar_hp, True) if min_num_blocks == 1: @@ -594,7 +595,7 @@ def get_hyperparameter_search_space( return cs @property - def _defaults_network(self): + def _defaults_network(self) -> List[str]: return ['RNNEncoder', 'NBEATSEncoder'] def set_hyperparameters(self, @@ -632,7 +633,7 @@ def set_hyperparameters(self, pipeline_steps = [('net_structure', ForecastingNetworkStructure(**forecasting_structure_kwargs))] self.encoder_choice = [] - self.decoder_choice = [] + self.decoder_choice: List[BaseForecastingEncoder] = [] decoder_components = self.get_decoder_components() @@ -681,7 +682,7 @@ def set_hyperparameters(self, if 'auto_regressive' not in decoder_params: decoder_params['auto_regressive'] = decoder_auto_regressive encoder = self.get_components()[choice](**new_params) - decoder = decoder_components[decoder_type](**decoder_params) + decoder = decoder_components[decoder_type](**decoder_params) # type:ignore pipeline_steps.extend([(f'encoder_{i}', encoder), (f'decoder_{i}', decoder)]) self.encoder_choice.append(encoder) self.decoder_choice.append(decoder) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py index afbfd4c88..0e1436165 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py @@ -1,4 +1,5 @@ import numpy as np +import torch from ConfigSpace import ConfigurationSpace from ConfigSpace.hyperparameters 
import ( @@ -39,7 +40,7 @@ def __init__(self, self.use_dropout = use_dropout self.dropout_rate = dropout_rate - self.temporal_fusion = None + self.temporal_fusion: Optional[torch.nn.Module] = None self.n_decoder_output_features = 0 @property @@ -51,7 +52,7 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False), ] - def fit(self, X: Dict[str, Any], y: Any = None) -> "autoPyTorchComponent": + def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchComponent: network_structure = X['network_structure'] # type: NetworkStructure self.temporal_fusion = TemporalFusionLayer(window_size=X['window_size'], diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py index 3e21d91db..6fda054df 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py @@ -15,7 +15,7 @@ def __init__(self, weights: torch.Tensor): super().__init__() self.register_buffer('weights', weights) - def forward(self, x: torch.Tensor): + def forward(self, x: torch.Tensor) -> torch.Tensor: return x.mm(self.weights) @@ -34,7 +34,8 @@ def linspace(backcast_length: int, forecast_length: int, centered: bool = False) return b_ls, f_ls -def get_generic_heads(block_width: int, thetas_dim: int, forecast_length: int, backcast_length: int): +def get_generic_heads(block_width: int, thetas_dim: int, + forecast_length: int, backcast_length: int) -> Tuple[nn.Module, nn.Module]: backcast_head = nn.Sequential(nn.Linear(block_width, thetas_dim, bias=False), nn.Linear(thetas_dim, backcast_length, bias=False)) forecast_head = nn.Sequential(nn.Linear(block_width, thetas_dim, bias=False), @@ -42,7 +43,8 @@ def get_generic_heads(block_width: int, thetas_dim: int, forecast_length: int, b return backcast_head, forecast_head -def get_trend_heads(block_width: int, thetas_dim: int, forecast_length: int, backcast_length: int): +def get_trend_heads(block_width: int, thetas_dim: int, + forecast_length: int, backcast_length: int) -> Tuple[nn.Module, nn.Module]: base_layer = nn.Linear(block_width, thetas_dim, bias=False) backcast_linspace, forecast_linspace = linspace(backcast_length, forecast_length, centered=True) @@ -60,12 +62,13 @@ def get_trend_heads(block_width: int, thetas_dim: int, forecast_length: int, bac return backcast_head, forecast_head -def get_seasonality_heads(block_width: int, thetas_dim: int, forecast_length: int, backcast_length: int): +def get_seasonality_heads(block_width: int, thetas_dim: int, + forecast_length: int, backcast_length: int) -> Tuple[nn.Module, nn.Module]: base_layer = nn.Linear(block_width, forecast_length, bias=False) backcast_linspace, forecast_linspace = linspace(backcast_length, forecast_length, centered=False) - def get_frequencies(n): + def get_frequencies(n: int) -> np.ndarray: return np.linspace(0, (backcast_length + forecast_length) / thetas_dim, n) p1, p2 = (forecast_length // 2, forecast_length // 2) if forecast_length % 2 == 0 else \ diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py index 5eab2d1d5..fb601ce39 100644 --- 
a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py @@ -17,7 +17,7 @@ # Additionally, scale information is not presented here to avoid -from typing import Dict, Tuple, NamedTuple +from typing import Dict, Tuple, NamedTuple, Any, Type from abc import abstractmethod @@ -50,22 +50,22 @@ def __init__(self, n_prediction_heads: int, auto_regressive: bool, decoder_has_local_layer: bool, - **kwargs, ): + **kwargs: Any, ): super().__init__(**kwargs) # we consider all the prediction steps holistically. thus, the output of the poj layer is # n_prediction_steps * dim *output_shape - def build_single_proj_layer(arg_dim): + def build_single_proj_layer(arg_dim: int) -> nn.Module: """ build a single proj layer given the input dims, the output is unflattened to fit the required output_shape and n_prediction_steps. we note that output_shape's first dimensions is always n_prediction_steps Args: - arg_dim: dimension of the target distribution + arg_dim (int): dimension of the target distribution Returns: - + proj_layer (nn.Module): projection layer that maps the decoder output to parameterize distributions """ if decoder_has_local_layer: return nn.Sequential(nn.Linear(num_in_features, np.prod(output_shape).item() * arg_dim), @@ -99,12 +99,12 @@ def arg_dims(self) -> Dict[str, int]: raise NotImplementedError @abstractmethod - def domain_map(self, *args: torch.Tensor) -> Tuple[torch.Tensor]: + def domain_map(self, *args: torch.Tensor) -> Tuple[torch.Tensor, ...]: raise NotImplementedError @property @abstractmethod - def dist_cls(self) -> type(Distribution): + def dist_cls(self) -> Type[Distribution]: raise NotImplementedError @@ -113,12 +113,12 @@ class NormalOutput(ProjectionLayer): def arg_dims(self) -> Dict[str, int]: return {"loc": 1, "scale": 1} - def domain_map(self, loc: torch.Tensor, scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + def domain_map(self, loc: torch.Tensor, scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: # type: ignore scale = F.softplus(scale) + 1e-10 return loc.squeeze(-1), scale.squeeze(-1) @property - def dist_cls(self) -> type(Distribution): + def dist_cls(self) -> Type[Distribution]: return Normal @@ -127,14 +127,17 @@ class StudentTOutput(ProjectionLayer): def arg_dims(self) -> Dict[str, int]: return {"df": 1, "loc": 1, "scale": 1} - def domain_map(self, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor) \ - -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def domain_map( # type: ignore[override] + self, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor + ) -> Tuple[torch.Tensor, + torch.Tensor, + torch.Tensor]: scale = F.softplus(scale) + 1e-10 df = 2.0 + F.softplus(df) return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1) @property - def dist_cls(self) -> type(Distribution): + def dist_cls(self) -> Type[Distribution]: return StudentT @@ -145,8 +148,9 @@ class BetaOutput(ProjectionLayer): def arg_dims(self) -> Dict[str, int]: return {"concentration1": 1, "concentration0": 1} - def domain_map(self, concentration1: torch.Tensor, concentration0: torch.Tensor) \ - -> Tuple[torch.Tensor, torch.Tensor]: + def domain_map( # type: ignore[override] + self, concentration1: torch.Tensor, concentration0: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: # TODO we need to adapt epsilon value given the datatype of this module epsilon = 1e-10 concentration1 = F.softplus(concentration1) + epsilon @@ -154,7 
+158,7 @@ def domain_map(self, concentration1: torch.Tensor, concentration0: torch.Tensor) return concentration1.squeeze(-1), concentration0.squeeze(-1) @property - def dist_cls(self) -> type(Distribution): + def dist_cls(self) -> Type[Distribution]: # TODO consider constraints on Beta!!! return Beta @@ -166,8 +170,9 @@ class GammaOutput(ProjectionLayer): def arg_dims(self) -> Dict[str, int]: return {"concentration": 1, "rate": 1} - def domain_map(self, concentration: torch.Tensor, rate: torch.Tensor) \ - -> Tuple[torch.Tensor, torch.Tensor]: + def domain_map( # type: ignore[override] + self, concentration: torch.Tensor, rate: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: # TODO we need to adapt epsilon value given the datatype of this module epsilon = 1e-10 concentration = F.softplus(concentration) + epsilon @@ -175,7 +180,7 @@ def domain_map(self, concentration: torch.Tensor, rate: torch.Tensor) \ return concentration.squeeze(-1), rate.squeeze(-1) @property - def dist_cls(self) -> type(Distribution): + def dist_cls(self) -> Type[Distribution]: return Gamma @@ -184,12 +189,12 @@ class PoissonOutput(ProjectionLayer): def arg_dims(self) -> Dict[str, int]: return {"rate": 1} - def domain_map(self, rate: torch.Tensor) -> Tuple[torch.Tensor]: + def domain_map(self, rate: torch.Tensor) -> Tuple[torch.Tensor]: # type: ignore[override] rate_pos = F.softplus(rate).clone() return rate_pos.squeeze(-1), @property - def dist_cls(self) -> type(Distribution): + def dist_cls(self) -> Type[Distribution]: return Poisson @@ -198,7 +203,7 @@ def dist_cls(self) -> type(Distribution): # 'beta': BetaOutput, # 'gamma': GammaOutput, # 'poisson': PoissonOutput - } # type: Dict[str, ProjectionLayer] + } # type: Dict[str, Type[ProjectionLayer]] class DisForecastingStrategy(NamedTuple): diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 618018ee5..279a3084f 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -40,7 +40,7 @@ def __init__(self, self.add_fit_requirements(self._required_fit_requirements) self.head: Optional[nn.Module] = None - self.output_shape = None + self.output_shape: Optional[Tuple[int]] = None @property def _required_fit_requirements(self) -> List[FitRequirement]: @@ -76,7 +76,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: # if the decoder is a stacked block, we directly build head inside the decoder if net_output_type != 'regression': raise ValueError("decoder with multi block structure only allow regression loss!") - self.output_shape = (X['dataset_properties']['n_prediction_steps'], output_shape[-1]) + self.output_shape = (X['dataset_properties']['n_prediction_steps'], output_shape[-1]) # type: ignore return self num_quantiles = 0 @@ -93,13 +93,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: auto_regressive = X.get('auto_regressive', False) - head_input_shape = X["n_decoder_output_features"] + head_n_in_features: int = X["n_decoder_output_features"] n_prediction_heads = X["n_prediction_heads"] decoder_has_local_layer = X.get('mlp_has_local_layer', True) head_components = self.build_head( - input_shape=head_input_shape, + head_n_in_features=head_n_in_features, output_shape=output_shape, auto_regressive=auto_regressive, 
decoder_has_local_layer=decoder_has_local_layer, @@ -126,6 +126,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: decoder = X['network_decoder'] # NBEATS is a flat encoder, it only has one decoder first_decoder = decoder['block_1'] + assert self.output_shape is not None nbeats_decoder = build_NBEATS_network(first_decoder.decoder, self.output_shape) decoder['block_1'] = DecoderBlockInfo(decoder=nbeats_decoder, decoder_properties=first_decoder.decoder_properties, @@ -156,8 +157,8 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT 'handles_time_series': True, } - def build_head(self, - input_shape: Tuple[int, ...], + def build_head(self, # type: ignore[override] + head_n_in_features: int, output_shape: Tuple[int, ...], auto_regressive: bool = False, decoder_has_local_layer: bool = True, @@ -170,7 +171,7 @@ def build_head(self, Builds the head module and returns it Args: - input_shape (Tuple[int, ...]): shape of the input to the head (usually the shape of the backbone output) + head_n_in_features (int): shape of the input to the head (usually the shape of the backbone output) output_shape (Tuple[int, ...]): shape of the output of the head auto_regressive (bool): if the network is auto-regressive decoder_has_local_layer (bool): if the decoder has local layer @@ -183,7 +184,8 @@ def build_head(self, nn.Module: head module """ if net_output_type == 'distribution': - proj_layer = ALL_DISTRIBUTIONS[dist_cls](num_in_features=input_shape, + assert dist_cls is not None + proj_layer = ALL_DISTRIBUTIONS[dist_cls](num_in_features=head_n_in_features, output_shape=output_shape[1:], n_prediction_heads=n_prediction_heads, auto_regressive=auto_regressive, @@ -192,22 +194,25 @@ def build_head(self, return proj_layer elif net_output_type == 'regression': if decoder_has_local_layer: - proj_layer = nn.Sequential(nn.Linear(input_shape, np.product(output_shape[1:]))) + proj_layer = nn.Sequential(nn.Linear(head_n_in_features, np.product(output_shape[1:]))) else: proj_layer = nn.Sequential( - nn.Linear(input_shape, n_prediction_heads * np.product(output_shape[1:])), + nn.Linear(head_n_in_features, n_prediction_heads * np.product(output_shape[1:])), nn.Unflatten(-1, (n_prediction_heads, *output_shape[1:])), ) return proj_layer elif net_output_type == "quantile": if decoder_has_local_layer: - proj_layer = [nn.Sequential(nn.Linear(input_shape, np.product(output_shape[1:]))) - for _ in range(num_quantiles)] + proj_layer = [ # type: ignore[assignment] + nn.Sequential(nn.Linear(head_n_in_features, np.product(output_shape[1:]))) + for _ in range(num_quantiles) + ] else: - proj_layer = [nn.Sequential( - nn.Linear(input_shape, n_prediction_heads * np.product(output_shape[1:])), - nn.Unflatten(-1, (n_prediction_heads, *output_shape[1:])), - ) for _ in range(num_quantiles)] + proj_layer = [ # type: ignore[assignment] + nn.Sequential( + nn.Linear(head_n_in_features, n_prediction_heads * np.product(output_shape[1:])), + nn.Unflatten(-1, (n_prediction_heads, *output_shape[1:])), + ) for _ in range(num_quantiles)] proj_layer = QuantileHead(proj_layer) return proj_layer else: diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index 27b692cef..5c2c73d86 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -102,11 +102,11 @@ def forward(self, class QuantileLoss(AbstractForecastingLoss): - def __init__(self, reduction: str = 'mean', quantiles: List[float] 
= [0.5], loss_weights=None) -> None: + def __init__(self, reduction: str = 'mean', quantiles: List[float] = [0.5]) -> None: super(QuantileLoss, self).__init__(reduction=reduction) self.quantiles = quantiles - def set_quantiles(self, quantiles=List[float]): + def set_quantiles(self, quantiles: List[float]) -> None: self.quantiles = quantiles def forward(self, From ba96c37faad5cf6e4d9bf50ff77ad5d89800dd32 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 27 May 2022 19:03:09 +0200 Subject: [PATCH 289/347] validator for pd dataframe --- .../data/time_series_feature_validator.py | 31 +++- .../data/time_series_forecasting_validator.py | 164 ++++++++++++------ .../data/time_series_target_validator.py | 6 +- test/conftest.py | 7 + ... => test_forecasting_feature_validator.py} | 4 +- .../test_forecasting_input_validator.py | 34 ++++ .../preprocessing/forecasting/test_scaling.py | 78 ++------- 7 files changed, 199 insertions(+), 125 deletions(-) rename test/test_data/{test_time_series_feature_validator.py => test_forecasting_feature_validator.py} (95%) diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index 0e4854138..3816f1443 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -3,12 +3,21 @@ import pandas as pd import numpy as np from scipy.sparse import issparse +from sklearn.preprocessing import OrdinalEncoder from sklearn.base import BaseEstimator from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator from autoPyTorch.utils.logging_ import PicklableClientLogger +def df2index(df: Union[pd.DataFrame, pd.Series]) -> np.ndarray: + if isinstance(df, pd.Series): + seq_lengths = df.value_counts().values + else: + seq_lengths = np.unique(OrdinalEncoder().fit_transform(df), axis=0, return_counts=True)[1] + return np.arange(len(seq_lengths)).repeat(seq_lengths) + + class TimeSeriesFeatureValidator(TabularFeatureValidator): def __init__( self, @@ -61,7 +70,8 @@ def fit(self, f"is not part of {X_train.columns.tolist()}") if X_train[list(series_idx)].isnull().values.any(): raise ValueError('NaN should not exit in Series ID!') - index = pd.MultiIndex.from_frame(pd.DataFrame(X_train[series_idx])) + index = df2index(df=X_train[series_idx]) + self.only_contain_series_idx = len(X_train.columns) == len(series_idx) if self.only_contain_series_idx: @@ -73,8 +83,12 @@ def fit(self, return self X_train = X_train.drop(series_idx, axis=1) + X_train.index = index - X_test = X_test.drop(series_idx, axis=1) if X_test is not None else None + if X_test is not None: + index = df2index(df=X_test[series_idx]) + X_test = X_test.drop(series_idx, axis=1) + X_test.index = index super().fit(X_train, X_test) else: @@ -82,10 +96,15 @@ def fit(self, f"X_train is {type(X_train)} ") else: super().fit(X_train, X_test) + + X_train_has_idx = isinstance(X_train, pd.DataFrame) X_train = pd.DataFrame(X_train) if index is None: if sequence_lengths is None: - index = np.zeros(len(X_train)) + if not X_train_has_idx: + index = np.zeros(len(X_train)) + else: + index = X_train.index else: if np.sum(sequence_lengths) != len(X_train): raise ValueError("The Sum of Sequence length must equal to the length of hte dataset") @@ -109,12 +128,16 @@ def transform( else: raise NotImplementedError(f"series idx only works with pandas.DataFrame but the type of " f"X_train is {type(X)} ") + X_has_idx = isinstance(X, pd.DataFrame) + if X_has_idx and index is None: + index = X.index X = 
super(TimeSeriesFeatureValidator, self).transform(X) if X.ndim == 1: X = np.expand_dims(X, -1) X: pd.DataFrame = pd.DataFrame(X, columns=self.get_reordered_columns()) if index is None: - index = np.array([0] * len(X)) + if not X_has_idx: + index = np.array([0] * len(X)) else: if len(index) != X.shape[0]: raise ValueError('Given index must have length as the input features!') diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index ca142de25..88754c05a 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -7,7 +7,7 @@ from autoPyTorch.data.utils import DatasetCompressionSpec from autoPyTorch.data.tabular_validator import TabularInputValidator -from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator +from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator, df2index from autoPyTorch.data.time_series_target_validator import TimeSeriesTargetValidator @@ -59,57 +59,73 @@ def fit( """ self.series_idx = series_idx - if start_times is None: - start_times = [pd.Timestamp('1900-01-01')] * len(y_train) - else: - assert len(start_times) == len(y_train), 'start_times_train must have the same length as y_train!' - - self.start_times = start_times - if X_train is None: self._is_uni_variant = True - if isinstance(y_train, List): - # X_train and y_train are stored as lists - y_train_stacked = self.join_series(y_train) - if y_test is not None: - y_test_stacked, seq_y_test_length = self.join_series(y_test, return_seq_lengths=True) - else: - y_test_stacked = None - - if self._is_uni_variant: - self.feature_validator.num_features = 0 - self.feature_validator.numerical_columns = [] - self.feature_validator.categorical_columns = [] - self.target_validator.fit(y_train_stacked, y_test_stacked) - - self._is_fitted = True + if self._is_uni_variant: + self.feature_validator.num_features = 0 + self.feature_validator.numerical_columns = [] + self.feature_validator.categorical_columns = [] + if isinstance(y_train, List): + n_seqs = len(y_train) + y_train = self.join_series(y_train) + if y_test is not None: + y_test = self.join_series(y_test, return_seq_lengths=False) + else: + y_test = None + elif isinstance(y_train, pd.DataFrame): + n_seqs = len(y_train.index.unique()) else: + raise NotImplementedError + + self.target_validator.fit(y_train, y_test) + self._is_fitted = True + else: + if isinstance(y_train, List): # Check that the data is valid if len(X_train) != len(y_train): raise ValueError("Inconsistent number of sequences for features and targets," " {} for features and {} for targets".format(len(X_train), len(y_train), )) - X_train_stacked, sequence_lengths = self.join_series(X_train, return_seq_lengths=True) - X_test_stacked = self.join_series(X_test) if X_test is not None else None - if X_test_stacked is not None and y_test_stacked is not None: - if len(X_test_stacked) != len(y_test_stacked): + n_seqs = len(y_train) + + # X_train and y_train are stored as lists + y_train = self.join_series(y_train) + if y_test is not None: + y_test = self.join_series(y_test, return_seq_lengths=False) + + X_train, sequence_lengths = self.join_series(X_train, return_seq_lengths=True) + X_test = self.join_series(X_test) if X_test is not None else None + if X_test is not None and y_test is not None: + if len(X_test) != len(y_test): raise ValueError("Inconsistent number of test datapoints for features and targets," " {} for 
features and {} for targets".format(len(X_test), len(y_test), )) + elif isinstance(y_train, (pd.DataFrame, pd.Series)): + sequence_lengths = None + if series_idx is not None: + n_seqs = len(X_train.groupby(series_idx)) + else: + n_seqs = len(y_train.index.unique()) + else: + raise NotImplementedError + + self.feature_validator.fit(X_train, X_test, + series_idx=series_idx, sequence_lengths=sequence_lengths) + self.target_validator.fit(y_train, y_test) - self.feature_validator.fit(X_train_stacked, X_test_stacked, - series_idx=series_idx, sequence_lengths=sequence_lengths) - self.target_validator.fit(y_train_stacked, y_test_stacked) + if self.feature_validator.only_contain_series_idx: + self._is_uni_variant = True - if self.feature_validator.only_contain_series_idx: - self._is_uni_variant = True + self._is_fitted = True - self._is_fitted = True + self.feature_names = self.feature_validator.get_reordered_columns() + self.feature_shapes = {feature_name: 1 for feature_name in self.feature_names} - self.feature_names = self.feature_validator.get_reordered_columns() - self.feature_shapes = {feature_name: 1 for feature_name in self.feature_names} + if start_times is None: + start_times = [pd.Timestamp('1900-01-01')] * n_seqs else: - # TODO X_train and y_train are pd.DataFrame - raise NotImplementedError + assert len(start_times) == n_seqs, 'start_times_train must have the same length as y_train!' + + self.start_times = start_times return self @@ -118,7 +134,7 @@ def transform( X: Optional[Union[List, pd.DataFrame]], y: Optional[Union[List, pd.DataFrame]] = None, validate_for_future_features: bool = False - ) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame], List[int]]: + ) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame], np.ndarray]: """ transform the data with the fitted validator Args: @@ -130,7 +146,7 @@ def transform( raise NotFittedError("Cannot call transform on a validator that is not fitted") if validate_for_future_features and y is None: if X is None: - return None, None, [] + return None, None, np.asarray([]) if isinstance(X, List): num_sequences = len(X) sequence_lengths = [0] * num_sequences @@ -139,9 +155,14 @@ def transform( sequence_lengths = np.asarray(sequence_lengths) x_transformed, _ = self._transform_X(X, sequence_lengths) return x_transformed, None, sequence_lengths + elif isinstance(X, pd.DataFrame): + if self.series_idx is not None: + X = X.sort_values(self.series_idx) + x_transformed, _ = self._transform_X(X, None) + return x_transformed, None, X.index.value_counts(sort=False).values + else: raise NotImplementedError - else: if y is None: raise ValueError('Targets must be given!') @@ -169,25 +190,66 @@ def transform( return None, y_transformed, sequence_lengths return x_transformed, y_transformed, sequence_lengths + elif isinstance(y, (pd.DataFrame, pd.Series)): + if self.series_idx is not None: + if isinstance(y, pd.Series): + y_columns = [y.name] + else: + if isinstance(y.columns, pd.RangeIndex): + y_columns = [f'target_{i}' for i in y.columns] + y.columns = y_columns + y_columns = y.columns + xy = pd.concat([X, y], axis=1) + xy.sort_values(self.feature_validator.series_idx, inplace=True) + + y = xy[y_columns] + X = xy.drop(y_columns, axis=1) + del xy + + x_transformed, series_number = self._transform_X(X, None) + + if self._is_uni_variant: + y_transformed: pd.DataFrame = self.target_validator.transform(y, series_number) + return None, y_transformed, y_transformed.index.value_counts(sort=False).values + + y_transformed: pd.DataFrame = 
self.target_validator.transform(y, x_transformed.index) + return x_transformed, y_transformed, y_transformed.index.value_counts(sort=False).values + else: raise NotImplementedError def _transform_X(self, X: Optional[Union[List, pd.DataFrame]], - sequence_lengths: np.ndarray) -> Tuple[pd.DataFrame, Union[np.ndarray, pd.Index]]: + sequence_lengths: Optional[np.ndarray] = None) -> Tuple[pd.DataFrame, + Optional[Union[np.ndarray, pd.Index]]]: if self.series_idx is None: - series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) - if not self._is_uni_variant: - x_stacked = self.join_series(X) - x_transformed = self.feature_validator.transform(x_stacked, - index=series_number) - else: + if self._is_uni_variant: x_transformed = None + if sequence_lengths is not None: + series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) + else: + series_number = None + else: + if isinstance(X, List): + assert sequence_lengths is not None + series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) + x_stacked = self.join_series(X) + x_transformed = self.feature_validator.transform(x_stacked, + index=series_number) + elif isinstance(X, pd.DataFrame): + series_number = X.index + x_transformed = self.feature_validator.transform(X) + else: + raise NotImplementedError else: - # In this case X can only contain pd.DataFrame, see ```time_series_feature_validator.py``` - x_stacked = pd.concat(X) - - series_number = pd.MultiIndex.from_frame(pd.DataFrame(x_stacked[self.series_idx])) + if isinstance(X, List): + # In this case X can only contain pd.DataFrame, see ```time_series_feature_validator.py``` + x_stacked = pd.concat(X) + elif isinstance(X, pd.DataFrame): + x_stacked = X + else: + raise NotImplementedError + series_number = df2index(x_stacked[self.series_idx]) if not self._is_uni_variant: x_transformed = self.feature_validator.transform(x_stacked, diff --git a/autoPyTorch/data/time_series_target_validator.py b/autoPyTorch/data/time_series_target_validator.py index ed017e5fc..1bd05b246 100644 --- a/autoPyTorch/data/time_series_target_validator.py +++ b/autoPyTorch/data/time_series_target_validator.py @@ -51,10 +51,14 @@ def transform(self, pd.DataFrame: The transformed array """ + y_has_idx = isinstance(y, pd.DataFrame) + if y_has_idx and index is None: + index = y.index y: ArrayType = super().transform(y) if index is None: - index = np.array([0] * y.shape[0]) + if not y_has_idx: + index = np.array([0] * y.shape[0]) else: if len(index) != y.shape[0]: raise ValueError('Index must have length as the input targets!') diff --git a/test/conftest.py b/test/conftest.py index 9ffe0c6ee..908c0b6f3 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -846,6 +846,13 @@ def input_data_forecastingfeaturetest(request): {'A': 3, 'B': 2}, {'A': 2, 'B': 4}, ], dtype='category'), None, [2, 2] + elif request.param == 'pandas_multi_seq_w_idx': + return pd.DataFrame([ + {'A': 1, 'B': 2}, + {'A': 1, 'B': 4}, + {'A': 3, 'B': 2}, + {'A': 2, 'B': 4}, + ], dtype='category', index=[0, 0, 1, 1]), None, None elif request.param == 'pandas_with_static_features_multi_series': return pd.DataFrame([ {'A': 1, 'B': 2}, diff --git a/test/test_data/test_time_series_feature_validator.py b/test/test_data/test_forecasting_feature_validator.py similarity index 95% rename from test/test_data/test_time_series_feature_validator.py rename to test/test_data/test_forecasting_feature_validator.py index ce219a310..be3f4d1bb 100644 --- a/test/test_data/test_time_series_feature_validator.py +++ 
b/test/test_data/test_forecasting_feature_validator.py @@ -20,6 +20,7 @@ 'pandas_without_seriesid', 'pandas_with_static_features', 'pandas_multi_seq', + 'pandas_multi_seq_w_idx', 'pandas_with_static_features_multi_series', ), indirect=True @@ -42,7 +43,8 @@ def test_forecasting_validator_supported_types(input_data_forecastingfeaturetest transformed_X = validator.transform(data, index) assert isinstance(transformed_X, pd.DataFrame) if series_idx is None and seq_lengths is None: - assert np.all(transformed_X.index == 0) + if not (isinstance(data, pd.DataFrame) and len(data.index.unique() > 1)): + assert np.all(transformed_X.index == 0) else: if series_idx is not None: assert series_idx not in transformed_X diff --git a/test/test_data/test_forecasting_input_validator.py b/test/test_data/test_forecasting_input_validator.py index 7f2bb852c..e5419198f 100644 --- a/test/test_data/test_forecasting_input_validator.py +++ b/test/test_data/test_forecasting_input_validator.py @@ -68,6 +68,40 @@ def test_multi_variant_validator_with_series_id(input_data_forecastingfeaturetes assert series_idx not in x_transformed +@pytest.mark.parametrize( + 'input_data_forecastingfeaturetest', + ( + 'pandas_wo_seriesid', + 'pandas_w_seriesid', + 'pandas_only_seriesid', + 'pandas_without_seriesid', + 'pandas_with_static_features', + 'pandas_multi_seq', + 'pandas_multi_seq_w_idx', + 'pandas_with_static_features_multi_series', + ), + indirect=True +) +def test_transform_pds(input_data_forecastingfeaturetest): + data, series_idx, _ = input_data_forecastingfeaturetest + validator = TimeSeriesForecastingInputValidator(is_classification=False) + # start_times = [pd.Timestamp('2000-01-01')] + start_times = None + x = data + y = pd.DataFrame(range(len(data))) + validator.fit(x, y, start_times=start_times, series_idx=series_idx) + + x_transformed, y_transformed, sequence_lengths = validator.transform(x, y) + assert np.all(sequence_lengths == y_transformed.index.value_counts(sort=False).values) + + if x_transformed is not None: + assert series_idx not in x_transformed + assert np.all(sequence_lengths == x_transformed.index.value_counts(sort=False).values) + if series_idx is not None: + for seq_len, group in zip(sequence_lengths, data.groupby(series_idx)): + assert seq_len == len(group[1]) + + def test_forecasting_validator(): df = pd.DataFrame([ {'category': 'one', 'int': 1, 'float': 1.0, 'bool': True}, diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py b/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py index d1bb355ca..eabd99e4d 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py +++ b/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py @@ -77,17 +77,9 @@ def test_base_and_standard_scaler(self): # second column is static features. 
It needs to be the mean and std value across all sequences scaler.dataset_is_small_preprocess = False - scaler.static_features = self.static_features_column - scaler = scaler.fit(self.raw_data[0]) - self.assertTrue(np.allclose(scaler.loc, np.asarray([[1., 2., 3]]))) - self.assertTrue(np.allclose(scaler.scale, np.asarray([[1., 2., 3.]]))) - - transformed_test = scaler.transform(self.raw_data[0]) - self.assertIsInstance(transformed_test, np.ndarray) - # should have the same value as the second part of transformed except for the static values - self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed[:len(self.raw_data[0]), [0, -1]])) - self.assertTrue(np.all(transformed_test[:, 1] == 0.)) + transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) + self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed_test[:, [0, -1]])) def test_min_max(self): scaler = TimeSeriesScaler(mode='min_max', @@ -112,22 +104,10 @@ def test_min_max(self): scaler.dataset_is_small_preprocess = False scaler.static_features = self.static_features_column - scaler = scaler.fit(self.raw_data[0]) - self.assertTrue(np.allclose(scaler.loc, np.asarray([[0., 2., 3.]]))) - self.assertTrue(np.allclose(scaler.scale, np.asarray([[2., 2., 3.]]))) + transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) + self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed_test[:, [0, -1]])) - idx_start = 0 - for i, raw_data in enumerate(self.raw_data): - idx_end = idx_start + len(raw_data) - scaler = scaler.fit(raw_data) - - transformed_test = scaler.transform(self.raw_data[i]) - self.assertIsInstance(transformed_test, np.ndarray) - # should have the same value as the second part of transformed except for the static values - self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed_data[idx_start:idx_end, [0, -1]])) - self.assertTrue(np.all(transformed_test[:, 1] == 0.)) - idx_start = idx_end def test_max_abs_scaler(self): scaler = TimeSeriesScaler(mode='max_abs', @@ -152,22 +132,9 @@ def test_max_abs_scaler(self): [0., 0.5, 1.]]))) scaler.dataset_is_small_preprocess = False - scaler.static_features = self.static_features_column - scaler = scaler.fit(self.raw_data[0]) - self.assertIsNone(scaler.loc) - self.assertTrue(np.allclose(scaler.scale, np.asarray([[2., 2., 3.]]))) - idx_start = 0 - for i, raw_data in enumerate(self.raw_data): - idx_end = idx_start + len(raw_data) - scaler = scaler.fit(raw_data) - - transformed_test = scaler.transform(self.raw_data[i]) - self.assertIsInstance(transformed_test, np.ndarray) - # should have the same value as the second part of transformed except for the static values - self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed_data[idx_start:idx_end, [0, -1]])) - self.assertTrue(np.all(transformed_test[:, 1] == 1.)) - idx_start = idx_end + transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) + self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed_test[:, [0, -1]])) def test_mean_abs_scaler(self): scaler = TimeSeriesScaler(mode='mean_abs', @@ -192,20 +159,8 @@ def test_mean_abs_scaler(self): scaler.static_features = self.static_features_column scaler = scaler.fit(self.raw_data[0]) - self.assertIsNone(scaler.loc) - self.assertTrue(np.allclose(scaler.scale, np.asarray([[1., 2., 3.]]))) - - idx_start = 0 - for i, raw_data in enumerate(self.raw_data): - idx_end = idx_start + len(raw_data) - scaler = scaler.fit(raw_data) 
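# --- Illustrative sketch (not part of the patch) ------------------------------
# The rewritten scaler tests above now transform each raw sequence separately
# and only compare the non-static columns. The core idea of the 'standard'
# mode is a per-sequence mean/std, grouped by the DataFrame index; a minimal
# pandas sketch with made-up values (the real TimeSeriesScaler additionally
# handles static features and degenerate scales):
import pandas as pd

X = pd.DataFrame({'f': [0.0, 1.0, 2.0, 10.0, 20.0]}, index=[0, 0, 0, 1, 1])
loc_ = X.groupby(X.index).mean()
scale_ = X.groupby(X.index).std().fillna(1.0)
X_scaled = (X - loc_.reindex(X.index).values) / scale_.reindex(X.index).values
# each sequence is now standardised with its own statistics
# ------------------------------------------------------------------------------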
- - transformed_test = scaler.transform(self.raw_data[i]) - self.assertIsInstance(transformed_test, np.ndarray) - # should have the same value as the second part of transformed except for the static values - self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed_data[idx_start:idx_end, [0, -1]])) - self.assertTrue(np.all(transformed_test[:, 1] == 1.)) - idx_start = idx_end + transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) + self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed_test[:, [0, -1]])) def test_no_scaler(self): scaler = TimeSeriesScaler(mode='none', @@ -220,22 +175,9 @@ def test_no_scaler(self): self.assertIsNone(scaler.scale) scaler.dataset_is_small_preprocess = False - scaler.static_features = self.static_features_column - scaler = scaler.fit(self.raw_data[0]) - - idx_start = 0 - for i, raw_data in enumerate(self.raw_data): - idx_end = idx_start + len(raw_data) - scaler = scaler.fit(raw_data) - - transformed_test = scaler.transform(self.raw_data[i]) - self.assertIsInstance(transformed_test, np.ndarray) - # should have the same value as the second part of transformed except for the static values - self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed_data[idx_start:idx_end, [0, -1]])) - self.assertIsNone(scaler.loc) - self.assertIsNone(scaler.scale) - idx_start = idx_end + transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) + self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed_test[:, [0, -1]])) with self.assertRaises(ValueError): scaler = TimeSeriesScaler(mode='random', From cdcdb5a6424f468695e84617edb54ef4a0551c33 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 27 May 2022 19:26:44 +0200 Subject: [PATCH 290/347] allow series idx for api --- autoPyTorch/api/time_series_forecasting.py | 8 +- .../data/time_series_feature_validator.py | 17 +- .../data/time_series_target_validator.py | 14 +- autoPyTorch/datasets/base_dataset.py | 2 +- autoPyTorch/datasets/time_series_dataset.py | 199 ++++++++++-------- .../TimeSeriesTransformer.py | 1 + .../encoding/OneHotEncoder.py | 2 +- .../encoding/time_series_base_encoder.py | 4 +- .../scaling/utils.py | 111 ++++++---- .../setup/early_preprocessor/utils.py | 2 +- .../TargetMaxAbsScaler.py | 2 +- .../TargetMeanAbsScaler.py | 2 +- .../TargetMinMaxScaler.py | 2 +- .../TargetNoScaler.py | 2 +- .../TargetStandardScaler.py | 2 +- .../forecasting_target_scaling/__init__.py | 9 +- .../base_target_scaler.py | 2 +- .../DistributionLoss.py | 2 +- .../forecasting_training_loss/QuantileLoss.py | 2 +- .../forecasting_training_loss/__init__.py | 5 +- .../base_forecasting_loss.py | 6 +- .../components/setup/network/base_network.py | 4 +- .../setup/network/forecasting_architecture.py | 128 ++++++----- .../setup/network/forecasting_network.py | 14 +- .../setup/network_backbone/__init__.py | 4 +- .../forecasting_backbone/__init__.py | 2 +- .../forecasting_backbone/cells.py | 14 +- .../forecasting_backbone/components_util.py | 2 +- .../forecasting_decoder/NBEATSDecoder.py | 2 +- .../forecasting_encoder/__init__.py | 22 +- .../flat_encoder/MLPEncoder.py | 2 +- .../flat_encoder/__init__.py | 4 +- .../seq_encoder/__init__.py | 6 +- .../other_components/TemporalFusion.py | 1 + .../LearnedEntityEmbedding.py | 6 +- .../setup/network_embedding/__init__.py | 6 +- .../components/setup/network_head/__init__.py | 4 +- .../setup/network_initializer/__init__.py | 6 +- .../components/setup/optimizer/__init__.py | 6 +- 
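# --- Illustrative sketch (not part of the patch) ------------------------------
# Together, the two commits above let the forecasting validators and the API
# consume a single long-format DataFrame in which one or more ``series_idx``
# columns identify the individual series. ``df2index`` (added to
# time_series_feature_validator.py) turns those id columns into a flat
# per-row sequence index. A made-up example:
import pandas as pd
from autoPyTorch.data.time_series_feature_validator import df2index

X_train = pd.DataFrame({
    'series_id': ['A', 'A', 'A', 'B', 'B'],   # identifies the series
    'price':     [0.1, 0.2, 0.3, 1.0, 1.1],   # ordinary (dynamic) feature
})
print(df2index(X_train['series_id']))          # e.g. array([0, 0, 0, 1, 1])
# the validator then drops 'series_id' and uses this array as the index, e.g.
#   validator.fit(X_train, y_train, series_idx=['series_id'])
# (call pattern as used in test_forecasting_input_validator.py)
# ------------------------------------------------------------------------------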
.../components/training/base_training.py | 6 +- .../training/data_loader/base_data_loader.py | 2 +- .../time_series_forecasting_data_loader.py | 52 +++-- .../training/data_loader/time_series_util.py | 39 ++-- .../components/training/metrics/base.py | 18 +- .../components/training/metrics/utils.py | 2 +- .../components/training/trainer/__init__.py | 6 +- .../trainer/forecasting_trainer/__init__.py | 6 +- .../forecasting_base_trainer.py | 14 +- .../pipeline/time_series_forecasting.py | 12 +- requirements.txt | 2 +- 50 files changed, 426 insertions(+), 362 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 85058649a..3a39d8fbe 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -161,6 +161,7 @@ def _get_dataset_input_validator( dataset_compression: Optional[DatasetCompressionSpec] = None, freq: Optional[Union[str, int, List[int]]] = None, start_times: List[pd.DatetimeIndex] = [], + series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, n_prediction_steps: int = 1, known_future_features: Tuple[Union[int, str]] = (), **forecasting_dataset_kwargs, @@ -218,7 +219,7 @@ def _get_dataset_input_validator( # Fit an input validator to check the provided data # Also, an encoder is fit to both train and test data, # to prevent unseen categories during inference - input_validator.fit(X_train=X_train, y_train=y_train, start_times=start_times, + input_validator.fit(X_train=X_train, y_train=y_train, start_times=start_times, series_idx=series_idx, X_test=X_test, y_test=y_test) dataset = TimeSeriesForecastingDataset( @@ -226,6 +227,7 @@ def _get_dataset_input_validator( X_test=X_test, Y_test=y_test, freq=freq, start_times=start_times, + series_idx=series_idx, validator=input_validator, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, @@ -246,6 +248,7 @@ def search( n_prediction_steps: int = 1, freq: Optional[Union[str, int, List[int]]] = None, start_times: Optional[List[pd.DatetimeIndex]] = None, + series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, dataset_name: Optional[str] = None, budget_type: str = 'epochs', min_budget: Union[int, str] = 5, @@ -293,6 +296,8 @@ def search( we will use the default configuration start_times: : List[pd.DatetimeIndex] A list indicating the start time of each series in the training sets + series_idx: Optional[Union[List[Union[str, int]], str, int]] + variable in X indicating series indices dataset_name: Optional[str], dataset name budget_type (str): @@ -396,6 +401,7 @@ def search( dataset_compression=self._dataset_compression, freq=freq, start_times=start_times, + series_idx=series_idx, n_prediction_steps=n_prediction_steps, **forecasting_dataset_kwargs ) diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index 3816f1443..e0e646d99 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -26,15 +26,15 @@ def __init__( super().__init__(logger) self.only_contain_series_idx = False self.static_features = () - self.series_idx: Optional[Union[List[Union[str, int]]]] = None + self.series_idx: Optional[List[Union[str, int]]] = None - def get_reordered_columns(self): + def get_reordered_columns(self) -> List[str]: return self.transformed_columns + [col for col in self.column_order if col not in set(self.transformed_columns)] def fit(self, X_train: 
Union[pd.DataFrame, np.ndarray], X_test: Union[pd.DataFrame, np.ndarray] = None, - series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, + series_idx: Optional[List[Union[str, int]]] = None, sequence_lengths: Optional[List[int]] = None) -> BaseEstimator: """ @@ -46,7 +46,7 @@ def fit(self, X_test (Union[pd.DataFrame, np.ndarray]): An optional set of data that is going to be validated - series_idx (Optional[Union[str, int]]): + series_idx (Optional[List[Union[str, int]]]): Series Index, to identify each individual series sequence_lengths (Optional[List[int]]): @@ -86,6 +86,7 @@ def fit(self, X_train.index = index if X_test is not None: + assert isinstance(X_test, pd.DataFrame) index = df2index(df=X_test[series_idx]) X_test = X_test.drop(series_idx, axis=1) X_test.index = index @@ -111,11 +112,11 @@ def fit(self, index = np.arange(len(sequence_lengths)).repeat(sequence_lengths) X_train.index = index - static_features: pd.Series = (X_train.groupby(X_train.index).nunique() <= 1).all() + static_features: pd.Series = (X_train.groupby(X_train.index).nunique() <= 1).all() # type: ignore[assignment] self.static_features = tuple(idx for idx in static_features.index if static_features[idx]) return self - def transform( + def transform( # type: ignore[override] self, X: Union[pd.DataFrame, np.ndarray], index: Optional[Union[pd.Index, np.ndarray]] = None, @@ -130,10 +131,10 @@ def transform( f"X_train is {type(X)} ") X_has_idx = isinstance(X, pd.DataFrame) if X_has_idx and index is None: - index = X.index + index = X.index # type: ignore[union-attr] X = super(TimeSeriesFeatureValidator, self).transform(X) if X.ndim == 1: - X = np.expand_dims(X, -1) + X = np.expand_dims(X, -1) # type: ignore[union-attr] X: pd.DataFrame = pd.DataFrame(X, columns=self.get_reordered_columns()) if index is None: if not X_has_idx: diff --git a/autoPyTorch/data/time_series_target_validator.py b/autoPyTorch/data/time_series_target_validator.py index 1bd05b246..3962d9029 100644 --- a/autoPyTorch/data/time_series_target_validator.py +++ b/autoPyTorch/data/time_series_target_validator.py @@ -53,19 +53,19 @@ def transform(self, """ y_has_idx = isinstance(y, pd.DataFrame) if y_has_idx and index is None: - index = y.index - y: ArrayType = super().transform(y) + index = y.index # type: ignore[union-attr] + y: ArrayType = super().transform(y) # type: ignore[no-redef] if index is None: if not y_has_idx: - index = np.array([0] * y.shape[0]) + index = np.array([0] * y.shape[0]) # type: ignore[union-attr] else: - if len(index) != y.shape[0]: + if len(index) != y.shape[0]: # type: ignore[union-attr] raise ValueError('Index must have length as the input targets!') - if y.ndim == 1: + if y.ndim == 1: # type: ignore[union-attr] y = np.expand_dims(y, -1) - y: pd.DataFrame = pd.DataFrame(y) - y.index = index + y: pd.DataFrame = pd.DataFrame(y) # type: ignore[no-redef] + y.index = index # type: ignore return y @property diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index 7761d07c2..bd50cdbd6 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -30,7 +30,7 @@ from autoPyTorch.utils.common import FitRequirement, ispandas BaseDatasetInputType = Union[Tuple[np.ndarray, np.ndarray], Dataset] -BaseDatasetPropertiesType = Union[int, float, str, List, bool] +BaseDatasetPropertiesType = Union[int, float, str, List, bool, Tuple] def check_valid_data(data: Any) -> None: diff --git a/autoPyTorch/datasets/time_series_dataset.py 
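# --- Illustrative sketch (not part of the patch) ------------------------------
# The feature validator above marks a column as a static feature when it is
# constant within every sequence (grouped by the index), mirroring
#   (X_train.groupby(X_train.index).nunique() <= 1).all()
# A small sketch with made-up data:
import pandas as pd

X = pd.DataFrame(
    {'store_size': [3, 3, 5, 5],            # constant within each series
     'price':      [1.0, 1.2, 0.9, 1.1]},   # varies over time
    index=[0, 0, 1, 1],
)
is_static = (X.groupby(X.index).nunique() <= 1).all()
static_features = tuple(col for col in is_static.index if is_static[col])
print(static_features)                       # ('store_size',)
# ------------------------------------------------------------------------------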
b/autoPyTorch/datasets/time_series_dataset.py index 4f3aeca9f..25deafae2 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -51,7 +51,7 @@ def extract_feature_index(feature_shapes: Dict[str, int], feature_names: Tuple[str], - queried_features: Tuple[str]) -> Tuple[int]: + queried_features: Union[Tuple[Union[str, int]], Tuple[()]]) -> Tuple[int]: """ extract the index of a set of queried_features from the extracted feature_shapes Args: @@ -68,10 +68,10 @@ def extract_feature_index(feature_shapes: Dict[str, int], value_ranges = df_range[list(queried_features)].T.values feature_index: List[int] = sum([list(range(*value_r)) for value_r in value_ranges], []) feature_index.sort() - return tuple(feature_index) + return tuple(feature_index) # type: ignore[return-value] -def compute_time_features(start_time: pd.Timestamp, +def compute_time_features(start_time: pd.DatetimeIndex, date_period_length: int, time_feature_length: int, freq: str, @@ -100,7 +100,7 @@ def __init__(self, Y: np.ndarray, start_time: Optional[pd.DatetimeIndex] = None, freq: str = '1Y', - time_feature_transform: List[TimeFeature] = [], + time_feature_transform: List[TimeFeature] = [ConstantTransform], X_test: Optional[np.ndarray] = None, Y_test: Optional[np.ndarray] = None, train_transforms: Optional[torchvision.transforms.Compose] = None, @@ -109,16 +109,29 @@ def __init__(self, sp: int = 1, known_future_features_index: Optional[List[int]] = None, compute_mase_coefficient_value: bool = True, - time_features=None, - is_test_set=False, - ): + time_features: Optional[np.ndarray] = None, + is_test_set: bool = False, + ) -> None: """ A dataset representing a time series sequence. Args: - seed: - train_transforms: - val_transforms: - n_prediction_steps: int, how many steps need to be predicted in advance + X (Optional[np.ndarray]): past features + Y (np.ndarray): past targets + start_time (Optional[pd.DatetimeIndex]): times of the first timestep of the series + freq (str): frequency that the data is sampled + time_feature_transform (List[TimeFeature]) available time features applied to the series + X_test (Optional[np.ndarray]): known future features + Y_test (Optional[np.ndarray]): future targets + train_transforms (Optional[torchvision.transforms.Compose]): training transforms, used to transform + training features + val_transforms (Optional[torchvision.transforms.Compose]): validation transforms, used to transform + training features + n_prediction_steps (int): how many steps need to be predicted in advance + known_future_features_index (int), indices of the known future index + compute_mase_coefficient_value (bool): if the mase coefficient for this series is pre-computed + time_features (Optional[np.ndarray]): pre-computed time features + is_test_set (bool): if this dataset is test sets. 
Test sequence will simply make X_test and Y_test as future + features and future targets """ self.n_prediction_steps = n_prediction_steps @@ -168,11 +181,11 @@ def __init__(self, self.is_test_set = is_test_set @property - def is_test_set(self): + def is_test_set(self) -> bool: return self._is_test_set @is_test_set.setter - def is_test_set(self, value: bool): + def is_test_set(self, value: bool) -> None: if value and value != self._is_test_set: if self.known_future_features_index: if self.X_test is None: @@ -205,10 +218,12 @@ def __getitem__(self, index: int, train: bool = True) \ if self.known_future_features_index: if not self.is_test_set: - future_features = self.X[index + 1: index + self.n_prediction_steps + 1, - self.known_future_features_index] + future_features = \ + self.X[index + 1: index + self.n_prediction_steps + 1, self.known_future_features_index] else: - future_features = self.X_test[:, self.known_future_features_index] + if index < self.__len__() - 1: + raise ValueError('Test Sequence is only allowed to be accessed with the last index!') + future_features = self.X_test[:, self.known_future_features_index] # type: ignore[index] else: future_features = None else: @@ -229,16 +244,19 @@ def __getitem__(self, index: int, train: bool = True) \ self.cache_time_features() if past_features is not None: - past_features = np.hstack([past_features, self._cached_time_features[:index + 1]]) + past_features = np.hstack( + [past_features, self._cached_time_features[:index + 1]] # type: ignore[index] + ) else: - past_features = self._cached_time_features[:index + 1] + past_features = self._cached_time_features[:index + 1] # type: ignore[index] if future_features is not None: future_features = np.hstack([ future_features, - self._cached_time_features[index + 1:index + self.n_prediction_steps + 1] + self._cached_time_features[index + 1:index + self.n_prediction_steps + 1] # type: ignore[index] ]) else: - future_features = self._cached_time_features[index + 1:index + self.n_prediction_steps + 1] + future_features = self._cached_time_features[index + 1: # type: ignore[index] + index + self.n_prediction_steps + 1] if future_features is not None and future_features.shape[0] == 0: future_features = None @@ -283,7 +301,7 @@ def __getitem__(self, index: int, train: bool = True) \ def __len__(self) -> int: return self.Y.shape[0] if self.is_test_set else self.Y.shape[0] - self.n_prediction_steps - def get_target_values(self, index: int): + def get_target_values(self, index: int) -> np.ndarray: """ Get the visible targets in the datasets without generating a tensor. This can be used to create a dummy pipeline Args: @@ -296,7 +314,10 @@ def get_target_values(self, index: int): index = self.__len__() + index return self.Y[index] - def cache_time_features(self, ): + def cache_time_features(self, ) -> None: + """ + compute time features if it is not cached. 
For test sets, we also need to compute the time features for future + """ if self._cached_time_features is None: periods = self.Y.shape[0] if self.is_test_set: @@ -361,7 +382,7 @@ def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": else: X = None if self.known_future_features_index: - X_test = self.X[index + 1: index + 1 + self.n_prediction_steps] + X_test = self.X[index + 1: index + 1 + self.n_prediction_steps] # type: ignore[index] else: X_test = None if self._cached_time_features is None: @@ -387,7 +408,7 @@ def get_val_seq_set(self, index: int) -> "TimeSeriesSequence": return val_set - def get_test_target(self, test_idx: int): + def get_test_target(self, test_idx: int) -> np.ndarray: if self.is_test_set: raise ValueError("get_test_target is not supported for test sequences!") if test_idx < 0: @@ -395,7 +416,7 @@ def get_test_target(self, test_idx: int): Y_future = self.Y[test_idx + 1: test_idx + self.n_prediction_steps + 1] return Y_future - def update_attribute(self, **kwargs): + def update_attribute(self, **kwargs: Any) -> None: for key, value in kwargs.items(): if not hasattr(self, key): raise ValueError('Trying to update invalid attribute for TimeSeriesSequence!') @@ -412,11 +433,12 @@ def __init__(self, X_test: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, Y_test: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, start_times: Optional[List[pd.DatetimeIndex]] = None, - known_future_features: Optional[Tuple[Union[str, int]]] = None, + series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, + known_future_features: Optional[Union[Tuple[Union[str, int]], Tuple[()]]] = None, time_feature_transform: Optional[List[TimeFeature]] = None, freq: Optional[Union[str, int, List[int]]] = None, - resampling_strategy: Optional[Union[ - CrossValTypes, HoldoutValTypes]] = HoldoutValTypes.time_series_hold_out_validation, + resampling_strategy: Optional[ + Union[CrossValTypes, HoldoutValTypes]] = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, shuffle: Optional[bool] = True, seed: Optional[int] = 42, @@ -448,14 +470,14 @@ def __init__(self, self.seasonality = int(seasonality) self.freq: str = freq - self.freq_value: int = freq_value + self.freq_value: Real = freq_value self.n_prediction_steps = n_prediction_steps + if dataset_name is None: + dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) self.dataset_name = dataset_name - if self.dataset_name is None: - self.dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) # Data Validation if validator is None: validator = TimeSeriesForecastingInputValidator(is_classification=False) @@ -466,7 +488,7 @@ def __init__(self, f"but receive {type(validator)}") if not self.validator._is_fitted: - self.validator.fit(X_train=X, y_train=Y, X_test=X_test, y_test=Y_test, + self.validator.fit(X_train=X, y_train=Y, X_test=X_test, y_test=Y_test, series_idx=series_idx, start_times=start_times) self.is_uni_variant = self.validator._is_uni_variant @@ -504,12 +526,12 @@ def __init__(self, # Construct time series sequences if known_future_features is None: - known_future_features = tuple() + known_future_features = tuple() # type: ignore[assignment] known_future_features_index = extract_feature_index(self.feature_shapes, - self.feature_names, - queried_features=known_future_features) + self.feature_names, # type: ignore[arg-type] + queried_features=known_future_features) # type: ignore - self.known_future_features = 
tuple(known_future_features) + self.known_future_features = tuple(known_future_features) # type: ignore[arg-type] # initialize datasets self.sequences_builder_kwargs = {"freq": self.freq, @@ -527,37 +549,37 @@ def __init__(self, X_test=X_test, Y_test=Y_test, ) sequence_datasets, train_tensors, test_tensors, sequence_lengths = training_sets - Y = train_tensors[1] + Y: pd.DataFrame = train_tensors[1] # type: ignore[no-redef] ConcatDataset.__init__(self, datasets=sequence_datasets) self.num_sequences = len(Y) - self.sequence_lengths_train = np.asarray(sequence_lengths) - n_prediction_steps + self.sequence_lengths_train: np.ndarray = np.asarray(sequence_lengths) - n_prediction_steps self.seq_length_min = int(np.min(self.sequence_lengths_train)) self.seq_length_median = int(np.median(self.sequence_lengths_train)) self.seq_length_max = int(np.max(self.sequence_lengths_train)) - if freq_value > self.seq_length_median: + + if int(freq_value) > self.seq_length_median: self.base_window_size = self.seq_length_median else: - self.base_window_size = freq_value + self.base_window_size = int(freq_value) - self.train_tensors = train_tensors + self.train_tensors: Tuple[Optional[pd.DataFrame], pd.DataFrame] = train_tensors - self.test_tensors = test_tensors + self.test_tensors: Optional[Tuple[Optional[pd.DataFrame], pd.DataFrame]] = test_tensors self.val_tensors = None - self.task_type: Optional[str] = None self.issparse: bool = issparse(self.train_tensors[0]) - self.input_shape: Tuple[int, int] = (self.seq_length_min, self.num_features) + self.input_shape: Tuple[int, int] = (self.seq_length_min, self.num_features) # type: ignore[assignment] # process known future features if known_future_features is None: future_feature_shapes: Tuple[int, int] = (self.seq_length_min, 0) else: - future_feature_shapes: Tuple[int, int] = (self.seq_length_min, len(known_future_features)) + future_feature_shapes = (self.seq_length_min, len(known_future_features)) self.encoder_can_be_auto_regressive = (self.input_shape[-1] == future_feature_shapes[-1]) if len(self.train_tensors) == 2 and self.train_tensors[1] is not None: @@ -569,10 +591,10 @@ def __init__(self, self.output_type = "continuous" if STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS: - num_targets = len(np.unique(Y)) + num_targets: int = len(np.unique(Y)) else: - num_targets = Y.shape[-1] if Y.ndim > 1 else 1 - self.output_shape = [self.n_prediction_steps, num_targets] + num_targets = Y.shape[-1] if Y.ndim > 1 else 1 # type: ignore[union-attr] + self.output_shape = [self.n_prediction_steps, num_targets] # type: ignore else: raise ValueError('Forecasting dataset must contain target values!') @@ -580,7 +602,7 @@ def __init__(self, self.is_small_preprocess = True # dataset split - self.task_type = TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING] + self.task_type: str = TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING] self.numerical_features: List[int] = self.numerical_columns self.categorical_features: List[int] = self.categorical_columns @@ -592,7 +614,7 @@ def __init__(self, sequence_lengths=sequence_lengths, n_prediction_steps=n_prediction_steps, freq_value=self.freq_value, - resampling_strategy=resampling_strategy, + resampling_strategy=resampling_strategy, # type: ignore[arg-type] resampling_strategy_args=resampling_strategy_args ) @@ -610,7 +632,7 @@ def __init__(self, self.holdout_validators = HoldOutFuncs.get_holdout_validators( HoldoutValTypes.time_series_hold_out_validation) - self.splits = self.get_splits_from_resampling_strategy() + self.splits = 
self.get_splits_from_resampling_strategy() # type: ignore[assignment] # TODO doing experiments to give the most proper way of defining these two values if lagged_value is None: @@ -664,7 +686,7 @@ def compute_time_features(start_times: List[pd.DatetimeIndex], series in a dataset share the same start time, we could only compute the features for longest possible series and reuse them """ - series_lengths_max = {} + series_lengths_max: Dict[pd.DatetimeIndex, int] = {} for start_t, seq_l in zip(start_times, seq_lengths): if start_t not in series_lengths_max or seq_l > series_lengths_max[start_t]: series_lengths_max[start_t] = seq_l @@ -687,26 +709,28 @@ def _get_dataset_indices(self, idx: int, only_dataset_idx: bool = False) -> Unio sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] return dataset_idx, sample_idx - def __len__(self): + def __len__(self) -> int: return ConcatDataset.__len__(self) - def __getitem__(self, idx, train=True): - dataset_idx, sample_idx = self._get_dataset_indices(idx) + def __getitem__(self, idx: int, # type: ignore[override] + train: bool = True) -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, torch.Tensor]]]: + dataset_idx, sample_idx = self._get_dataset_indices(idx) # type: ignore[misc] return self.datasets[dataset_idx].__getitem__(sample_idx, train) - def get_validation_set(self, idx): - dataset_idx, sample_idx = self._get_dataset_indices(idx) + def get_validation_set(self, idx: int) -> TimeSeriesSequence: + dataset_idx, sample_idx = self._get_dataset_indices(idx) # type: ignore[misc] return self.datasets[dataset_idx].get_val_seq_set(sample_idx) - def get_time_series_seq(self, idx) -> TimeSeriesSequence: + def get_time_series_seq(self, idx: int) -> TimeSeriesSequence: dataset_idx = self._get_dataset_indices(idx, True) - return self.datasets[dataset_idx] + return self.datasets[dataset_idx] # type: ignore[index] def get_test_target(self, test_indices: np.ndarray) -> np.ndarray: test_indices = np.where(test_indices < 0, test_indices + len(self), test_indices) y_test = np.ones([len(test_indices), self.n_prediction_steps, self.num_targets]) y_test_argsort = np.argsort(test_indices) - dataset_idx = self._get_dataset_indices(test_indices[y_test_argsort[0]], only_dataset_idx=True) + dataset_idx: int = self._get_dataset_indices(test_indices[y_test_argsort[0]], # type: ignore[assignment] + only_dataset_idx=True) for y_i in y_test_argsort: test_idx = test_indices[y_i] @@ -726,8 +750,11 @@ def transform_data_into_time_series_sequence(self, Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, Y_test: Optional[ Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, - is_test_set: bool = False) -> [ - + is_test_set: bool = False,) -> Tuple[ + List[TimeSeriesSequence], + Tuple[Optional[pd.DataFrame], pd.DataFrame], + Optional[Tuple[Optional[pd.DataFrame], pd.DataFrame]], + List[int] ]: """ Transform the raw data into a list of TimeSeriesSequence that can be processed by AutoPyTorch Time Series @@ -746,13 +773,14 @@ def transform_data_into_time_series_sequence(self, flattened test target array with size N_all (the sum of all the series sequences) and number of targets is_test_set: Optional[List[pd.DatetimeIndex]] if the genereated sequecne used for test - sequences_kwargs: Dict - additional arguments for test sets Returns: sequence_datasets : List[TimeSeriesSequence] a list of datasets - train_tensors: Tuple[List[np.ndarray], List[np.ndarray]] + train_tensors: Tuple[Optional[pd.DataFrame], pd.DataFrame] training tensors + test_tensors: 
Optional[Tuple[Optional[pd.DataFrame], pd.DataFrame]] + test tensors + """ dataset_with_future_features = X is not None and len(self.known_future_features) > 0 X, Y, sequence_lengths = self.validator.transform(X, Y) @@ -765,7 +793,7 @@ def transform_data_into_time_series_sequence(self, X_test, Y_test, _ = self.validator.transform(X_test, Y_test, validate_for_future_features=dataset_with_future_features) - y_groups = Y.groupby(Y.index) + y_groups: pd.DataFrameGroupBy = Y.groupby(Y.index) # type: ignore[union-attr] if self.normalize_y: mean = y_groups.agg("mean") std = y_groups.agg("std") @@ -790,11 +818,11 @@ def transform_data_into_time_series_sequence(self, def make_sequences_datasets(X: Optional[pd.DataFrame], Y: pd.DataFrame, start_times: List[pd.DatetimeIndex], - time_features: Optional[Dict[pd.Timestamp, np.ndarray]] = None, + time_features: Optional[Dict[pd.DatetimeIndex, np.ndarray]] = None, X_test: Optional[pd.DataFrame] = None, Y_test: Optional[pd.DataFrame] = None, is_test_set: bool = False, - **sequences_kwargs: Optional[Dict]) -> Tuple[ + **sequences_kwargs: Any) -> Tuple[ List[TimeSeriesSequence], Tuple[Optional[pd.DataFrame], pd.DataFrame], Optional[Tuple[Optional[pd.DataFrame], pd.DataFrame]] @@ -856,7 +884,7 @@ def make_sequences_datasets(X: Optional[pd.DataFrame], start_time=start_time, X_test=x_test_ser, Y_test=y_test_ser, - time_features=time_features[start_time][:len(y_ser)], + time_features=time_features[start_time][:len(y_ser)] if time_features is not None else None, is_test_set=is_test_set, **sequences_kwargs) sequence_datasets.append(sequence) @@ -915,16 +943,16 @@ def update_transform(self, transform: Optional[torchvision.transforms.Compose], return self @property - def transform_time_features(self): + def transform_time_features(self) -> bool: return self._transform_time_features @transform_time_features.setter - def transform_time_features(self, value: bool): - self._transform_time_feature = value + def transform_time_features(self, value: bool) -> None: + self._transform_time_features = value for seq in self.datasets: seq.transform_time_features = value - def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]]]: + def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], Optional[List[int]]]]: """ Creates a set of splits based on a resampling strategy provided, apart from the 'get_splits_from_resampling_strategy' implemented in base_dataset, here we will get self.upper_sequence_length @@ -955,7 +983,7 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] else: n_repeats = 1 # Create the split if it was not created before - splits.extend(self.create_cross_val_splits( + splits.extend(self.create_cross_val_splits( # type: ignore[arg-type] cross_val_type=self.resampling_strategy, num_splits=cast(int, num_splits), n_repeats=n_repeats @@ -964,7 +992,7 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] splits.append(self.create_refit_split()) else: raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") - return splits + return splits # type: ignore[return-value] def get_required_dataset_info(self) -> Dict[str, Any]: """ @@ -984,7 +1012,7 @@ def get_required_dataset_info(self) -> Dict[str, Any]: def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> Dict[str, Any]: dataset_properties = super().get_dataset_properties(dataset_requirements=dataset_requirements) dataset_properties.update({'n_prediction_steps': 
self.n_prediction_steps, - 'sp': self.seasonality, # For metric computation, + 'sp': self.seasonality, # For metric computation 'input_shape': self.input_shape, 'time_feature_transform': self.time_feature_transform, 'uni_variant': self.is_uni_variant, @@ -1000,8 +1028,8 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> def get_split_strategy(sequence_lengths: List[int], n_prediction_steps: int, freq_value: Real, - resampling_strategy: Optional[Union[ - CrossValTypes, HoldoutValTypes]] = HoldoutValTypes.time_series_hold_out_validation, + resampling_strategy: Union[ + CrossValTypes, HoldoutValTypes] = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, ) -> \ Tuple[Union[CrossValTypes, HoldoutValTypes], Optional[Dict[str, Any]]]: """ @@ -1099,7 +1127,7 @@ def create_cross_val_splits( self, cross_val_type: CrossValTypes, num_splits: int, - n_repeats=1, + n_repeats: int = 1, ) -> List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]: """ This function creates the cross validation split for the given task. @@ -1127,11 +1155,10 @@ def create_cross_val_splits( kwargs = {"n_prediction_steps": self.n_prediction_steps} if cross_val_type == CrossValTypes.time_series_ts_cross_validation: seasonality_h_value = int(np.round((self.n_prediction_steps // int(self.freq_value) + 1) * self.freq_value)) - kwargs.update({'seasonality_h_value': seasonality_h_value, - 'freq_value': self.freq_value}) + kwargs.update({'seasonality_h_value': seasonality_h_value}) kwargs["n_repeats"] = n_repeats - splits = [[() for _ in range(len(self.datasets))] for _ in range(num_splits)] + splits: List[List[Tuple]] = [[() for _ in range(len(self.datasets))] for _ in range(num_splits)] for idx_seq, dataset in enumerate(self.datasets): split = self.cross_validators[cross_val_type.name](self.random_state, @@ -1147,11 +1174,11 @@ def create_cross_val_splits( # first_split = [([0], [1]), ([2], [3])] .... 
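The comment above describes how each fold stores one (train, test) pair per sequence; as a toy illustration only (the indices are invented, the real pairs come from the per-sequence cross validators), this is how such pairs are flattened into the two global index arrays used below:

    import numpy as np

    # one fold, two sequences: (train_indices, test_indices) per sequence,
    # expressed as positions inside the concatenated dataset
    fold = [(np.array([0, 1, 2]), np.array([3])),      # sequence 0
            (np.array([10, 11, 12]), np.array([13]))]  # sequence 1

    train_indices = np.hstack([sp[0] for sp in fold])  # -> [ 0  1  2 10 11 12]
    test_indices = np.hstack([sp[1] for sp in fold])   # -> [ 3 13]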
splits_merged = [] for i in range(num_splits): - split = splits[i] + split = splits[i] # type: ignore[assignment] train_indices = np.hstack([sp[0] for sp in split]) test_indices = np.hstack([sp[1] for sp in split]) splits_merged.append((train_indices, test_indices)) - return splits_merged + return splits_merged # type: ignore[return-value] def create_holdout_val_split( self, @@ -1191,7 +1218,7 @@ def create_holdout_val_split( indices=np.arange(len(dataset)) + idx_start, **kwargs) for idx_split in range(2): - splits[idx_split][idx_seq] = split[idx_split] + splits[idx_split][idx_seq] = split[idx_split] # type: ignore[call-overload] idx_start += self.sequence_lengths_train[idx_seq] train_indices = np.hstack([sp for sp in splits[0]]) @@ -1228,7 +1255,7 @@ def create_refit_split( def create_refit_set(self) -> "TimeSeriesForecastingDataset": refit_set: TimeSeriesForecastingDataset = copy.deepcopy(self) - refit_set.resampling_strategy = None + refit_set.resampling_strategy = None # type: ignore[assignment] refit_set.splits = refit_set.get_splits_from_resampling_strategy() return refit_set diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py index 37b05d209..78d4b4dc4 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py @@ -66,6 +66,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: X_train = X['X_train'] else: X_train = X['backend'].load_datamanager().train_tensors[0] + self.preprocessor.fit(X_train) return self diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py index cb616dfb9..c69008b86 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py @@ -24,7 +24,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> TimeSeriesBaseEncoder: feature_shapes = X['dataset_properties']['feature_shapes'] if len(n_features_cat) == 0: - n_features_cat = self.preprocessor['categorical'].categories + n_features_cat = self.preprocessor['categorical'].categories # type: ignore for i, cat_column in enumerate(categorical_columns): feature_shapes[feature_names[cat_column]] = len(n_features_cat[i]) self.feature_shapes = feature_shapes diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py index 8299ab62a..381ebb22d 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Tuple, Union from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder import BaseEncoder from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( @@ -19,7 +19,7 @@ def 
__init__(self) -> None: FitRequirement('feature_names', (Tuple,), user_defined=True, dataset_property=True), FitRequirement('feature_shapes', (Dict, ), user_defined=True, dataset_property=True), ]) - self.feature_shapes = {} + self.feature_shapes: Union[Dict[str, int]] = {} def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index afcca204c..f80140683 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -11,7 +11,7 @@ class TimeSeriesScaler(BaseEstimator): def __init__(self, mode: str, dataset_is_small_preprocess: bool = True, - static_features: Tuple[Union[str, int]] = ()): + static_features: Union[Tuple[Union[str, int], ...], Tuple[()]] = ()): self.mode = mode self.dataset_is_small_preprocess = dataset_is_small_preprocess self.static_features = static_features @@ -21,13 +21,14 @@ def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Any = None) -> "TimeSeriesS The transformer is transformed on the fly (for each batch) """ if self.dataset_is_small_preprocess: + if not isinstance(X, pd.DataFrame): + raise ValueError(f'Scaling that works on small_preprocess dataset must work with pd.DataFrame.' + f'However, it gets {type(X)}') + static_features = [static_fea for static_fea in self.static_features if static_fea in X.columns] - else: - static_features = [static_fea for static_fea in self.static_features if static_fea < X.shape[1]] - self.static_features = static_features + self.static_features = static_features # type: ignore[assignment] - if self.mode == "standard": - if self.dataset_is_small_preprocess: + if self.mode == "standard": X_grouped = X.groupby(X.index) self.loc = X_grouped.agg("mean") @@ -41,15 +42,7 @@ def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Any = None) -> "TimeSeriesS # ensure that if all the values are the same in a group, we could still normalize them correctly self.scale[self.scale == 0] = 1. - else: - # in this case X is a np array - self.loc = X.mean(axis=0, keepdims=True) - self.scale = np.nan_to_num(X.std(axis=0, ddof=1, keepdims=True)) - self.scale = np.where(self.scale == 0, self.loc, self.scale) - self.scale[self.scale == 0] = 1. 
- - elif self.mode == "min_max": - if self.dataset_is_small_preprocess: + elif self.mode == "min_max": X_grouped = X.groupby(X.index) min_ = X_grouped.agg("min") max_ = X_grouped.agg("max") @@ -63,54 +56,38 @@ def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Any = None) -> "TimeSeriesS self.scale.mask(self.scale == 0.0, self.loc) self.scale[self.scale == 0.0] = 1.0 - else: - min_ = X.min(axis=0, keepdims=True) - max_ = X.max(axis=0, keepdims=True) - - diff_ = max_ - min_ - self.loc = min_ - self.scale = diff_ - self.scale = np.where(self.scale == 0., self.loc, self.scale) - self.scale[self.scale == 0.0] = 1.0 - - elif self.mode == "max_abs": - if self.dataset_is_small_preprocess: + elif self.mode == "max_abs": X_abs = X.transform("abs") max_abs_ = X_abs.groupby(X_abs.index).agg("max") max_abs_[self.static_features] = max_abs_[self.static_features].max() - else: - X_abs = np.abs(X) - max_abs_ = X_abs.max(0, keepdims=True) - max_abs_[max_abs_ == 0.0] = 1.0 - self.loc = None - self.scale = max_abs_ + max_abs_[max_abs_ == 0.0] = 1.0 + self.loc = None + self.scale = max_abs_ - elif self.mode == 'mean_abs': - if self.dataset_is_small_preprocess: + elif self.mode == 'mean_abs': X_abs = X.transform("abs") X_abs = X_abs.groupby(X_abs.index) mean_abs_ = X_abs.agg("mean") mean_abs_[self.static_features] = mean_abs_[self.static_features].mean() self.scale = mean_abs_.mask(mean_abs_ == 0.0, X_abs.agg("max")) - else: - X_abs = np.abs(X) - mean_abs_ = X_abs.mean(0, keepdims=True) - self.scale = np.where(mean_abs_ == 0.0, np.max(X_abs), mean_abs_) - self.scale[self.scale == 0] = 1 - self.loc = None + self.scale[self.scale == 0] = 1 + self.loc = None - elif self.mode == "none": - self.loc = None - self.scale = None + elif self.mode == "none": + self.loc = None + self.scale = None + else: + raise ValueError(f"Unknown mode {self.mode} for time series scaler") else: - raise ValueError(f"Unknown mode {self.mode} for time series scaler") + static_features = [static_fea for static_fea in self.static_features if static_fea < X.shape[1]] + self.static_features = static_features # type: ignore[assignment] return self - def transform(self, X: Union[pd.DataFrame, np.ndarray]) -> Tuple[np.ndarray, ...]: + def transform(self, X: Union[pd.DataFrame, np.ndarray]) -> Union[pd.DataFrame, np.ndarray]: """ X = sklearn.utils.check_array( X, @@ -121,6 +98,48 @@ def transform(self, X: Union[pd.DataFrame, np.ndarray]) -> Tuple[np.ndarray, ... accept_large_sparse=False ) # type: np.ndarray """ + if not self.dataset_is_small_preprocess: + if not isinstance(X, np.ndarray): + raise ValueError(f'Scaling that works on none-small_preprocess dataset must work with np.ndarray.' + f'However, it gets {type(X)}') + if self.mode == 'standard': + # in this case X is a np array + loc = X.mean(axis=0, keepdims=True) + scale = np.nan_to_num(X.std(axis=0, ddof=1, keepdims=True)) + scale = np.where(scale == 0, loc, scale) + scale[scale == 0] = 1. 
+ return (X - loc) / scale + + elif self.mode == 'min_max': + min_ = X.min(axis=0, keepdims=True) + max_ = X.max(axis=0, keepdims=True) + + diff_ = max_ - min_ + loc = min_ + scale = diff_ + scale = np.where(scale == 0., loc, scale) + scale[scale == 0.0] = 1.0 + return (X - loc) / scale + + elif self.mode == "max_abs": + X_abs = np.abs(X) + max_abs_ = X_abs.max(0, keepdims=True) + max_abs_[max_abs_ == 0.0] = 1.0 + scale = max_abs_ + return X / scale + + elif self.mode == 'mean_abs': + X_abs = np.abs(X) + mean_abs_ = X_abs.mean(0, keepdims=True) + scale = np.where(mean_abs_ == 0.0, np.max(X_abs), mean_abs_) + scale[scale == 0] = 1 + return X / scale + + elif self.mode == "none": + return X + else: + raise ValueError(f"Unknown mode {self.mode} for time series scaler") + if self.mode == "standard": return (X - self.loc) / self.scale diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py index b9037a138..c4c214fb5 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py @@ -17,7 +17,7 @@ def get_preprocess_transforms(X: Dict[str, Any], preprocess_type: Union[Type[aPTPre], Type[aPTTPre]] = aPTPre) \ -> List[Union[Type[aPTPre], Type[aPTTPre]]]: - candidate_transforms: List[preprocess_type] = list() + candidate_transforms = [] for key, value in X.items(): if isinstance(value, preprocess_type): candidate_transforms.append(copy.deepcopy(value)) diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMaxAbsScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMaxAbsScaler.py index e79bb95e4..8245c01bd 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMaxAbsScaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMaxAbsScaler.py @@ -5,7 +5,7 @@ class TargetMaxAbsScaler(BaseTargetScaler): @property - def scaler_mode(self): + def scaler_mode(self) -> str: return 'max_abs' @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMeanAbsScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMeanAbsScaler.py index cce8cbc08..32add9b32 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMeanAbsScaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMeanAbsScaler.py @@ -5,7 +5,7 @@ class TargetMeanAbsScaler(BaseTargetScaler): @property - def scaler_mode(self): + def scaler_mode(self) -> str: return 'mean_abs' @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMinMaxScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMinMaxScaler.py index d345d4334..924e960b0 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMinMaxScaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMinMaxScaler.py @@ -5,7 +5,7 @@ class TargetMinMaxScaler(BaseTargetScaler): @property - def scaler_mode(self): + def scaler_mode(self) -> str: return 'min_max' @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetNoScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetNoScaler.py index 1c57b1ea1..760dd2afe 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetNoScaler.py 
+++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetNoScaler.py @@ -5,7 +5,7 @@ class TargetNoScaler(BaseTargetScaler): @property - def scaler_mode(self): + def scaler_mode(self) -> str: return 'none' @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py index 0dfe15f10..2b6f9a743 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py @@ -5,7 +5,7 @@ class TargetStandardScaler(BaseTargetScaler): @property - def scaler_mode(self): + def scaler_mode(self) -> str: return 'standard' @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py index 44950a654..fce48a271 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py @@ -6,6 +6,7 @@ from ConfigSpace.configuration_space import ConfigurationSpace from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, autoPyTorchComponent, @@ -47,16 +48,14 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: return components def get_hyperparameter_search_space(self, - dataset_properties: Optional[Dict[str, Any]] = None, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, default: Optional[str] = None, include: Optional[List[str]] = None, exclude: Optional[List[str]] = None) -> ConfigurationSpace: cs = ConfigurationSpace() if dataset_properties is None: - dataset_properties = dict() - - dataset_properties = {**self.dataset_properties, **dataset_properties} + dataset_properties: Dict[str, BaseDatasetPropertiesType] = self.dataset_properties # type: ignore available_scalers = self.get_available_components(dataset_properties=dataset_properties, include=include, @@ -89,5 +88,5 @@ def get_hyperparameter_search_space(self, parent_hyperparameter=parent_hyperparameter) self.configuration_space = cs - self.dataset_properties = dataset_properties + self.dataset_properties = dataset_properties # type: ignore[assignment] return cs diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py index 3ebfac000..5318c1ca4 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py @@ -35,7 +35,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: return self @property - def scaler_mode(self): + def scaler_mode(self) -> str: raise NotImplementedError def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py index b1719d45c..12fcd4410 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py +++ 
b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py @@ -19,7 +19,7 @@ class DistributionLoss(ForecastingLossComponents): loss = LogProbLoss - net_output_type = 'distribution' + net_output_type = 'distribution' # type: ignore[assignment] def __init__(self, dist_cls: str, diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py index b71d2b65a..34d0e576f 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py @@ -26,7 +26,7 @@ def __init__(self, self.quantiles = [0.5, lower_quantile, upper_quantile] # To make it compatible with # autoPyTorch.pipeline.components.training.trainer.forecasting_trainer.forecasting_base_trainer - self.loss = partial(QuantileLoss, quantiles=self.quantiles) + self.loss = partial(QuantileLoss, quantiles=self.quantiles) # type: ignore[assignment] def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({"quantile_values": self.quantiles}) diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py index 53c6788b8..32e5c0443 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py @@ -1,10 +1,9 @@ import os -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Any from collections import OrderedDict from ConfigSpace.configuration_space import ConfigurationSpace import ConfigSpace.hyperparameters as CSH -import numpy as np from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice @@ -193,6 +192,6 @@ def get_hyperparameter_search_space( self.dataset_properties_ = dataset_properties return cs - def transform(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py index c49e56263..6133ceae1 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py @@ -1,4 +1,4 @@ -from typing import Dict, Any +from typing import Dict, Any, Optional, Callable from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent @@ -8,8 +8,8 @@ class ForecastingLossComponents(autoPyTorchComponent): _required_properties = ["name", "handles_tabular", "handles_image", "handles_time_series", 'handles_regression', 'handles_classification'] - loss = None - net_output_type = None + loss: Optional[Callable] = None + net_output_type: Optional[str] = None def __init__(self, **kwargs: Any): diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py index fe2a1732b..e191d3f76 100644 --- a/autoPyTorch/pipeline/components/setup/network/base_network.py +++ b/autoPyTorch/pipeline/components/setup/network/base_network.py @@ 
-1,4 +1,4 @@ -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Optional, Union, List from ConfigSpace.configuration_space import ConfigurationSpace @@ -32,7 +32,7 @@ def __init__( self.final_activation: Optional[torch.nn.Module] = None @property - def _required_fit_requirements(self): + def _required_fit_requirements(self) -> List[FitRequirement]: return [ FitRequirement("network_head", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_backbone", (torch.nn.Module,), user_defined=False, dataset_property=False), diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index d117d9b2f..61ac6c6d1 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional, Union, Tuple, List +from typing import Dict, Optional, Union, Tuple, List, Any, TypeVar from abc import abstractmethod @@ -27,6 +27,8 @@ DecoderBlockInfo ) +ALL_NET_OUTPUT = TypeVar('ALL_NET_OUTPUT', torch.Tensor, List[torch.Tensor], torch.distributions.Distribution) + class TransformedDistribution_(TransformedDistribution): """ @@ -34,7 +36,7 @@ class TransformedDistribution_(TransformedDistribution): """ @property - def mean(self): + def mean(self) -> torch.Tensor: mean = self.base_dist.mean for transform in self.transforms: mean = transform(mean) @@ -114,7 +116,7 @@ def get_lagged_subsequences( def get_lagged_subsequences_inference( sequence: torch.Tensor, subsequences_length: int, - lags_seq: Optional[List[int]] = None, ): + lags_seq: List[int]) -> torch.Tensor: """ this function works exactly the same as get_lagged_subsequences. However, this implementation is faster when no cached value is available, thus it more suitable during inference times. @@ -155,20 +157,20 @@ def __init__(self, network_encoder: Dict[str, EncoderBlockInfo], network_decoder: Dict[str, DecoderBlockInfo], temporal_fusion: Optional[TemporalFusionLayer], - network_head: Optional[nn.Module], + network_head: nn.Module, window_size: int, target_scaler: BaseTargetScaler, dataset_properties: Dict, auto_regressive: bool, - feature_names: Optional[Tuple[str]] = (), - known_future_features: Optional[Tuple[str]] = (), - feature_shapes: Optional[Dict[str, int]] = (), - static_features: Tuple[Union[str, int]] = (), - time_feature_names: Optional[Tuple[str]] = (), + feature_names: Union[Tuple[str], Tuple[()]] = (), + known_future_features: Union[Tuple[str], Tuple[()]] = (), + feature_shapes: Dict[str, int] = {}, + static_features: Union[Tuple[str], Tuple[()]] = (), + time_feature_names: Union[Tuple[str], Tuple[()]] = (), output_type: str = 'regression', forecast_strategy: Optional[str] = 'mean', - num_samples: Optional[int] = 100, - aggregation: Optional[str] = 'mean' + num_samples: int = 50, + aggregation: str = 'mean' ): """ This is a basic forecasting network. 
It is only composed of a embedding net, an encoder and a head (including @@ -225,16 +227,19 @@ def __init__(self, encoder_info=network_encoder, decoder_info=network_decoder) if has_temporal_fusion: + if temporal_fusion is None: + raise ValueError("When the network structure uses temporal fusion layer, " + "temporal_fusion must be given!") self.temporal_fusion = temporal_fusion # type: TemporalFusionLayer self.lazy_modules.append(self.temporal_fusion) self.has_temporal_fusion = has_temporal_fusion self.head = network_head - first_decoder = 0 + first_decoder = 'block_0' for i in range(1, network_structure.num_blocks + 1): block_number = f'block_{i}' if block_number in network_decoder: - if first_decoder == 0: + if first_decoder == 'block_0': first_decoder = block_number if first_decoder == 0: @@ -267,21 +272,21 @@ def __init__(self, self.decoder_lagged_value = network_decoder[first_decoder].decoder.lagged_value @property - def device(self): + def device(self) -> torch.device: return self._device @device.setter - def device(self, device: torch.device): + def device(self, device: torch.device) -> None: self.to(device) self._device = device for model in self.lazy_modules: model.device = device def rescale_output(self, - outputs: Union[torch.distributions.Distribution, torch.Tensor, List[torch.Tensor]], + outputs: ALL_NET_OUTPUT, loc: Optional[torch.Tensor], scale: Optional[torch.Tensor], - device: torch.device = torch.device('cpu')): + device: torch.device = torch.device('cpu')) -> ALL_NET_OUTPUT: if isinstance(outputs, List): return [self.rescale_output(output, loc, scale, device) for output in outputs] if loc is not None or scale is not None: @@ -292,7 +297,7 @@ def rescale_output(self, outputs = TransformedDistribution_(outputs, [transform]) else: if loc is None: - outputs = outputs * scale.to(device) + outputs = outputs * scale.to(device) # type: ignore[union-attr] elif scale is None: outputs = outputs + loc.to(device) else: @@ -300,13 +305,13 @@ def rescale_output(self, return outputs def scale_value(self, - outputs: Union[torch.distributions.Distribution, torch.Tensor], + outputs: torch.Tensor, loc: Optional[torch.Tensor], scale: Optional[torch.Tensor], - device: torch.device = torch.device('cpu')): + device: torch.device = torch.device('cpu')) -> torch.Tensor: if loc is not None or scale is not None: if loc is None: - outputs = outputs / scale.to(device) + outputs = outputs / scale.to(device) # type: ignore[union-attr] elif scale is None: outputs = outputs - loc.to(device) else: @@ -319,13 +324,13 @@ def forward(self, future_targets: Optional[torch.Tensor] = None, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, - past_observed_targets: Optional[torch.Tensor] = None, + past_observed_targets: Optional[torch.BoolTensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, - ): + ) -> ALL_NET_OUTPUT: raise NotImplementedError @abstractmethod - def pred_from_net_output(self, net_output): + def pred_from_net_output(self, net_output: ALL_NET_OUTPUT) -> torch.Tensor: raise NotImplementedError @abstractmethod @@ -334,7 +339,7 @@ def predict(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.Tensor] = None, - ): + ) -> torch.Tensor: raise NotImplementedError def repeat_intermediate_values(self, @@ -345,13 +350,13 @@ def repeat_intermediate_values(self, if isinstance(inter_value, torch.Tensor): repeated_value = inter_value.repeat_interleave(repeats=repeats, 
dim=1 if is_hx else 0) intermediate_values[i] = repeated_value - elif isinstance(inter_value, Tuple): + elif isinstance(inter_value, tuple): dim = 1 if is_hx else 0 repeated_value = tuple(hx.repeat_interleave(repeats=repeats, dim=dim) for hx in inter_value) intermediate_values[i] = repeated_value return intermediate_values - def pad_tensor(self, tensor_to_be_padded: torch.Tensor, target_length) -> torch.Tensor: + def pad_tensor(self, tensor_to_be_padded: torch.Tensor, target_length: int) -> torch.Tensor: tensor_shape = tensor_to_be_padded.shape padding_size = [tensor_shape[0], target_length - tensor_shape[1], tensor_shape[-1]] tensor_to_be_padded = torch.cat([tensor_to_be_padded.new_zeros(padding_size), tensor_to_be_padded], dim=1) @@ -367,7 +372,7 @@ def pre_processing(self, length_past: int = 0, length_future: int = 0, variable_selector_kwargs: Dict = {}, - ): + ) -> Tuple[torch.Tensor, ...]: if self.encoder_lagged_input: if self.window_size < past_targets.shape[1]: past_targets[:, -self.window_size:], _, loc, scale = self.target_scaler( @@ -477,7 +482,7 @@ def forward(self, future_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.BoolTensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, - ): + ) -> ALL_NET_OUTPUT: x_past, x_future, x_static, loc, scale, static_context_initial_hidden, _ = self.pre_processing( past_targets=past_targets, past_observed_targets=past_observed_targets, @@ -507,7 +512,7 @@ def forward(self, return self.rescale_output(output, loc, scale, self.device) - def pred_from_net_output(self, net_output): + def pred_from_net_output(self, net_output: ALL_NET_OUTPUT) -> torch.Tensor: if self.output_type == 'regression': return net_output elif self.output_type == 'quantile': @@ -539,7 +544,7 @@ def predict(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.BoolTensor] = None, - ): + ) -> torch.Tensor: net_output = self(past_targets=past_targets, past_features=past_features, future_features=future_features, @@ -549,21 +554,19 @@ def predict(self, class ForecastingSeq2SeqNet(ForecastingNet): future_target_required = True + """ + Forecasting network with Seq2Seq structure, Encoder/ Decoder need to be the same recurrent models while + + This structure is activate when the decoder is recurrent (RNN or transformer). + We train the network with teacher forcing, thus + future_targets is required for the network. To train the network, past targets and past features are fed to the + encoder to obtain the hidden states whereas future targets and future features. + When the output type is distribution and forecast_strategy is sampling, + this model is equivalent to a deepAR model during inference. + """ - def __init__(self, **kwargs): - """ - Forecasting network with Seq2Seq structure, Encoder/ Decoder need to be the same recurrent models while - - This structure is activate when the decoder is recurrent (RNN or transformer). - We train the network with teacher forcing, thus - future_targets is required for the network. To train the network, past targets and past features are fed to the - encoder to obtain the hidden states whereas future targets and future features. - When the output type is distribution and forecast_strategy is sampling, - this model is equivalent to a deepAR model during inference. 
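As a toy picture of the teacher forcing mentioned in the docstring above (purely illustrative; the real decoder comes from network_decoder and all feature handling is omitted), during training the decoder is fed the ground-truth future targets shifted by one step rather than its own predictions:

    import torch

    past_targets = torch.randn(8, 24, 1)     # (B, T_past, 1)
    future_targets = torch.randn(8, 6, 1)    # (B, n_prediction_steps, 1)

    # decoder input at step t is the true target at step t-1
    # (the last observed past value is used for t=0)
    decoder_input = torch.cat([past_targets[:, -1:], future_targets[:, :-1]], dim=1)
    assert decoder_input.shape == future_targets.shape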
- """ - super(ForecastingSeq2SeqNet, self).__init__(**kwargs) - - def decoder_select_variable(self, future_targets: torch.tensor, future_features: Optional[torch.Tensor]): + def decoder_select_variable(self, future_targets: torch.tensor, + future_features: Optional[torch.Tensor]) -> torch.Tensor: batch_size = future_targets.shape[0] length_future = future_targets.shape[1] future_targets = future_targets.to(self.device) @@ -599,7 +602,7 @@ def forward(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.BoolTensor] = None, - decoder_observed_values: Optional[torch.Tensor] = None, ): + decoder_observed_values: Optional[torch.Tensor] = None, ) -> ALL_NET_OUTPUT: x_past, _, x_static, loc, scale, static_context_initial_hidden, past_targets = self.pre_processing( past_targets=past_targets, past_observed_targets=past_observed_targets, @@ -752,7 +755,8 @@ def forward(self, self.variable_selector.cached_static_contex = self.repeat_intermediate_values( [self.variable_selector.cached_static_contex], is_hidden_states=[False], - repeats=self.num_samples)[0] + repeats=self.num_samples + )[0] for idx_pred in range(self.n_prediction_steps): if self.decoder_lagged_input: @@ -813,7 +817,7 @@ def predict(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.BoolTensor] = None, - ): + ) -> torch.Tensor: net_output = self(past_targets=past_targets, past_features=past_features, future_features=future_features, @@ -828,7 +832,7 @@ class ForecastingDeepARNet(ForecastingSeq2SeqNet): future_target_required = True def __init__(self, - **kwargs): + **kwargs: Any): """ Forecasting network with DeepAR structure. 
@@ -849,7 +853,7 @@ def train(self, mode: bool = True) -> nn.Module: def encoder_select_variable(self, past_targets: torch.tensor, past_features: Optional[torch.Tensor], length_past: int, - **variable_selector_kwargs): + **variable_selector_kwargs: Any) -> Tuple[torch.Tensor, Tuple[torch.Tensor]]: batch_size = past_targets.shape[0] past_targets = past_targets.to(self.device) if past_features is not None: @@ -886,10 +890,13 @@ def forward(self, future_targets: Optional[torch.Tensor] = None, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, - past_observed_targets: Optional[torch.Tensor] = None, - decoder_observed_values: Optional[torch.Tensor] = None, ): + past_observed_targets: Optional[torch.BoolTensor] = None, + decoder_observed_values: Optional[torch.Tensor] = None, ) -> ALL_NET_OUTPUT: encode_length = min(self.window_size, past_targets.shape[1]) + if past_observed_targets is None: + past_observed_targets = torch.ones_like(past_targets, dtype=torch.bool) + if self.training: if self.encoder_lagged_input: if self.window_size < past_targets.shape[1]: @@ -927,6 +934,7 @@ def forward(self, if self.network_structure.variable_selection: if past_features is not None: + assert future_features is not None past_features = past_features[:, -self.window_size:] features_all = torch.cat([past_features, future_features[:, :-1]], dim=1) else: @@ -937,6 +945,7 @@ def forward(self, length_past=length_past) else: if past_features is not None: + assert future_features is not None if self.window_size <= past_features.shape[1]: past_features = past_features[:, -self.window_size:] @@ -1010,6 +1019,7 @@ def forward(self, else: if past_features is not None: + assert future_features is not None features_all = torch.cat([past_features[:, -encode_length:], future_features[:, :-1]], dim=1) else: features_all = None @@ -1129,7 +1139,7 @@ def predict(self, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, past_observed_targets: Optional[torch.BoolTensor] = None, - ): + ) -> torch.Tensor: net_output = self(past_targets=past_targets, past_features=past_features, future_features=future_features, @@ -1140,13 +1150,17 @@ def predict(self, class NBEATSNet(ForecastingNet): future_target_required = False - def forward(self, + def forward(self, # type: ignore[override] past_targets: torch.Tensor, future_targets: Optional[torch.Tensor] = None, past_features: Optional[torch.Tensor] = None, future_features: Optional[torch.Tensor] = None, - past_observed_targets: Optional[torch.Tensor] = None, - decoder_observed_values: Optional[torch.Tensor] = None, ): + past_observed_targets: Optional[torch.BoolTensor] = None, + decoder_observed_values: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, + Tuple[torch.Tensor, torch.Tensor]]: + if past_observed_targets is None: + past_observed_targets = torch.ones_like(past_targets, dtype=torch.bool) + if self.window_size <= past_targets.shape[1]: past_targets = past_targets[:, -self.window_size:] past_observed_targets = past_observed_targets[:, -self.window_size:] @@ -1179,5 +1193,5 @@ def forward(self, else: return forecast - def pred_from_net_output(self, net_output: torch.Tensor): + def pred_from_net_output(self, net_output: torch.Tensor) -> torch.Tensor: return net_output diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 490bcb904..7b627214f 100644 --- 
a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Iterable, Tuple +from typing import Any, Dict, Optional, Iterable, Tuple, List from ConfigSpace.configuration_space import ConfigurationSpace @@ -35,7 +35,7 @@ def __init__( super(ForecastingNetworkComponent, self).__init__(network=network, random_state=random_state) @property - def _required_fit_requirements(self): + def _required_fit_requirements(self) -> List[FitRequirement]: return [ FitRequirement('dataset_properties', (Dict,), user_defined=False, dataset_property=True), FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), @@ -162,13 +162,3 @@ def predict(self, loader: torch.utils.data.DataLoader) -> torch.Tensor: Y_batch_preds.append(Y_batch_pred.cpu()) return torch.cat(Y_batch_preds, 0).cpu().numpy() - - @staticmethod - def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - ) -> ConfigurationSpace: - """ - """ - cs = ConfigurationSpace() - - return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py index 59f7aed60..ce1667320 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py @@ -1,6 +1,6 @@ import os from collections import OrderedDict -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Any import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace @@ -195,6 +195,6 @@ def _defaults_network(self) -> List[str]: 'ConvNetImageBackbone', ] - def transform(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index 20fce642e..25ff46db3 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -49,7 +49,7 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: Dict[str, autoPyTorchComponent]: all basebackbone components available as choices for learning rate scheduling """ - return self.default_components + return self.default_components # type: ignore[return-value] def get_available_components( self, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 21f9c6c2e..bc44d918b 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -377,8 +377,8 @@ def __init__(self, ) for _ in range(n_hidden_states)] self.static_context_initial_hidden = nn.ModuleList(static_context_initial_hidden) - self.cached_static_contex = None - self.cached_static_embedding = None + self.cached_static_contex: Optional[torch.Tensor] = None + self.cached_static_embedding: Optional[torch.Tensor] = 
None @property def device(self) -> torch.device: @@ -409,9 +409,13 @@ def forward(self, if len(self.static_input_sizes) > 0: static_embedding, _ = self.static_variable_selection(x_static) else: - assert x_future is not None and x_past is not None - model_dtype = next(iter(x_past.values())).dtype if length_past > 0 else next( - iter(x_future.values())).dtype + if length_past > 0: + assert x_past is not None, "x_past must be given when length_past is greater than 0!" + model_dtype = next(iter(x_past.values())).dtype + else: + assert x_future is not None, "x_future must be given when length_future is greater than 0!" + model_dtype = next(iter(x_future.values())).dtype + static_embedding = torch.zeros( (batch_size, self.hidden_size), dtype=model_dtype, device=self.device ) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py index 671aecabe..471c067ef 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py @@ -141,5 +141,5 @@ def forward(self, x: torch.Tensor, pos_idx: Optional[Tuple[int]] = None) -> torc if pos_idx is None: x = x + self.pe[:, :x.size(1), :] else: - x = x + self.pe[:, pos_idx[0]: pos_idx[1], :] # type: ignore + x = x + self.pe[:, pos_idx[0]: pos_idx[1], :] # type: ignore[misc] return self.dropout(x) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index fe6026fbb..0ee90110a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -76,7 +76,7 @@ def _add_layer(self, layers: List[nn.Module], in_features: int) -> None: layers.append(nn.Dropout(self.dropout_rate)) def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor, - pos_idx: Optional[Tuple[int]] = None) -> Union[torch.Module, Tuple[torch.Module, torch.Module]]: + pos_idx: Optional[Tuple[int]] = None) -> Union[nn.Module, Tuple[nn.Module, nn.Module]]: if self.backcast_head is None and self.forecast_head is None: # used to compute head dimensions return self.backbone(encoder_output) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index 8b9cff22b..9c885c35f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -50,7 +50,7 @@ def __init__(self, self.decoder_choice: Optional[List[BaseForecastingDecoder]] = None @abstractmethod - def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: + def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: # type: ignore[override] """Returns the available backbone components Args: @@ -73,12 +73,12 @@ def additional_components(self) -> List[Callable]: # This function is deigned to add additional components rather 
than the components in __choice__ return [self.get_decoder_components] - def get_available_components( + def get_available_components( # type: ignore[override] self, dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, include: List[str] = None, exclude: List[str] = None, - components: Optional[Dict[str, autoPyTorchComponent]] = None + components: Optional[Dict[str, Type[autoPyTorchComponent]]] = None ) -> Dict[str, Type[autoPyTorchComponent]]: """Filters out components based on user provided include/exclude directives, as well as the dataset properties @@ -90,6 +90,7 @@ def get_available_components( to remove from the configuration space dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics of the dataset to guide the pipeline choices of components + components (Optional[Dict[str, Type[autoPyTorchComponent]]]): components Returns: Dict[str, autoPyTorchComponent]: A filtered dict of learning @@ -106,7 +107,7 @@ def get_available_components( if components is None: available_comp = self.get_components() else: - available_comp = components + available_comp = components # type: ignore[assignment] if include is not None: for incl in include: @@ -214,9 +215,9 @@ def get_hyperparameter_search_space( encoder2decoder: Dict[str, List[str]] = {} for encoder_name in hp_encoder.choices: updates = self._get_search_space_updates(prefix=encoder_name) - config_space = available_encoders[encoder_name].get_hyperparameter_search_space(dataset_properties, - # type: ignore - **updates) + config_space = available_encoders[encoder_name].get_hyperparameter_search_space( + dataset_properties, # type: ignore + **updates) parent_hyperparameter = {'parent': hp_encoder, 'value': encoder_name} cs.add_configuration_space( encoder_name, @@ -242,9 +243,10 @@ def get_hyperparameter_search_space( if not decoder2encoder[decoder_name]: continue updates = self._get_search_space_updates(prefix=decoder_name) - config_space = available_decoders[decoder_name].get_hyperparameter_search_space(dataset_properties, - # type: ignore - **updates) + config_space = available_decoders[decoder_name].get_hyperparameter_search_space( + dataset_properties, # type: ignore + **updates + ) compatible_encoders = decoder2encoder[decoder_name] encoders_with_multi_decoder = [] encoder_with_uni_decoder = [] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py index 20843d8cb..217bfecd5 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py @@ -159,7 +159,7 @@ def get_hyperparameter_search_space( # type: ignore default_value=0.1, ), ) -> ConfigurationSpace: - cs = MLPBackbone.get_hyperparameter_search_space(dataset_properties=dataset_properties, + cs = MLPBackbone.get_hyperparameter_search_space(dataset_properties=dataset_properties, # type: ignore num_groups=num_groups, activation=activation, use_dropout=use_dropout, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py index f48599f1b..fe44e1a7e 100644 
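For context on the `add_configuration_space` calls reshaped in the hunk above: each encoder's sub-space is registered under a prefix and is only active when the parent `__choice__` hyperparameter selects that encoder. A minimal, self-contained sketch of the pattern (the encoder names and the `num_layers` hyperparameter below are invented for illustration):

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter

cs = ConfigurationSpace()
hp_encoder = CategoricalHyperparameter('__choice__', ['RNNEncoder', 'MLPEncoder'])
cs.add_hyperparameter(hp_encoder)

# hypothetical sub-space for one encoder, attached under its name as prefix and
# conditioned on the parent choice hyperparameter taking that value
sub_cs = ConfigurationSpace()
sub_cs.add_hyperparameter(UniformIntegerHyperparameter('num_layers', lower=1, upper=4))
cs.add_configuration_space(
    'RNNEncoder',
    sub_cs,
    parent_hyperparameter={'parent': hp_encoder, 'value': 'RNNEncoder'},
)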
--- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py @@ -4,7 +4,7 @@ import os from collections import OrderedDict -from typing import Dict, Union, Optional +from typing import Dict, Union, Optional, Type from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, @@ -27,7 +27,7 @@ def add_encoder(encoder: BaseForecastingEncoder) -> None: class FlatForecastingEncoderChoice(AbstractForecastingEncoderChoice): - def get_components(self) -> Dict[str, autoPyTorchComponent]: + def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: """Returns the available backbone components Args: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index b4ca02302..2d799298c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -1,6 +1,6 @@ import os from collections import OrderedDict -from typing import Dict, Optional, List, Any, Union +from typing import Dict, Optional, List, Any, Union, Type from sklearn.pipeline import Pipeline import inspect @@ -56,7 +56,7 @@ class SeqForecastingEncoderChoice(AbstractForecastingEncoderChoice): deepAR_decoder_prefix = 'block_1' tf_prefix = "temporal_fusion" - def get_components(self) -> Dict[str, autoPyTorchComponent]: + def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: # type: ignore[override] """Returns the available backbone components Args: @@ -71,7 +71,7 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: components.update(_addons.components) return components - def get_hyperparameter_search_space( # type: ignore + def get_hyperparameter_search_space( # type: ignore[override] self, dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py index 0e1436165..5649227b1 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py @@ -64,6 +64,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchComponent: dropout=self.dropout_rate ) self.n_decoder_output_features = 2 ** self.attention_d_model_log + return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({"n_decoder_output_features": self.n_decoder_output_features, diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index 66867847e..66ed8013b 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ 
b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -34,7 +34,7 @@ def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_n # list of number of categories of categorical data # or 0 for numerical data self.num_input_features = num_input_features - categorical_features = self.num_input_features > 0 + categorical_features: np.ndarray = self.num_input_features > 0 self.num_categorical_features = self.num_input_features[categorical_features] @@ -119,7 +119,7 @@ def __init__(self, num_input_features: np.ndarray, num_numerical_features: int, embed_features: List[bool], - num_output_dimensions: Optional[List[int]], + num_output_dimensions: List[int], ee_layers: nn.Module ): super(_LearnedEntityEmbedding, self).__init__() @@ -127,7 +127,7 @@ def __init__(self, # list of number of categories of categorical data # or 0 for numerical data self.num_input_features = num_input_features - categorical_features = self.num_input_features > 0 + categorical_features: np.ndarray = self.num_input_features > 0 self.num_categorical_features = self.num_input_features[categorical_features] diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py b/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py index 381e0735d..2add25b3f 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py @@ -1,12 +1,10 @@ import os from collections import OrderedDict -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Any import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace -import numpy as np - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( @@ -205,6 +203,6 @@ def get_hyperparameter_search_space( self.dataset_properties_ = dataset_properties return cs - def transform(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network_head/__init__.py b/autoPyTorch/pipeline/components/setup/network_head/__init__.py index 84ca63b87..a6556afbe 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_head/__init__.py @@ -1,6 +1,6 @@ import os from collections import OrderedDict -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Any import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace @@ -191,6 +191,6 @@ def get_hyperparameter_search_space( self.dataset_properties_ = dataset_properties return cs - def transform(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py b/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py index fc878b669..100a9c5e0 100644 --- a/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py +++ 
b/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py @@ -1,12 +1,10 @@ import os from collections import OrderedDict -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Any import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace -import numpy as np - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( @@ -175,6 +173,6 @@ def get_hyperparameter_search_space( self.dataset_properties_ = dataset_properties return cs - def transform(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/optimizer/__init__.py b/autoPyTorch/pipeline/components/setup/optimizer/__init__.py index 010cbad81..ae31a58af 100644 --- a/autoPyTorch/pipeline/components/setup/optimizer/__init__.py +++ b/autoPyTorch/pipeline/components/setup/optimizer/__init__.py @@ -1,12 +1,10 @@ import os from collections import OrderedDict -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Any import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace -import numpy as np - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( @@ -177,6 +175,6 @@ def get_hyperparameter_search_space( self.dataset_properties_ = dataset_properties return cs - def transform(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/training/base_training.py b/autoPyTorch/pipeline/components/training/base_training.py index ebf7ccbc4..48d0a3e06 100644 --- a/autoPyTorch/pipeline/components/training/base_training.py +++ b/autoPyTorch/pipeline/components/training/base_training.py @@ -12,15 +12,15 @@ class autoPyTorchTrainingComponent(autoPyTorchComponent): def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None: super(autoPyTorchTrainingComponent, self).__init__(random_state=random_state) - def transform(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: Dict) -> Dict: """The transform function calls the transform function of the underlying model and returns the transformed array. 
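The repeated signature changes above (from `np.ndarray` to `Dict[str, Any]`) all reflect the same convention: choice components pass a shared fit dictionary through the pipeline and augment it in `transform`, as the TemporalFusion component does. A rough sketch of that pattern, with an invented component and a fixed placeholder value:

from typing import Any, Dict

class ExampleSetupComponent:
    def fit(self, X: Dict[str, Any], y: Any = None) -> 'ExampleSetupComponent':
        # in the real components this value is derived from hyperparameters and dataset properties
        self.n_decoder_output_features = 16
        return self  # fit returns self so the next pipeline step can call transform

    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
        # augment the fit dictionary instead of transforming a raw array
        X.update({'n_decoder_output_features': self.n_decoder_output_features})
        return X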
Args: - X (np.ndarray): input features + X (Dict): input features Returns: - np.ndarray: Transformed features + Dict: Transformed features """ raise NotImplementedError() diff --git a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py index b1ee41bb2..483ac98d4 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py @@ -138,7 +138,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: return self - def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: int = np.inf, + def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: int = np.iinfo(np.int32).max, ) -> torch.utils.data.DataLoader: """ Creates a data loader object from the provided data, diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 60bd84b86..7c17a308a 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -1,8 +1,7 @@ -from typing import Any, Dict, Optional, Union, Tuple, List, Callable +from typing import Any, Dict, Optional, Union, Tuple, List, Callable, Iterator import warnings from functools import partial - from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, CategoricalHyperparameter from ConfigSpace.conditions import EqualsCondition @@ -10,11 +9,12 @@ import numpy as np import pandas as pd from sklearn.compose import ColumnTransformer - import torch import torchvision +from gluonts.time_feature import TimeFeature + from autoPyTorch.datasets.time_series_dataset import ( TimeSeriesForecastingDataset, TimeSeriesSequence, @@ -53,18 +53,23 @@ def __init__(self, window_size: int = 1, num_batches_per_epoch: Optional[int] = 50, n_prediction_steps: int = 1, - sample_strategy='SeqUniform', - transform_time_features=False, + sample_strategy: str = 'SeqUniform', + transform_time_features: bool = False, random_state: Optional[np.random.RandomState] = None) -> None: """ initialize a dataloader Args: batch_size: batch size - sequence_length: length of each sequence - sample_interval: sample interval ,its value is the interval of the resolution + backcast (bool): if backcast is applied, where window_size is determined on the forecasting horizon + backcast_period (int): backcast period, window_size is computed by horizon * backcast_period + window_size(int): windows size, activate when backcast is false + num_batches_per_epoch (int): number of batches per epoch + n_prediction_steps (int): forecasting horizon + sample_strategy (str): sample strategy, if all the sequences are expected to be sampled with the same size + or all the time steps are expected to be sampled with the same size + transform_time_features (bool): if time features are transformed + random_state (Optional[np.random.RandomState]): random states - num_batches_per_epoch: how - n_prediction_steps: how many steps to predict in advance """ super().__init__(batch_size=batch_size, random_state=random_state) self.backcast = backcast @@ -81,16 +86,16 @@ def __init__(self, # self.subseq_length = self.sample_interval * 
(self.window_size - 1) + 1 self.sample_strategy = sample_strategy self.num_batches_per_epoch = num_batches_per_epoch if num_batches_per_epoch is not None else np.inf - self.padding_collector = None + self.padding_collector: Optional[Callable] = None self.known_future_features_index = None self._is_uni_variant = False self.transform_time_features = transform_time_features self.freq = "1Y" - self.time_feature_transform = [] - self.dataset_columns = [] - self.sampler_train = None + self.time_feature_transform: List[TimeFeature] = [] + self.dataset_columns: List[Union[int, str]] = [] + self.sampler_train: Optional[Union[Iterator, torch.utils.data.sampler.Sampler]] = None # Applied for get loader self.feature_preprocessor: Optional[ColumnTransformer] = None @@ -153,7 +158,7 @@ def compute_expected_num_instances_per_seq(self, replace=False) if self.sample_strategy == 'LengthUniform': - available_seq_length = seq_train_length - min_start + available_seq_length: np.ndarray = seq_train_length - min_start available_seq_length = np.where(available_seq_length <= 0, 0, available_seq_length) num_instances_per_seqs = num_instances_epoch / np.sum(available_seq_length) * available_seq_length elif self.sample_strategy == 'SeqUniform': @@ -202,7 +207,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: feature_names=X['dataset_properties']['feature_names'], queried_features=X['dataset_properties']['known_future_features'] ) - self.known_future_features_index = tuple(known_future_features_index) + self.known_future_features_index = known_future_features_index self.padding_collector = PadSequenceCollector(self.window_size, sample_interval, padding_value, max_lagged_value) @@ -349,7 +354,7 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform return torchvision.transforms.Compose(candidate_transformations) def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.ndarray] = None, - batch_size: int = np.inf, + batch_size: int = np.iinfo(np.int32).max, ) -> torch.utils.data.DataLoader: """ Creates a data loader object from the provided data, @@ -367,6 +372,7 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd sequence_lengths[seq_idx] = len(x_seq.X) series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) + assert self.known_future_features_index is not None if len(self.known_future_features_index) > 0: sequence_lengths_test = [0] * num_sequences for seq_idx, x_seq in enumerate(X): @@ -418,16 +424,16 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd x_seq._cached_time_features = None if self.dataset_small_preprocess and not self._is_uni_variant: - x_seq.X = x_all.get_group(i).transform(np.array).values - update_dict = {"known_future_features_index": self.known_future_features_index} - if len(self.known_future_features_index) > 0: - x_seq.X_test = x_all_test.get_group(i).transform(np.array).values + x_seq.X = x_all.get_group(i).transform(np.array).values # type: ignore[has-type] + update_dict: Dict[str, Any] = {"known_future_features_index": self.known_future_features_index} + if len(self.known_future_features_index) > 0: # type: ignore[arg-type] + x_seq.X_test = x_all_test.get_group(i).transform(np.array).values # type: ignore[has-type] else: update_dict = {} - update_dict.update(dict(freq=self.freq, - transform_time_features=self.transform_time_features, - time_feature_transform=self.time_feature_transform, )) + update_dict.update({'freq': self.freq, 
+ 'transform_time_features': self.transform_time_features, + 'time_feature_transform': self.time_feature_transform, }) x_seq.update_attribute(**update_dict) if self.transform_time_features: diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py index 051eb27ea..e488b2354 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py @@ -1,4 +1,4 @@ -from typing import Optional, Sequence, List, Iterator, Sized, Union +from typing import Optional, Sequence, List, Iterator, Sized, Union, Mapping import numpy as np @@ -26,9 +26,9 @@ def __getitem__(self, idx: int) -> np.ndarray: def pad_sequence_with_minimal_length(sequences: List[torch.Tensor], seq_minimal_length: int = 1, - seq_max_length: int = np.inf, - batch_first=True, - padding_value=0.0) -> torch.Tensor: + seq_max_length: int = np.iinfo(np.int32).max, + batch_first: bool = True, + padding_value: float = 0.0) -> torch.Tensor: r""" This function is quite similar to torch.nn.utils.rnn.pad_sequence except that we constraint the sequence to be at least seq_minimal_length and at most seq_max_length @@ -69,21 +69,22 @@ class PadSequenceCollector: """ - def __init__(self, window_size: int, sample_interval, target_padding_value: float = 0.0, - seq_max_length: int = np.inf): + def __init__(self, window_size: int, sample_interval: int = 1, target_padding_value: float = 0.0, + seq_max_length: int = np.iinfo(np.int32).max): self.window_size = window_size self.sample_interval = sample_interval self.target_padding_value = target_padding_value self.seq_max_length = seq_max_length - def __call__(self, batch, sample_interval=1, seq_minimal_length=1, padding_value=0.0): + def __call__(self, batch: Sequence[torch.Tensor], sample_interval: int = 1, + seq_minimal_length: int = 1, padding_value: float = 0.0) -> Union[torch.Tensor, Mapping]: elem = batch[0] elem_type = type(elem) if isinstance(elem, torch.Tensor): - seq = pad_sequence_with_minimal_length(batch, - seq_minimal_length=seq_minimal_length, - seq_max_length=self.seq_max_length, - batch_first=True, padding_value=padding_value) # type: torch.Tensor + seq: torch.Tensor = pad_sequence_with_minimal_length(batch, # type: ignore[arg-type] + seq_minimal_length=seq_minimal_length, + seq_max_length=self.seq_max_length, + batch_first=True, padding_value=padding_value) if sample_interval > 1: subseq_length = seq.shape[1] @@ -128,7 +129,7 @@ def __call__(self, batch, sample_interval=1, seq_minimal_length=1, padding_value class TimeSeriesSampler(SubsetRandomSampler): def __init__(self, indices: Sequence[int], - seq_lengths: Sequence[int], + seq_lengths: Union[Sequence[int], np.ndarray], num_instances_per_seqs: Optional[Union[List[float], np.ndarray]] = None, min_start: int = 0, generator: Optional[torch.Generator] = None) -> None: @@ -146,7 +147,7 @@ def __init__(self, ---------- indices: Sequence[int] The set of all the possible indices that can be sampled from - seq_lengths: Sequence[int] + seq_lengths: Union[Sequence[int], np.ndarray] lengths of each sequence, applied to unsqueeze indices num_instances_per_seqs: Optional[List[int]]=None expected number of instances to be sampled in each sequence, if it is None, all the sequences will be @@ -177,13 +178,13 @@ def __init__(self, num_interval = int(np.ceil(num_instances)) if num_interval > idx_end - idx_start or num_interval == 0: - interval = 
np.linspace(idx_start, idx_end, 2, endpoint=True, dtype=np.int) + interval = np.linspace(idx_start, idx_end, 2, endpoint=True, dtype=np.intp) # In this case, seq_intervals_decimal contains the entire interval of the sequence. num_expected_ins_decimal.append(num_instances) seq_intervals_decimal.append(interval[:2]) seq_intervals_int.append(interval[1:]) else: - interval = np.linspace(idx_start, idx_end, num_interval + 1, endpoint=True, dtype=np.int) + interval = np.linspace(idx_start, idx_end, num_interval + 1, endpoint=True, dtype=np.intp) # The first two item determines the first sequence interval where most of the samples need to be # padded, we then make it the interval for the expected decimal num_expected_ins_decimal.append(np.modf(num_instances)[0]) @@ -192,7 +193,7 @@ def __init__(self, seq_intervals_int.append(interval[1:]) idx_tracker += seq_length - num_expected_ins_decimal = np.stack(num_expected_ins_decimal) + num_expected_ins_decimal_stacked = np.stack(num_expected_ins_decimal) self.seq_lengths = seq_lengths self.seq_lengths_sum = np.sum(seq_lengths) @@ -201,9 +202,9 @@ def __init__(self, self.seq_intervals_decimal = torch.from_numpy(np.stack(seq_intervals_decimal)) self.seq_intervals_int = seq_intervals_int - self.num_expected_ins_decimal = torch.from_numpy(num_expected_ins_decimal) + 1e-8 + self.num_expected_ins_decimal = torch.from_numpy(num_expected_ins_decimal_stacked) + 1e-8 - def __iter__(self): + def __iter__(self) -> Iterator[int]: if self.iter_all_seqs: return super().__iter__() samples = torch.ones(self.num_instances, dtype=torch.int) @@ -239,7 +240,7 @@ def __iter__(self): yield from (samples[i] for i in torch.randperm(self.num_instances, generator=self.generator)) - def __len__(self): + def __len__(self) -> int: return self.num_instances diff --git a/autoPyTorch/pipeline/components/training/metrics/base.py b/autoPyTorch/pipeline/components/training/metrics/base.py index 56432407f..80eb9eb8d 100644 --- a/autoPyTorch/pipeline/components/training/metrics/base.py +++ b/autoPyTorch/pipeline/components/training/metrics/base.py @@ -47,7 +47,7 @@ def __call__(self, sp: int, n_prediction_steps: int, horizon_weight: Optional[List[float]] = None - ): + ) -> float: raise NotImplementedError() @@ -191,7 +191,7 @@ def __call__( class _ForecastingMetric(ForecastingMetricMixin, autoPyTorchMetric): - def __call__( + def __call__( # type: ignore[override] self, y_true: np.ndarray, y_pred: np.ndarray, @@ -199,7 +199,7 @@ def __call__( n_prediction_steps: int, horizon_weight: Optional[List[float]] = None, sample_weight: Optional[List[float]] = None, - **kwarg: Dict, + **kwarg: Any, ) -> float: """Evaluate time series forecasting losses given input data The description is nearly the same as the one defined under @@ -251,12 +251,12 @@ def __call__( y_true = y_true.reshape((n_prediction_steps, -1)) y_pred = y_pred.reshape((n_prediction_steps, -1)) - losses_all = self._metric_func(y_true=y_true, - y_pred=y_pred, - sp=sp, - horizon_weight=horizon_weight, - multioutput='raw_values', - **self._kwargs) + losses_all: np.ndarray = self._metric_func(y_true=y_true, # type: ignore[assignment] + y_pred=y_pred, + sp=sp, + horizon_weight=horizon_weight, + multioutput='raw_values', + **self._kwargs) losses_all = losses_all.reshape([-1, n_outputs]) diff --git a/autoPyTorch/pipeline/components/training/metrics/utils.py b/autoPyTorch/pipeline/components/training/metrics/utils.py index b6b781884..7b869c7da 100644 --- a/autoPyTorch/pipeline/components/training/metrics/utils.py +++ 
b/autoPyTorch/pipeline/components/training/metrics/utils.py @@ -121,7 +121,7 @@ def calculate_score( prediction: np.ndarray, task_type: int, metrics: Iterable[autoPyTorchMetric], - **score_kwargs: Dict) -> Dict[str, float]: + **score_kwargs: Any) -> Dict[str, float]: score_dict = dict() if task_type in FORECASTING_TASKS: cprediction = sanitize_array(prediction) diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index fdf73a357..4102f949a 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -69,7 +69,7 @@ def __init__(self, self.checkpoint_dir: Optional[str] = None @property - def _fit_requirements(self) -> Optional[List[FitRequirement]]: + def _fit_requirements(self) -> List[FitRequirement]: return [FitRequirement("lr_scheduler", (_LRScheduler,), user_defined=False, dataset_property=False), FitRequirement("num_run", (int,), user_defined=False, dataset_property=False), FitRequirement( @@ -217,7 +217,7 @@ def fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> autoPyTorchCom return cast(autoPyTorchComponent, self.choice) - def prepare_trainer(self, X): + def prepare_trainer(self, X: Dict) -> None: """ prepare trainer, forecasting tasks require more parameters """ @@ -246,7 +246,7 @@ def prepare_trainer(self, X): step_interval=X['step_interval'] ) - def get_budget_tracker(self, X): + def get_budget_tracker(self, X: Dict) -> BudgetTracker: return BudgetTracker( budget_type=X['budget_type'], max_runtime=X['runtime'] if 'runtime' in X else None, diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index f5f1aef99..eb266fd06 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -39,7 +39,7 @@ def add_trainer(trainer: ForecastingBaseTrainerComponent) -> None: class ForecastingTrainerChoice(TrainerChoice): @property - def _fit_requirements(self) -> Optional[List[FitRequirement]]: + def _fit_requirements(self) -> List[FitRequirement]: fit_requirements = super()._fit_requirements fit_requirements.extend([FitRequirement("target_scaler", (BaseTargetScaler,), user_defined=False, dataset_property=False), @@ -47,7 +47,7 @@ def _fit_requirements(self) -> Optional[List[FitRequirement]]: ) return fit_requirements - def get_budget_tracker(self, X): + def get_budget_tracker(self, X: Dict) -> BudgetTracker: if 'epochs' in X: max_epochs = X['epochs'] elif X['budget_type'] in FORECASTING_BUDGET_TYPE: @@ -60,7 +60,7 @@ def get_budget_tracker(self, X): max_epochs=max_epochs, ) - def prepare_trainer(self, X): + def prepare_trainer(self, X: Dict) -> None: # Support additional user metrics metrics = get_metrics(dataset_properties=X['dataset_properties']) if 'additional_metrics' in X: diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index c48e004a0..9b50e153f 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -24,7 +24,7 @@ class 
ForecastingBaseTrainerComponent(BaseTrainerComponent, ABC): - def prepare( + def prepare( # type: ignore[override] self, metrics: List[Any], model: ForecastingNet, @@ -38,7 +38,7 @@ def prepare( labels: Union[np.ndarray, torch.Tensor, pd.DataFrame], step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch, window_size: int = 20, - dataset_properties: Optional[Dict] = None, + dataset_properties: Dict = {}, target_scaler: BaseTargetScaler = TargetNoScaler(), backcast_loss_ratio: Optional[float] = None, ) -> None: @@ -163,8 +163,8 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: Dict[str, to future_targets_values.to(self.device)) backcast, forecast = self.model(past_targets=past_target, past_observed_targets=past_observed_targets) - loss_func_backcast = self.criterion_preparation(**criterion_kwargs_past) - loss_func_forecast = self.criterion_preparation(**criterion_kwargs_future) + loss_func_backcast = self.criterion_preparation(**criterion_kwargs_past) # type: ignore[arg-type] + loss_func_forecast = self.criterion_preparation(**criterion_kwargs_future) # type: ignore[arg-type] loss_backcast = loss_func_backcast(self.criterion, backcast) * past_observed_targets.to(self.device) loss_forecast = loss_func_forecast(self.criterion, forecast) * future_observed_targets.to(self.device) @@ -197,7 +197,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: Dict[str, to future_targets=future_targets_values, past_observed_targets=past_observed_targets) - loss_func = self.criterion_preparation(**criterion_kwargs) + loss_func = self.criterion_preparation(**criterion_kwargs) # type: ignore[arg-type] loss = torch.mean(loss_func(self.criterion, outputs) * future_observed_targets.to(self.device)) @@ -275,8 +275,8 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, future_targets_values = future_targets_values.to(self.device) if isinstance(outputs, list) and self.model.output_type != 'quantile': - loss = [self.criterion(output, future_targets_values) for output in outputs] - loss = torch.mean(torch.Tensor(loss) * future_observed_targets) + losses = [self.criterion(output, future_targets_values) for output in outputs] + loss = torch.mean(torch.Tensor(losses) * future_observed_targets) else: loss = torch.mean(self.criterion(outputs, future_targets_values) * future_observed_targets) outputs = self.model.pred_from_net_output(outputs) diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index fd6e15a4b..4d9d9f535 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -95,7 +95,7 @@ def __init__(self, # model, so we comply with https://pytorch.org/docs/stable/notes/randomness.html torch.manual_seed(self.random_state.get_state()[1][0]) - def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: + def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None, **score_kwargs: Any) -> float: """Scores the fitted estimator on (X, y) Args: @@ -107,10 +107,10 @@ def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None) np.ndarray: coefficient of determination R^2 of the prediction """ from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics, calculate_score - metrics = get_metrics(self.dataset_properties, ['r2']) + metrics = get_metrics(self.dataset_properties, ['mean_MAPE_forecasting']) y_pred = self.predict(X, batch_size=batch_size) 
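The `train_step` changes above multiply every per-element loss by the corresponding observed-value mask before averaging, so padded or missing targets do not contribute to the gradient. Schematically (the criterion and tensor shapes are illustrative only):

import torch

criterion = torch.nn.L1Loss(reduction='none')
forecast = torch.randn(8, 3, 1)                           # (batch, horizon, num_targets)
future_targets = torch.randn(8, 3, 1)
future_observed_targets = torch.ones(8, 3, 1, dtype=torch.bool)

# masked mean: unobserved positions are zeroed out before averaging
loss = torch.mean(criterion(forecast, future_targets) * future_observed_targets)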
r2 = calculate_score(y, y_pred, task_type=STRING_TO_TASK_TYPES[self.dataset_properties['task_type']], - metrics=metrics)['r2'] + metrics=metrics, **score_kwargs)['mean_MAPE_forecasting'] return r2 def _get_hyperparameter_search_space(self, @@ -382,14 +382,14 @@ def get_pipeline_representation(self) -> Dict[str, str]: Returns: Dict: contains the pipeline representation in a short format """ - preprocessing = [] - estimator = [] + preprocessing: List[str] = [] + estimator: List[str] = [] skip_steps = ['data_loader', 'trainer', 'lr_scheduler', 'optimizer', 'network_init', 'preprocessing', 'time_series_transformer'] for step_name, step_component in self.steps: if step_name in skip_steps: continue - properties = {} + properties: Dict[str, Any] = {} if isinstance(step_component, autoPyTorchChoice) and step_component.choice is not None: properties = step_component.choice.get_properties() elif isinstance(step_component, autoPyTorchComponent): diff --git a/requirements.txt b/requirements.txt index 7940e7f04..202d192aa 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ pandas -torch==1.10.1 +torch>=1.10.1 torchvision tensorboard scikit-learn>=0.24.0,<0.25.0 From 43671dd75cccf22daea899402d34dff3579ef307 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 30 May 2022 14:40:59 +0200 Subject: [PATCH 291/347] maint --- .../components/training/data_loader/time_series_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py index e488b2354..67729a97a 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py @@ -223,10 +223,11 @@ def __iter__(self) -> Iterator[int]: idx_samples_start = idx_samples_end num_samples_remain = self.num_instances - idx_samples_end if num_samples_remain > 0: - if num_samples_remain > self.num_expected_ins_decimal[-1]: + if num_samples_remain > self.num_expected_ins_decimal.shape[-1]: replacement = True else: replacement = False + samples_idx = torch.multinomial(self.num_expected_ins_decimal, num_samples_remain, replacement) seq_interval = self.seq_intervals_decimal[samples_idx] From 806afb3aef58e99a1a487920de83c39d6281a7c0 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 30 May 2022 14:41:20 +0200 Subject: [PATCH 292/347] examples for forecasting --- .../example_time_series_forecasting.py | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 examples/20_basics/example_time_series_forecasting.py diff --git a/examples/20_basics/example_time_series_forecasting.py b/examples/20_basics/example_time_series_forecasting.py new file mode 100644 index 000000000..5920a93d5 --- /dev/null +++ b/examples/20_basics/example_time_series_forecasting.py @@ -0,0 +1,90 @@ +""" +====================== +Time Series Forecasting +====================== + +The following example shows how to fit a sample forecasting model +with AutoPyTorch. This is only a dummmy example because of the limited size of the dataset. 
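As background for the sampler fix in PATCH 291 above: `torch.multinomial` can only draw more samples than there are weight entries when `replacement=True`, which is what the corrected condition checks against `num_expected_ins_decimal.shape[-1]`. A tiny illustration with arbitrary weights:

import torch

weights = torch.tensor([0.2, 0.5, 0.3])
torch.multinomial(weights, num_samples=2, replacement=False)  # fine: 2 draws from 3 entries
torch.multinomial(weights, num_samples=5, replacement=True)   # 5 > 3 entries, so replacement is required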
+Thus, it could be possible that the AutoPyTorch model does not perform as well as a dummy predictor +""" +import os +import tempfile as tmp +import warnings +import copy + +os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + +from sktime.datasets import load_longley +targets, features = load_longley() + +forecasting_horizon = 3 + +# each series represent an element in the List +# we take the last forecasting_horizon as test targets. The itme before that as training targets +# Normally the value to be forecasted should follow the training sets +y_train = [targets[: -forecasting_horizon]] +y_test = [targets[-forecasting_horizon:]] + +# same for features. For uni-variant models, X_train, X_test can be omitted +X_train = [features[: -forecasting_horizon]] +# Here x_test indicates the 'known future features': they are the features known previously, features that are unknown +# could be replaced with NAN or zeros (which will not be used by our networks). If no feature is known beforehand, +# we could also omit X_test +known_future_features = list(features.columns) +X_test = [features[-forecasting_horizon:]] + +start_times = [targets.index.to_timestamp()[0]] +freq = '1Y' + +from autoPyTorch.api.time_series_forecasting import TimeSeriesForecastingTask +############################################################################ +# Build and fit a forecaster +# ========================== +api = TimeSeriesForecastingTask() + +############################################################################ +# Search for an ensemble of machine learning algorithms +# ===================================================== +api.search( + X_train=X_train, + y_train=copy.deepcopy(y_train), + X_test=X_test, + optimize_metric='mean_MASE_forecasting', + n_prediction_steps=forecasting_horizon, + memory_limit=None, + freq=freq, + start_times=start_times, + func_eval_time_limit_secs=50, + total_walltime_limit=60, + min_num_test_instances=1000, # proxy validation sets. 
This only works for the tasks with more than 1000 series + known_future_features=known_future_features, +) + + +from autoPyTorch.datasets.time_series_dataset import TimeSeriesSequence + +test_sets = [] + +# We could construct test sets from scratch +for feature, future_feature, target, start_time in zip(X_train, X_test,y_train, start_times): + test_sets.append( + TimeSeriesSequence(X=feature.values, + Y=target.values, + X_test=future_feature.values, + start_time=start_time, + is_test_set=True, + # additional information required to construct a new time series sequence + **api.dataset.sequences_builder_kwargs + ) + ) +# Alternatively, if we only want to forecast the value after the X_train, we could directly ask datamanager to +# generate a test set: +# test_sets2 = api.dataset.generate_test_seqs() + +pred = api.predict(test_sets) From bc80bf129207ffc9d2cb903835912f28b76a3e50 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 30 May 2022 14:41:50 +0200 Subject: [PATCH 293/347] fix mypy --- autoPyTorch/api/time_series_forecasting.py | 65 +++++++++---------- .../data/time_series_feature_validator.py | 16 +++-- .../data/time_series_forecasting_validator.py | 54 ++++++++------- autoPyTorch/datasets/time_series_dataset.py | 4 +- autoPyTorch/evaluation/abstract_evaluator.py | 46 +++++++------ ...time_series_forecasting_train_evaluator.py | 6 +- autoPyTorch/optimizer/smbo.py | 2 + autoPyTorch/optimizer/utils.py | 4 +- .../forecasting_encoder/__init__.py | 12 ++-- .../base_forecasting_encoder.py | 2 +- .../flat_encoder/MLPEncoder.py | 4 +- .../flat_encoder/NBEATSEncoder.py | 6 +- .../seq_encoder/__init__.py | 16 ++--- .../time_series_forecasting_data_loader.py | 6 +- .../components/training/metrics/base.py | 4 +- .../forecasting_base_trainer.py | 6 +- 16 files changed, 133 insertions(+), 120 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 3a39d8fbe..52dd40472 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -164,7 +164,7 @@ def _get_dataset_input_validator( series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, n_prediction_steps: int = 1, known_future_features: Tuple[Union[int, str]] = (), - **forecasting_dataset_kwargs, + **forecasting_dataset_kwargs: Any, ) -> Tuple[TimeSeriesForecastingDataset, TimeSeriesForecastingInputValidator]: """ Returns an object of `TabularDataset` and an object of @@ -238,38 +238,37 @@ def _get_dataset_input_validator( return dataset, input_validator - def search( - self, - optimize_metric: str, - X_train: Optional[Union[List, pd.DataFrame]] = None, - y_train: Optional[Union[List, pd.DataFrame]] = None, - X_test: Optional[Union[List, pd.DataFrame]] = None, - y_test: Optional[Union[List, pd.DataFrame]] = None, - n_prediction_steps: int = 1, - freq: Optional[Union[str, int, List[int]]] = None, - start_times: Optional[List[pd.DatetimeIndex]] = None, - series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, - dataset_name: Optional[str] = None, - budget_type: str = 'epochs', - min_budget: Union[int, str] = 5, - max_budget: Union[int, str] = 50, - total_walltime_limit: int = 100, - func_eval_time_limit_secs: Optional[int] = None, - enable_traditional_pipeline: bool = False, - memory_limit: Optional[int] = 4096, - smac_scenario_args: Optional[Dict[str, Any]] = None, - get_smac_object_callback: Optional[Callable] = None, - all_supported_metrics: bool = True, - precision: int = 32, - disable_file_output: List = [], - 
load_models: bool = True, - portfolio_selection: Optional[str] = None, - suggested_init_models: Optional[List[str]] = None, - custom_init_setting_path: Optional[str] = None, - min_num_test_instances: Optional[int] = None, - dataset_compression: Union[Mapping[str, Any], bool] = False, - **forecasting_dataset_kwargs - ) -> 'BaseTask': + def search(self, + optimize_metric: str, + X_train: Optional[Union[List, pd.DataFrame]] = None, + y_train: Optional[Union[List, pd.DataFrame]] = None, + X_test: Optional[Union[List, pd.DataFrame]] = None, + y_test: Optional[Union[List, pd.DataFrame]] = None, + n_prediction_steps: int = 1, + freq: Optional[Union[str, int, List[int]]] = None, + start_times: Optional[List[pd.DatetimeIndex]] = None, + series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, + dataset_name: Optional[str] = None, + budget_type: str = 'epochs', + min_budget: Union[int, str] = 5, + max_budget: Union[int, str] = 50, + total_walltime_limit: int = 100, + func_eval_time_limit_secs: Optional[int] = None, + enable_traditional_pipeline: bool = False, + memory_limit: Optional[int] = 4096, + smac_scenario_args: Optional[Dict[str, Any]] = None, + get_smac_object_callback: Optional[Callable] = None, + all_supported_metrics: bool = True, + precision: int = 32, + disable_file_output: List = [], + load_models: bool = True, + portfolio_selection: Optional[str] = None, + suggested_init_models: Optional[List[str]] = None, + custom_init_setting_path: Optional[str] = None, + min_num_test_instances: Optional[int] = None, + dataset_compression: Union[Mapping[str, Any], bool] = False, + **forecasting_dataset_kwargs: Any + ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index e0e646d99..fb308097b 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -1,5 +1,5 @@ import logging -from typing import Optional, Union, List +from typing import Optional, Union, List, Tuple, Iterable import pandas as pd import numpy as np from scipy.sparse import issparse @@ -25,7 +25,7 @@ def __init__( ): super().__init__(logger) self.only_contain_series_idx = False - self.static_features = () + self.static_features: Union[Tuple[()], Tuple[int]] = () self.series_idx: Optional[List[Union[str, int]]] = None def get_reordered_columns(self) -> List[str]: @@ -34,7 +34,7 @@ def get_reordered_columns(self) -> List[str]: def fit(self, X_train: Union[pd.DataFrame, np.ndarray], X_test: Union[pd.DataFrame, np.ndarray] = None, - series_idx: Optional[List[Union[str, int]]] = None, + series_idx: Optional[Union[List[Union[str, int]]]] = None, sequence_lengths: Optional[List[int]] = None) -> BaseEstimator: """ @@ -59,8 +59,10 @@ def fit(self, if issparse(X_train): raise NotImplementedError('Sparse matrix is currently unsupported for Forecasting tasks') index = None + if series_idx is not None: self.series_idx = series_idx + # remove series idx as they are not part of features # TODO consider them as static features? 
if isinstance(X_train, pd.DataFrame): @@ -112,11 +114,11 @@ def fit(self, index = np.arange(len(sequence_lengths)).repeat(sequence_lengths) X_train.index = index - static_features: pd.Series = (X_train.groupby(X_train.index).nunique() <= 1).all() # type: ignore[assignment] + static_features: pd.Series = (X_train.groupby(X_train.index).nunique() <= 1).all() self.static_features = tuple(idx for idx in static_features.index if static_features[idx]) return self - def transform( # type: ignore[override] + def transform( self, X: Union[pd.DataFrame, np.ndarray], index: Optional[Union[pd.Index, np.ndarray]] = None, @@ -131,10 +133,10 @@ def transform( # type: ignore[override] f"X_train is {type(X)} ") X_has_idx = isinstance(X, pd.DataFrame) if X_has_idx and index is None: - index = X.index # type: ignore[union-attr] + index = X.index X = super(TimeSeriesFeatureValidator, self).transform(X) if X.ndim == 1: - X = np.expand_dims(X, -1) # type: ignore[union-attr] + X = np.expand_dims(X, -1) # type:ignore[no-redef] X: pd.DataFrame = pd.DataFrame(X, columns=self.get_reordered_columns()) if index is None: if not X_has_idx: diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 88754c05a..d3695022d 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -1,5 +1,5 @@ # -*- encoding: utf-8 -*- -from typing import Optional, Tuple, List, Union, Dict +from typing import Optional, Tuple, List, Union, Dict, Iterable import numpy as np import pandas as pd from sklearn.base import BaseEstimator @@ -25,24 +25,24 @@ def __init__(self, dataset_compression: Optional[DatasetCompressionSpec] = None, ) -> None: super(TimeSeriesForecastingInputValidator, self).__init__(is_classification, logger_port, dataset_compression) - self.feature_validator = TimeSeriesFeatureValidator(logger=self.logger) - self.target_validator = TimeSeriesTargetValidator(is_classification=self.is_classification, - logger=self.logger) + self.feature_validator: TimeSeriesFeatureValidator = TimeSeriesFeatureValidator(logger=self.logger) + self.target_validator: TimeSeriesTargetValidator = TimeSeriesTargetValidator( + is_classification=self.is_classification, logger=self.logger + ) self._is_uni_variant = False - self.start_times = None + self.start_times: Optional[List[pd.DatetimeIndex]] = None self.feature_shapes: Dict[str, int] = {} self.feature_names: List[str] = [] - self.series_idx = None + self.series_idx: Optional[Union[List[Union[str, int]], str, int]] = None - def fit( - self, + def fit(self, X_train: Optional[Union[List, pd.DataFrame]], y_train: Union[List, pd.DataFrame], series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, X_test: Optional[Union[List, pd.DataFrame]] = None, y_test: Optional[Union[List, pd.DataFrame]] = None, start_times: Optional[List[pd.DatetimeIndex]] = None, - ) -> BaseEstimator: + ) -> BaseEstimator: """ fit the validator with the training data, (optionally) start times and other information Args: @@ -57,12 +57,14 @@ def fit( sampled """ + if series_idx is not None and not isinstance(series_idx, Iterable): + series_idx: Optional[List[Union[str, int]]] = [series_idx] + self.series_idx = series_idx if X_train is None: self._is_uni_variant = True - if self._is_uni_variant: self.feature_validator.num_features = 0 self.feature_validator.numerical_columns = [] self.feature_validator.categorical_columns = [] @@ -101,6 +103,7 @@ def fit( " {} for features and {} for 
targets".format(len(X_test), len(y_test), )) elif isinstance(y_train, (pd.DataFrame, pd.Series)): sequence_lengths = None + assert isinstance(X_train, pd.DataFrame) if series_idx is not None: n_seqs = len(X_train.groupby(series_idx)) else: @@ -129,15 +132,18 @@ def fit( return self - def transform( - self, - X: Optional[Union[List, pd.DataFrame]], - y: Optional[Union[List, pd.DataFrame]] = None, - validate_for_future_features: bool = False - ) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame], np.ndarray]: + def transform(self, + X: Optional[Union[List, pd.DataFrame]], + y: Optional[Union[List, pd.DataFrame]] = None, + validate_for_future_features: bool = False + ) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame], np.ndarray]: """ transform the data with the fitted validator Args: + X: Optional[Union[List, pd.DataFrame]] + time features + y: Optional[Union[List, pd.DataFrame]] + forecasting targets validate_for_future_features: bool if the validator is applied to transform future features (for test sets), in this case we only validate X @@ -152,9 +158,9 @@ def transform( sequence_lengths = [0] * num_sequences for seq_idx in range(num_sequences): sequence_lengths[seq_idx] = len(X[seq_idx]) - sequence_lengths = np.asarray(sequence_lengths) - x_transformed, _ = self._transform_X(X, sequence_lengths) - return x_transformed, None, sequence_lengths + npa_sequence_lengths = np.asarray(sequence_lengths) + x_transformed, _ = self._transform_X(X, npa_sequence_lengths) + return x_transformed, None, npa_sequence_lengths elif isinstance(X, pd.DataFrame): if self.series_idx is not None: X = X.sort_values(self.series_idx) @@ -179,17 +185,17 @@ def transform( for seq_idx in range(num_sequences): sequence_lengths[seq_idx] = len(y[seq_idx]) - sequence_lengths = np.asarray(sequence_lengths) + npa_sequence_lengths = np.asarray(sequence_lengths) y_stacked = self.join_series(y) - x_transformed, series_number = self._transform_X(X, sequence_lengths) + x_transformed, series_number = self._transform_X(X, npa_sequence_lengths) y_transformed: pd.DataFrame = self.target_validator.transform(y_stacked, index=series_number) if self._is_uni_variant: - return None, y_transformed, sequence_lengths + return None, y_transformed, npa_sequence_lengths - return x_transformed, y_transformed, sequence_lengths + return x_transformed, y_transformed, npa_sequence_lengths elif isinstance(y, (pd.DataFrame, pd.Series)): if self.series_idx is not None: if isinstance(y, pd.Series): @@ -273,7 +279,7 @@ def join_series(X: List[Union[pd.DataFrame, np.ndarray]], series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) if not isinstance(X, List): raise ValueError(f'Input must be a list, but it is {type(X)}') - if isinstance(X[0], pd.DataFrame): + if isinstance(X[0], (pd.DataFrame, pd.Series)): joint_input = pd.concat(X) elif isinstance(X[0], (List, np.ndarray)): joint_input = np.concatenate(X) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 25deafae2..fdf174b75 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -496,8 +496,8 @@ def __init__(self, self.numerical_columns = self.validator.feature_validator.numerical_columns self.categorical_columns = self.validator.feature_validator.categorical_columns - self.num_features = self.validator.feature_validator.num_features # type: int - self.num_targets = self.validator.target_validator.out_dimensionality # type: int + self.num_features: int = 
self.validator.feature_validator.num_features + self.num_targets: int = self.validator.target_validator.out_dimensionality self.categories = self.validator.feature_validator.categories diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 939dd2307..af39c64fe 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -148,6 +148,7 @@ class MyTraditionalTabularRegressionPipeline(BaseEstimator): An optional dictionary that is passed to the pipeline's steps. It complies a similar function as the kwargs """ + def __init__(self, config: str, dataset_properties: Dict[str, Any], random_state: Optional[np.random.RandomState] = None, @@ -191,7 +192,7 @@ def get_pipeline_representation(self) -> Dict[str, str]: @staticmethod def get_default_pipeline_options() -> Dict[str, Any]: - return autoPyTorch.pipeline.traditional_tabular_regression.\ + return autoPyTorch.pipeline.traditional_tabular_regression. \ TraditionalTabularRegressionPipeline.get_default_pipeline_options() @@ -327,7 +328,7 @@ def fit(self, X: Dict[str, Any], y: Any, y_train = subsampler(X['y_train'], X['train_indices']) return DummyClassifier.fit(self, np.ones((y_train.shape[0], 1)), y_train, sample_weight) - def _genreate_dummy_forecasting(self, X): + def _genreate_dummy_forecasting(self, X: List[Union[TimeSeriesSequence, np.ndarray]]) -> List: if isinstance(X[0], TimeSeriesSequence): X_tail = [x.get_target_values(-1) for x in X] else: @@ -335,12 +336,12 @@ def _genreate_dummy_forecasting(self, X): return X_tail def predict_proba(self, X: Union[np.ndarray, pd.DataFrame], - batch_size: int = 1000) -> np.array: + batch_size: int = 1000) -> np.ndarray: X_tail = self._genreate_dummy_forecasting(X) return np.tile(X_tail, (1, self.n_prediction_steps)).astype(np.float32).squeeze() def predict(self, X: Union[np.ndarray, pd.DataFrame], - batch_size: int = 1000) -> np.array: + batch_size: int = 1000) -> np.ndarray: X_tail = np.asarray(self._genreate_dummy_forecasting(X)) if X_tail.ndim == 1: X_tail = np.expand_dims(X_tail, -1) @@ -454,6 +455,7 @@ class AbstractEvaluator(object): search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): An object used to fine tune the hyperparameter search space of the pipeline """ + def __init__(self, backend: Backend, queue: Queue, metric: autoPyTorchMetric, @@ -577,7 +579,7 @@ def __init__(self, backend: Backend, self.logger.debug("Search space updates :{}".format(self.search_space_updates)) def _init_datamanager_info( - self, + self, ) -> None: """ Initialises instance attributes that come from the datamanager. @@ -624,10 +626,10 @@ def _init_datamanager_info( del datamanager def _init_fit_dictionary( - self, - logger_port: int, - pipeline_config: Dict[str, Any], - metrics_dict: Optional[Dict[str, List[str]]] = None, + self, + logger_port: int, + pipeline_config: Dict[str, Any], + metrics_dict: Optional[Dict[str, List[str]]] = None, ) -> None: """ Initialises the fit dictionary @@ -727,7 +729,7 @@ def _get_pipeline(self) -> BaseEstimator: raise ValueError("Invalid configuration entered") return pipeline - def _loss(self, y_true: np.ndarray, y_hat: np.ndarray, **metric_kwargs: Dict) -> Dict[str, float]: + def _loss(self, y_true: np.ndarray, y_hat: np.ndarray, **metric_kwargs: Any) -> Dict[str, float]: """SMAC follows a minimization goal, so the make_scorer sign is used as a guide to obtain the value to reduce. 
The calculate_loss internally translate a score function to @@ -758,7 +760,7 @@ def _loss(self, y_true: np.ndarray, y_hat: np.ndarray, **metric_kwargs: Dict) -> def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], opt_pred: np.ndarray, valid_pred: Optional[np.ndarray], test_pred: Optional[np.ndarray], additional_run_info: Optional[Dict], - file_output: bool, status: StatusType, **metric_kwargs: Dict + file_output: bool, status: StatusType, **metric_kwargs: Any ) -> Optional[Tuple[float, float, int, Dict]]: """This function does everything necessary after the fitting is done: @@ -788,7 +790,7 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], Whether or not this pipeline should output information to disk status (StatusType) The status of the run, following SMAC StatusType syntax. - metric_kwargs (Dict) + metric_kwargs (Any) Additional arguments for computing metrics Returns: @@ -842,10 +844,10 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], return None def calculate_auxiliary_losses( - self, - Y_valid_pred: np.ndarray, - Y_test_pred: np.ndarray, - **metric_kwargs: Dict + self, + Y_valid_pred: np.ndarray, + Y_test_pred: np.ndarray, + **metric_kwargs: Any ) -> Tuple[Optional[Dict[str, float]], Optional[Dict[str, float]]]: """ A helper function to calculate the performance estimate of the @@ -858,7 +860,7 @@ def calculate_auxiliary_losses( Y_test_pred (np.ndarray): predictions on a test set provided by the user, matching self.y_test - metric_kwargs (Dict) + metric_kwargs (Any) additional argument for evaluating the loss metric Returns: @@ -882,10 +884,10 @@ def calculate_auxiliary_losses( return validation_loss_dict, test_loss_dict def file_output( - self, - Y_optimization_pred: np.ndarray, - Y_valid_pred: np.ndarray, - Y_test_pred: np.ndarray + self, + Y_optimization_pred: np.ndarray, + Y_valid_pred: np.ndarray, + Y_test_pred: np.ndarray ) -> Tuple[Optional[float], Dict]: """ This method decides what file outputs are written to disk. 
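The dummy forecaster defined earlier in this file simply repeats each series' last observed target value across the whole forecast horizon via np.tile. A minimal sketch of that behaviour, assuming plain numpy arrays as input (the toy values and variable names below are illustrative and not part of this patch):

    import numpy as np

    # last observed target of each of three series, as the dummy pipeline collects them
    X_tail = np.array([[2.0], [5.0], [7.5]])
    n_prediction_steps = 3

    # repeat the last value over the forecast horizon, mirroring the np.tile call in predict()
    naive_forecast = np.tile(X_tail, (1, n_prediction_steps)).astype(np.float32)
    # -> [[2. , 2. , 2. ],
    #     [5. , 5. , 5. ],
    #     [7.5, 7.5, 7.5]]
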
@@ -1020,6 +1022,7 @@ def _predict_proba(self, X: np.ndarray, pipeline: BaseEstimator, (np.ndarray): The predictions of pipeline for the given features X """ + @no_type_check def send_warnings_to_log(message, category, filename, lineno, file=None, line=None): @@ -1054,6 +1057,7 @@ def _predict_regression(self, X: np.ndarray, pipeline: BaseEstimator, (np.ndarray): The predictions of pipeline for the given features X """ + @no_type_check def send_warnings_to_log(message, category, filename, lineno, file=None, line=None): diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 0e43c14db..2e0fb4d3c 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -121,7 +121,7 @@ def fit_predict_and_loss(self) -> None: train_loss = None - loss = self._loss(self.Y_optimization, y_opt_pred, **forecasting_kwargs_val) + loss = self._loss(self.Y_optimization, y_opt_pred, **forecasting_kwargs_val) # type: ignore[arg-type] additional_run_info = pipeline.get_additional_run_info() if hasattr( pipeline, 'get_additional_run_info') else {} @@ -132,7 +132,7 @@ def fit_predict_and_loss(self) -> None: self.finish_up( loss=loss, - train_loss=train_loss, + train_loss=train_loss, # type: ignore[arg-type] opt_pred=y_opt_pred * mase_coefficient_val, valid_pred=y_valid_pred, test_pred=y_test_pred, @@ -199,7 +199,7 @@ def fit_predict_and_loss(self) -> None: # Compute validation loss of this fold and store it. optimization_loss = self._loss( - self.Y_targets[i], + self.Y_targets[i], # type: ignore[arg-type] opt_pred, **forecasting_kwargs_val ) diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 8a7f7b857..68661e224 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -61,6 +61,8 @@ def get_smac_object( ta (Callable): the function to be intensifier by smac ta_kwargs (Dict[str, Any]): Arguments to the above ta n_jobs (int): Amount of cores to use for this task + initial_budget (int): the minimal budget to be allocated to the target algorithm + max_budget (int): the max budget to be allocated to the target algorithm dask_client (dask.distributed.Client): User provided scheduler initial_configurations (List[Configuration]): List of initial configurations which smac will run before starting the search process diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index 31a9b3e00..aa1290234 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -36,7 +36,7 @@ def read_forecasting_init_configurations(config_space: ConfigurationSpace, suggested_init_models: Optional[List[str]] = None, custom_init_setting_path: Optional[str] = None, dataset_properties: Dict = {} - ): + ) -> List[Configuration]: forecasting_init_path = os.path.join(os.path.dirname(__file__), '../configs/forecasting_init_cfgs.json') initial_configurations_dict: List[Dict] = list() initial_configurations = [] @@ -46,7 +46,7 @@ def read_forecasting_init_configurations(config_space: ConfigurationSpace, if suggested_init_models or suggested_init_models is None: with open(forecasting_init_path, 'r') as f: - forecasting_init_dict: [Dict[str, Any]] = json.load(f) + forecasting_init_dict: Dict[str, Any] = json.load(f) cfg_trainer: Dict = forecasting_init_dict['trainer'] models_name_to_cfgs: Dict = forecasting_init_dict['models'] diff --git 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index 9c885c35f..b409a273a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -50,7 +50,7 @@ def __init__(self, self.decoder_choice: Optional[List[BaseForecastingDecoder]] = None @abstractmethod - def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: # type: ignore[override] + def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: """Returns the available backbone components Args: @@ -73,7 +73,7 @@ def additional_components(self) -> List[Callable]: # This function is deigned to add additional components rather than the components in __choice__ return [self.get_decoder_components] - def get_available_components( # type: ignore[override] + def get_available_components( self, dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, include: List[str] = None, @@ -107,7 +107,7 @@ def get_available_components( # type: ignore[override] if components is None: available_comp = self.get_components() else: - available_comp = components # type: ignore[assignment] + available_comp = components if include is not None: for incl in include: @@ -216,7 +216,7 @@ def get_hyperparameter_search_space( for encoder_name in hp_encoder.choices: updates = self._get_search_space_updates(prefix=encoder_name) config_space = available_encoders[encoder_name].get_hyperparameter_search_space( - dataset_properties, # type: ignore + dataset_properties, **updates) parent_hyperparameter = {'parent': hp_encoder, 'value': encoder_name} cs.add_configuration_space( @@ -244,7 +244,7 @@ def get_hyperparameter_search_space( continue updates = self._get_search_space_updates(prefix=decoder_name) config_space = available_decoders[decoder_name].get_hyperparameter_search_space( - dataset_properties, # type: ignore + dataset_properties, **updates ) compatible_encoders = decoder2encoder[decoder_name] @@ -354,7 +354,7 @@ def set_hyperparameters(self, self.new_params = new_params self.choice = self.get_components()[choice](**new_params) - self.decoder_choice = decoder_components[decoder_type](**decoder_params) # type: ignore[index] + self.decoder_choice = decoder_components[decoder_type](**decoder_params) self.pipeline = Pipeline([('net_structure', ForecastingNetworkStructure()), ('encoder', self.choice), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index b4c3490b0..e2da1c876 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -103,7 +103,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: has_hidden_states = self.encoder_properties().has_hidden_states self.encoder_output_shape = get_output_shape(self.encoder, input_shape, has_hidden_states) - if self.n_encoder_output_feature() != self.encoder_output_shape[-1]: # type: ignore + if 
self.n_encoder_output_feature() != self.encoder_output_shape[-1]: raise ValueError(f'n_encoder_output_feature ({ self.n_encoder_output_feature()}) ' f'must equal to the output dimension f({self.encoder_output_shape})') return self diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py index 217bfecd5..fcbb5955f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py @@ -63,7 +63,7 @@ def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: return x -class MLPEncoder(BaseForecastingEncoder, MLPBackbone): # type:ignore[misc] +class MLPEncoder(BaseForecastingEncoder, MLPBackbone): _fixed_seq_length = True window_size = 1 @@ -131,7 +131,7 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT } @staticmethod - def get_hyperparameter_search_space( # type: ignore + def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_groups: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_groups", value_range=(1, 5), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py index 9d034674b..23896ef7f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py @@ -51,13 +51,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: input_shape=output_shape, ) - self.input_shape = [self.window_size, output_shape[-1]] # type: ignore[assignment] + self.input_shape = [self.window_size, output_shape[-1]] has_hidden_states = self.encoder_properties().has_hidden_states - self.encoder_output_shape = get_output_shape(self.encoder, self.input_shape, has_hidden_states) # type: ignore + self.encoder_output_shape = get_output_shape(self.encoder, self.input_shape, has_hidden_states) return self - def n_encoder_output_feature(self) -> None: # type: ignore + def n_encoder_output_feature(self) -> None: # THIS function should never be called!!! 
raise NotImplementedError diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index 2d799298c..60a31b309 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -56,7 +56,7 @@ class SeqForecastingEncoderChoice(AbstractForecastingEncoderChoice): deepAR_decoder_prefix = 'block_1' tf_prefix = "temporal_fusion" - def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: # type: ignore[override] + def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: """Returns the available backbone components Args: @@ -71,7 +71,7 @@ def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: # type: igno components.update(_addons.components) return components - def get_hyperparameter_search_space( # type: ignore[override] + def get_hyperparameter_search_space( self, dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", @@ -228,12 +228,12 @@ def get_hyperparameter_search_space( # type: ignore[override] # Compile a list of legal preprocessors for this problem available_encoders: Dict[str, BaseForecastingEncoder] = self.get_available_components( dataset_properties=dataset_properties, - include=include, exclude=exclude) # type:ignore[assignment] + include=include, exclude=exclude) available_decoders: Dict[str, BaseForecastingDecoder] = self.get_available_components( dataset_properties=dataset_properties, include=None, exclude=exclude, - components=self.get_decoder_components()) # type:ignore[assignment] + components=self.get_decoder_components()) if len(available_encoders) == 0: raise ValueError("No Encoder found") @@ -394,8 +394,8 @@ def get_hyperparameter_search_space( # type: ignore[override] encoders_with_multi_decoder.append(encoder) else: encoder_with_single_decoder.append(encoder) - encoders_with_multi_decoder = set(encoders_with_multi_decoder) # type:ignore[assignment] - encoder_with_single_decoder = set(encoder_with_single_decoder) # type:ignore[assignment] + encoders_with_multi_decoder = set(encoders_with_multi_decoder) + encoder_with_single_decoder = set(encoder_with_single_decoder) cs.add_configuration_space( block_prefix + decoder_name, @@ -511,7 +511,7 @@ def get_hyperparameter_search_space( # type: ignore[override] ) for encoder_name, encoder in available_encoders.items(): - encoder_is_casual = encoder.encoder_properties() # type: ignore + encoder_is_casual = encoder.encoder_properties() if not encoder_is_casual: # we do not allow non-casual encoder to appear in the lower layer of the network. 
e.g, if we have an # encoder with 3 blocks, then non_casual encoder is only allowed to appear in the third layer @@ -682,7 +682,7 @@ def set_hyperparameters(self, if 'auto_regressive' not in decoder_params: decoder_params['auto_regressive'] = decoder_auto_regressive encoder = self.get_components()[choice](**new_params) - decoder = decoder_components[decoder_type](**decoder_params) # type:ignore + decoder = decoder_components[decoder_type](**decoder_params) pipeline_steps.extend([(f'encoder_{i}', encoder), (f'decoder_{i}', decoder)]) self.encoder_choice.append(encoder) self.decoder_choice.append(decoder) diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 7c17a308a..0bd1f580f 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -424,10 +424,10 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd x_seq._cached_time_features = None if self.dataset_small_preprocess and not self._is_uni_variant: - x_seq.X = x_all.get_group(i).transform(np.array).values # type: ignore[has-type] + x_seq.X = x_all.get_group(i).transform(np.array).values update_dict: Dict[str, Any] = {"known_future_features_index": self.known_future_features_index} - if len(self.known_future_features_index) > 0: # type: ignore[arg-type] - x_seq.X_test = x_all_test.get_group(i).transform(np.array).values # type: ignore[has-type] + if len(self.known_future_features_index) > 0: + x_seq.X_test = x_all_test.get_group(i).transform(np.array).values else: update_dict = {} diff --git a/autoPyTorch/pipeline/components/training/metrics/base.py b/autoPyTorch/pipeline/components/training/metrics/base.py index 80eb9eb8d..233b3d33d 100644 --- a/autoPyTorch/pipeline/components/training/metrics/base.py +++ b/autoPyTorch/pipeline/components/training/metrics/base.py @@ -191,7 +191,7 @@ def __call__( class _ForecastingMetric(ForecastingMetricMixin, autoPyTorchMetric): - def __call__( # type: ignore[override] + def __call__( self, y_true: np.ndarray, y_pred: np.ndarray, @@ -251,7 +251,7 @@ def __call__( # type: ignore[override] y_true = y_true.reshape((n_prediction_steps, -1)) y_pred = y_pred.reshape((n_prediction_steps, -1)) - losses_all: np.ndarray = self._metric_func(y_true=y_true, # type: ignore[assignment] + losses_all: np.ndarray = self._metric_func(y_true=y_true, y_pred=y_pred, sp=sp, horizon_weight=horizon_weight, diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 9b50e153f..4948851da 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -163,8 +163,8 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: Dict[str, to future_targets_values.to(self.device)) backcast, forecast = self.model(past_targets=past_target, past_observed_targets=past_observed_targets) - loss_func_backcast = self.criterion_preparation(**criterion_kwargs_past) # type: ignore[arg-type] - loss_func_forecast = self.criterion_preparation(**criterion_kwargs_future) # type: ignore[arg-type] + loss_func_backcast = 
self.criterion_preparation(**criterion_kwargs_past) + loss_func_forecast = self.criterion_preparation(**criterion_kwargs_future) loss_backcast = loss_func_backcast(self.criterion, backcast) * past_observed_targets.to(self.device) loss_forecast = loss_func_forecast(self.criterion, forecast) * future_observed_targets.to(self.device) @@ -197,7 +197,7 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: Dict[str, to future_targets=future_targets_values, past_observed_targets=past_observed_targets) - loss_func = self.criterion_preparation(**criterion_kwargs) # type: ignore[arg-type] + loss_func = self.criterion_preparation(**criterion_kwargs) loss = torch.mean(loss_func(self.criterion, outputs) * future_observed_targets.to(self.device)) From c584a58f876c6002d9461da46762e1050ef1f63a Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 30 May 2022 15:05:48 +0200 Subject: [PATCH 294/347] properly memory limitation for forecasting example --- examples/20_basics/example_time_series_forecasting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/20_basics/example_time_series_forecasting.py b/examples/20_basics/example_time_series_forecasting.py index 5920a93d5..e4c3d9ae2 100644 --- a/examples/20_basics/example_time_series_forecasting.py +++ b/examples/20_basics/example_time_series_forecasting.py @@ -57,7 +57,7 @@ X_test=X_test, optimize_metric='mean_MASE_forecasting', n_prediction_steps=forecasting_horizon, - memory_limit=None, + memory_limit=16 * 1024, # Currently, forecasting models need much more memories than it actually requires freq=freq, start_times=start_times, func_eval_time_limit_secs=50, From 0e371788120a5c32959055294937a5f75dd94247 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 30 May 2022 21:21:06 +0200 Subject: [PATCH 295/347] fix pre-commit --- autoPyTorch/api/base_task.py | 4 +- autoPyTorch/api/time_series_forecasting.py | 323 ++++++++++-------- autoPyTorch/data/tabular_target_validator.py | 9 +- .../data/time_series_feature_validator.py | 84 +++-- .../data/time_series_forecasting_validator.py | 179 ++++++---- .../data/time_series_target_validator.py | 49 +-- autoPyTorch/datasets/time_series_dataset.py | 44 ++- autoPyTorch/evaluation/abstract_evaluator.py | 41 +-- autoPyTorch/evaluation/tae.py | 2 +- ...time_series_forecasting_train_evaluator.py | 18 +- autoPyTorch/evaluation/train_evaluator.py | 18 +- autoPyTorch/optimizer/smbo.py | 6 +- .../TimeSeriesTransformer.py | 12 +- .../base_time_series_preprocessing.py | 4 +- .../encoding/NoEncoder.py | 8 +- .../encoding/OneHotEncoder.py | 8 +- .../encoding/__init__.py | 12 +- .../encoding/time_series_base_encoder.py | 8 +- .../imputation/TimeSeriesImputer.py | 18 +- .../scaling/base_scaler.py | 17 +- .../scaling/utils.py | 3 +- .../TimeSeriesEarlyPreProcessing.py | 13 +- .../setup/early_preprocessor/utils.py | 3 +- .../TargetMaxAbsScaler.py | 3 +- .../TargetMeanAbsScaler.py | 3 +- .../TargetMinMaxScaler.py | 3 +- .../TargetNoScaler.py | 3 +- .../TargetStandardScaler.py | 3 +- .../forecasting_target_scaling/__init__.py | 14 +- .../base_target_scaler.py | 7 +- .../setup/forecasting_target_scaling/utils.py | 3 +- .../DistributionLoss.py | 16 +- .../forecasting_training_loss/QuantileLoss.py | 10 +- .../RegressionLoss.py | 18 +- .../forecasting_training_loss/__init__.py | 16 +- .../base_forecasting_loss.py | 3 +- .../components/setup/network/base_network.py | 2 +- .../setup/network/forecasting_architecture.py | 36 +- .../setup/network/forecasting_network.py | 19 +- 
.../setup/network_backbone/__init__.py | 4 +- .../forecasting_backbone/__init__.py | 24 +- .../forecasting_backbone/cells.py | 28 +- .../forecasting_backbone/components_util.py | 4 +- .../forecasting_decoder/MLPDecoder.py | 26 +- .../forecasting_decoder/NBEATSDecoder.py | 34 +- .../forecasting_decoder/RNNDecoder.py | 14 +- .../forecasting_decoder/TransformerDecoder.py | 33 +- .../forecasting_decoder/__init__.py | 9 +- .../base_forecasting_decoder.py | 13 +- .../forecasting_decoder/components.py | 2 +- .../forecasting_encoder/__init__.py | 25 +- .../base_forecasting_encoder.py | 22 +- .../forecasting_encoder/components.py | 2 +- .../flat_encoder/MLPEncoder.py | 21 +- .../flat_encoder/NBEATSEncoder.py | 14 +- .../flat_encoder/__init__.py | 12 +- .../seq_encoder/InceptionTimeEncoder.py | 9 +- .../seq_encoder/RNNEncoder.py | 20 +- .../seq_encoder/TCNEncoder.py | 19 +- .../seq_encoder/TransformerEncoder.py | 21 +- .../seq_encoder/__init__.py | 56 +-- .../other_components/TemporalFusion.py | 22 +- .../LearnedEntityEmbedding.py | 2 +- .../setup/network_embedding/NoEmbedding.py | 2 +- .../setup/network_embedding/__init__.py | 2 +- .../base_network_embedding.py | 2 +- .../components/setup/network_head/__init__.py | 4 +- .../forecasting_network_head/NBEATS_head.py | 10 +- .../forecasting_network_head/distribution.py | 71 ++-- .../forecasting_head.py | 17 +- .../setup/network_initializer/__init__.py | 2 +- .../components/setup/optimizer/__init__.py | 2 +- .../time_series_forecasting_data_loader.py | 32 +- .../training/data_loader/time_series_util.py | 8 +- .../pipeline/components/training/losses.py | 6 +- .../components/training/metrics/base.py | 2 +- .../components/training/metrics/metrics.py | 5 +- .../components/training/metrics/utils.py | 6 +- .../training/trainer/base_trainer.py | 9 +- .../ForecastingMixUpTrainer.py | 3 +- .../ForecastingStandardTrainer.py | 3 +- .../trainer/forecasting_trainer/__init__.py | 31 +- .../forecasting_base_trainer.py | 17 +- .../pipeline/time_series_forecasting.py | 32 +- autoPyTorch/utils/common.py | 2 +- autoPyTorch/utils/pipeline.py | 4 +- test/conftest.py | 5 +- test/test_api/test_api.py | 1 - test/test_api/utils.py | 4 +- .../test_forecasting_input_validator.py | 5 +- .../test_forecasting_target_validator.py | 1 + .../test_resampling_strategies.py | 1 + .../test_time_series_datasets.py | 20 +- test/test_evaluation/evaluation_util.py | 1 - .../test_forecasting_evaluators.py | 10 +- .../preprocessing/forecasting/base.py | 9 +- .../forecasting/test_encoder_choice.py | 4 +- .../forecasting/test_encoders.py | 9 +- .../preprocessing/forecasting/test_imputer.py | 6 +- .../preprocessing/forecasting/test_scaling.py | 2 +- .../test_time_series_transformer.py | 4 +- .../test_base_components.py | 29 +- .../test_flat_backbones.py | 35 +- .../test_forecasting_architecture.py | 29 +- .../forecasting_networks/test_seq_encoder.py | 27 +- .../test_forecasting_target_scaling.py | 9 +- .../test_forecasting_training_losses.py | 24 +- .../test_pipeline/components/training/base.py | 1 - .../training/test_feature_data_loader.py | 4 +- .../training/test_forecasting_training.py | 2 +- .../training/test_time_series_data_loader.py | 25 +- test/test_pipeline/test_losses.py | 8 +- test/test_pipeline/test_metrics.py | 27 +- 113 files changed, 1102 insertions(+), 950 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 86b28ed7e..ccfceb389 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -34,11 +34,12 @@ from autoPyTorch 
import metrics from autoPyTorch.automl_common.common.utils.backend import Backend, create from autoPyTorch.constants import ( - REGRESSION_TASKS, FORECASTING_TASKS, + REGRESSION_TASKS, STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, ) +from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE from autoPyTorch.data.base_validator import BaseInputValidator from autoPyTorch.data.utils import DatasetCompressionSpec from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType @@ -73,7 +74,6 @@ from autoPyTorch.utils.results_visualizer import ColorLabelSettings, PlotSettingParams, ResultsVisualizer from autoPyTorch.utils.single_thread_client import SingleThreadedClient from autoPyTorch.utils.stopwatch import StopWatch -from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE def _pipeline_predict(pipeline: BasePipeline, diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 52dd40472..87754e1a5 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -1,28 +1,28 @@ -from typing import Any, Callable, Dict, List, Optional, Union, Tuple, Mapping +from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union import numpy as np import pandas as pd from autoPyTorch.api.base_task import BaseTask -from autoPyTorch.constants import TASK_TYPES_TO_STRING, TIMESERIES_FORECASTING -from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator -from autoPyTorch.datasets.base_dataset import BaseDataset -from autoPyTorch.datasets.resampling_strategy import ( - CrossValTypes, - HoldoutValTypes, - ResamplingStrategies, -) -from autoPyTorch.data.utils import ( - DatasetCompressionSpec, - get_dataset_compression_mapping, -) -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence -from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates +from autoPyTorch.constants import TASK_TYPES_TO_STRING, TIMESERIES_FORECASTING from autoPyTorch.constants_forecasting import MAX_WINDOW_SIZE_BASE +from autoPyTorch.data.time_series_forecasting_validator import \ + TimeSeriesForecastingInputValidator +from autoPyTorch.data.utils import (DatasetCompressionSpec, + get_dataset_compression_mapping) +from autoPyTorch.datasets.base_dataset import (BaseDataset, + BaseDatasetPropertiesType) +from autoPyTorch.datasets.resampling_strategy import (CrossValTypes, + HoldoutValTypes, + ResamplingStrategies) +from autoPyTorch.datasets.time_series_dataset import ( + TimeSeriesForecastingDataset, TimeSeriesSequence) +from autoPyTorch.pipeline.time_series_forecasting import \ + TimeSeriesForecastingPipeline +from autoPyTorch.utils.hyperparameter_search_space_update import \ + HyperparameterSearchSpaceUpdates class TimeSeriesForecastingTask(BaseTask): @@ -53,24 +53,25 @@ class TimeSeriesForecastingTask(BaseTask): """ def __init__( - self, - seed: int = 1, - n_jobs: int = 1, - logging_config: Optional[Dict] = None, - ensemble_size: int = 50, - ensemble_nbest: int = 50, - max_models_on_disc: int = 50, - temporary_directory: Optional[str] = None, - output_directory: Optional[str] = None, - delete_tmp_folder_after_terminate: bool = True, - 
delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, - resampling_strategy: Union[ - CrossValTypes, HoldoutValTypes] = HoldoutValTypes.time_series_hold_out_validation, - resampling_strategy_args: Optional[Dict[str, Any]] = None, - backend: Optional[Backend] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + self, + seed: int = 1, + n_jobs: int = 1, + logging_config: Optional[Dict] = None, + ensemble_size: int = 50, + ensemble_nbest: int = 50, + max_models_on_disc: int = 50, + temporary_directory: Optional[str] = None, + output_directory: Optional[str] = None, + delete_tmp_folder_after_terminate: bool = True, + delete_output_folder_after_terminate: bool = True, + include_components: Optional[Dict] = None, + exclude_components: Optional[Dict] = None, + resampling_strategy: Union[ + CrossValTypes, HoldoutValTypes + ] = HoldoutValTypes.time_series_hold_out_validation, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + backend: Optional[Backend] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, ): super().__init__( seed=seed, @@ -93,30 +94,32 @@ def __init__( ) # here fraction of subset could be number of images, tabular data or resolution of time-series datasets. # TODO if budget type resolution is applied to all datasets, we will put it to configs - self.pipeline_options.update({"min_resolution": 0.1, - "full_resolution": 1.0}) + self.pipeline_options.update({"min_resolution": 0.1, "full_resolution": 1.0}) self.customized_window_size = False if self.search_space_updates is not None: for update in self.search_space_updates.updates: # user has already specified a window_size range - if update.node_name == 'data_loader' and update.hyperparameter == 'window_size': + if ( + update.node_name == "data_loader" + and update.hyperparameter == "window_size" + ): self.customized_window_size = True self.time_series_forecasting = True def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]: if not isinstance(dataset, TimeSeriesForecastingDataset): - raise ValueError("Dataset is incompatible for the given task,: {}".format( - type(dataset) - )) + raise ValueError( + "Dataset is incompatible for the given task,: {}".format(type(dataset)) + ) return dataset.get_required_dataset_info() def build_pipeline( - self, - dataset_properties: Dict[str, BaseDatasetPropertiesType], - include_components: Optional[Dict[str, Any]] = None, - exclude_components: Optional[Dict[str, Any]] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, ) -> TimeSeriesForecastingPipeline: """ Build pipeline according to current task @@ -144,27 +147,29 @@ def build_pipeline( TimeSeriesForecastingPipeline: """ - return TimeSeriesForecastingPipeline(dataset_properties=dataset_properties, - include=include_components, - exclude=exclude_components, - search_space_updates=search_space_updates) + return TimeSeriesForecastingPipeline( + dataset_properties=dataset_properties, + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates, + ) def _get_dataset_input_validator( - self, - X_train: Union[List, pd.DataFrame, np.ndarray], - 
y_train: Union[List, pd.DataFrame, np.ndarray], - X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - resampling_strategy: Optional[ResamplingStrategies] = None, - resampling_strategy_args: Optional[Dict[str, Any]] = None, - dataset_name: Optional[str] = None, - dataset_compression: Optional[DatasetCompressionSpec] = None, - freq: Optional[Union[str, int, List[int]]] = None, - start_times: List[pd.DatetimeIndex] = [], - series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, - n_prediction_steps: int = 1, - known_future_features: Tuple[Union[int, str]] = (), - **forecasting_dataset_kwargs: Any, + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + dataset_compression: Optional[DatasetCompressionSpec] = None, + freq: Optional[Union[str, int, List[int]]] = None, + start_times: List[pd.DatetimeIndex] = [], + series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, + n_prediction_steps: int = 1, + known_future_features: Tuple[Union[int, str]] = (), + **forecasting_dataset_kwargs: Any, ) -> Tuple[TimeSeriesForecastingDataset, TimeSeriesForecastingInputValidator]: """ Returns an object of `TabularDataset` and an object of @@ -204,27 +209,42 @@ def _get_dataset_input_validator( the input validator fitted on the data. """ - resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy - resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \ - self.resampling_strategy_args + resampling_strategy = ( + resampling_strategy + if resampling_strategy is not None + else self.resampling_strategy + ) + resampling_strategy_args = ( + resampling_strategy_args + if resampling_strategy_args is not None + else self.resampling_strategy_args + ) # Create a validator object to make sure that the data provided by # the user matches the autopytorch requirements input_validator = TimeSeriesForecastingInputValidator( is_classification=False, logger_port=self._logger_port, - dataset_compression=dataset_compression + dataset_compression=dataset_compression, ) # Fit an input validator to check the provided data # Also, an encoder is fit to both train and test data, # to prevent unseen categories during inference - input_validator.fit(X_train=X_train, y_train=y_train, start_times=start_times, series_idx=series_idx, - X_test=X_test, y_test=y_test) + input_validator.fit( + X_train=X_train, + y_train=y_train, + start_times=start_times, + series_idx=series_idx, + X_test=X_test, + y_test=y_test, + ) dataset = TimeSeriesForecastingDataset( - X=X_train, Y=y_train, - X_test=X_test, Y_test=y_test, + X=X_train, + Y=y_train, + X_test=X_test, + Y_test=y_test, freq=freq, start_times=start_times, series_idx=series_idx, @@ -233,42 +253,43 @@ def _get_dataset_input_validator( resampling_strategy_args=resampling_strategy_args, n_prediction_steps=n_prediction_steps, known_future_features=known_future_features, - **forecasting_dataset_kwargs + **forecasting_dataset_kwargs, ) return dataset, input_validator - def search(self, - optimize_metric: str, - X_train: Optional[Union[List, pd.DataFrame]] = None, - 
y_train: Optional[Union[List, pd.DataFrame]] = None, - X_test: Optional[Union[List, pd.DataFrame]] = None, - y_test: Optional[Union[List, pd.DataFrame]] = None, - n_prediction_steps: int = 1, - freq: Optional[Union[str, int, List[int]]] = None, - start_times: Optional[List[pd.DatetimeIndex]] = None, - series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, - dataset_name: Optional[str] = None, - budget_type: str = 'epochs', - min_budget: Union[int, str] = 5, - max_budget: Union[int, str] = 50, - total_walltime_limit: int = 100, - func_eval_time_limit_secs: Optional[int] = None, - enable_traditional_pipeline: bool = False, - memory_limit: Optional[int] = 4096, - smac_scenario_args: Optional[Dict[str, Any]] = None, - get_smac_object_callback: Optional[Callable] = None, - all_supported_metrics: bool = True, - precision: int = 32, - disable_file_output: List = [], - load_models: bool = True, - portfolio_selection: Optional[str] = None, - suggested_init_models: Optional[List[str]] = None, - custom_init_setting_path: Optional[str] = None, - min_num_test_instances: Optional[int] = None, - dataset_compression: Union[Mapping[str, Any], bool] = False, - **forecasting_dataset_kwargs: Any - ) -> 'BaseTask': + def search( + self, + optimize_metric: str, + X_train: Optional[Union[List, pd.DataFrame]] = None, + y_train: Optional[Union[List, pd.DataFrame]] = None, + X_test: Optional[Union[List, pd.DataFrame]] = None, + y_test: Optional[Union[List, pd.DataFrame]] = None, + n_prediction_steps: int = 1, + freq: Optional[Union[str, int, List[int]]] = None, + start_times: Optional[List[pd.DatetimeIndex]] = None, + series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, + dataset_name: Optional[str] = None, + budget_type: str = "epochs", + min_budget: Union[int, str] = 5, + max_budget: Union[int, str] = 50, + total_walltime_limit: int = 100, + func_eval_time_limit_secs: Optional[int] = None, + enable_traditional_pipeline: bool = False, + memory_limit: Optional[int] = 4096, + smac_scenario_args: Optional[Dict[str, Any]] = None, + get_smac_object_callback: Optional[Callable] = None, + all_supported_metrics: bool = True, + precision: int = 32, + disable_file_output: List = [], + load_models: bool = True, + portfolio_selection: Optional[str] = None, + suggested_init_models: Optional[List[str]] = None, + custom_init_setting_path: Optional[str] = None, + min_num_test_instances: Optional[int] = None, + dataset_compression: Union[Mapping[str, Any], bool] = False, + **forecasting_dataset_kwargs: Any, + ) -> "BaseTask": """ Search for the best pipeline configuration for the given dataset. 
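A hedged usage sketch of the search() call declared above, on a purely univariate toy dataset; the series values, horizon, frequency alias and time budgets are placeholders and not values prescribed by this patch (the metric name and memory limit follow the basics example):

    import pandas as pd

    from autoPyTorch.api.time_series_forecasting import TimeSeriesForecastingTask

    # three toy target series of different lengths; no exogenous features, so X_train stays None
    y_train = [[1.0, 2.0, 3.0, 4.0], [10.0, 11.0, 12.0], [5.0, 5.5, 6.0, 6.5, 7.0]]
    start_times = [pd.Timestamp("2000-01-01")] * len(y_train)

    api = TimeSeriesForecastingTask()
    api.search(
        optimize_metric="mean_MASE_forecasting",
        y_train=y_train,
        n_prediction_steps=2,            # forecast horizon
        freq="1M",                       # assumed frequency alias of the toy series
        start_times=start_times,
        memory_limit=16 * 1024,
        total_walltime_limit=100,
        func_eval_time_limit_secs=50,
    )
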
@@ -387,7 +408,9 @@ def search(self, """ - self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression) + self._dataset_compression = get_dataset_compression_mapping( + memory_limit, dataset_compression + ) self.dataset, self.input_validator = self._get_dataset_input_validator( X_train=X_train, @@ -402,7 +425,7 @@ def search(self, start_times=start_times, series_idx=series_idx, n_prediction_steps=n_prediction_steps, - **forecasting_dataset_kwargs + **forecasting_dataset_kwargs, ) if self.dataset.base_window_size is not None or not self.customized_window_size: @@ -421,19 +444,26 @@ def search(self, window_size_scales = [1, 3] - self.search_space_updates.append(node_name="data_loader", - hyperparameter="window_size", - value_range=[int(window_size_scales[0] * base_window_size), - int(window_size_scales[1] * base_window_size)], - default_value=int(np.ceil(1.25 * base_window_size)), - ) - - self._metrics_kwargs = {'sp': self.dataset.seasonality, - 'n_prediction_steps': n_prediction_steps} - - forecasting_kwargs = dict(suggested_init_models=suggested_init_models, - custom_init_setting_path=custom_init_setting_path, - min_num_test_instances=min_num_test_instances) + self.search_space_updates.append( + node_name="data_loader", + hyperparameter="window_size", + value_range=[ + int(window_size_scales[0] * base_window_size), + int(window_size_scales[1] * base_window_size), + ], + default_value=int(np.ceil(1.25 * base_window_size)), + ) + + self._metrics_kwargs = { + "sp": self.dataset.seasonality, + "n_prediction_steps": n_prediction_steps, + } + + forecasting_kwargs = dict( + suggested_init_models=suggested_init_models, + custom_init_setting_path=custom_init_setting_path, + min_num_test_instances=min_num_test_instances, + ) return self._search( dataset=self.dataset, @@ -452,37 +482,50 @@ def search(self, disable_file_output=disable_file_output, load_models=load_models, portfolio_selection=portfolio_selection, - **forecasting_kwargs + **forecasting_kwargs, ) def predict( - self, - X_test: Optional[List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]]] = None, - batch_size: Optional[int] = None, - n_jobs: int = 1, - past_targets: Optional[List[np.ndarray]] = None, - future_targets: Optional[List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]]] = None, - start_times: List[pd.DatetimeIndex] = [] + self, + X_test: Optional[ + List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]] + ] = None, + batch_size: Optional[int] = None, + n_jobs: int = 1, + past_targets: Optional[List[np.ndarray]] = None, + future_targets: Optional[ + List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]] + ] = None, + start_times: List[pd.DatetimeIndex] = [], ) -> np.ndarray: """ - target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, - (used for multi-variable prediction), indicates which value needs to be predicted + target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, + (used for multi-variable prediction), indicates which value needs to be predicted """ if not isinstance(X_test[0], TimeSeriesSequence): # Validate and construct TimeSeriesSequence - X_test, _, _ = self.dataset.transform_data_into_time_series_sequence(X=X_test, - Y=past_targets, - X_test=future_targets, - start_times=start_times, - is_test_set=True - ) - flattened_res = super(TimeSeriesForecastingTask, self).predict(X_test, batch_size, n_jobs) + X_test, _, _ = self.dataset.transform_data_into_time_series_sequence( + X=X_test, + Y=past_targets, + 
X_test=future_targets, + start_times=start_times, + is_test_set=True, + ) + flattened_res = super(TimeSeriesForecastingTask, self).predict( + X_test, batch_size, n_jobs + ) if self.dataset.num_targets == 1: forecasting = flattened_res.reshape([-1, self.dataset.n_prediction_steps]) else: - forecasting = flattened_res.reshape([-1, self.dataset.n_prediction_steps, self.dataset.num_target]) + forecasting = flattened_res.reshape( + [-1, self.dataset.n_prediction_steps, self.dataset.num_target] + ) if self.dataset.normalize_y: - mean = np.repeat(self.dataset.y_mean.values(), self.dataset.n_prediction_steps) - std = np.repeat(self.dataset.y_std.values(), self.dataset.n_prediction_steps) + mean = np.repeat( + self.dataset.y_mean.values(), self.dataset.n_prediction_steps + ) + std = np.repeat( + self.dataset.y_std.values(), self.dataset.n_prediction_steps + ) return forecasting * std + mean return forecasting diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py index e19837707..b0a6a7019 100644 --- a/autoPyTorch/data/tabular_target_validator.py +++ b/autoPyTorch/data/tabular_target_validator.py @@ -1,6 +1,7 @@ from typing import List, Optional, Union, cast import numpy as np +import numpy.ma as ma import pandas as pd from pandas.api.types import is_numeric_dtype @@ -15,7 +16,6 @@ from autoPyTorch.data.base_target_validator import BaseTargetValidator, SupportedTargetTypes from autoPyTorch.utils.common import ispandas -import numpy.ma as ma ArrayType = Union[np.ndarray, spmatrix] @@ -95,10 +95,9 @@ def _fit( # We should not reach this if statement as we check for type of targets before raise ValueError("Multi-dimensional classification is not yet supported. " "Encoding multidimensional data converts multiple columns " - "to a 1 dimensional encoding. Data involved = {}/{}".format( - np.shape(y_train), - self.type_of_target - )) + "to a 1 dimensional encoding. 
Data involved = {}/{}".format(np.shape(y_train), + self.type_of_target) + ) # Mypy redefinition assert self.encoder is not None diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index fb308097b..a503f85ea 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -1,11 +1,17 @@ import logging -from typing import Optional, Union, List, Tuple, Iterable -import pandas as pd +from typing import List, Optional, Tuple, Union + + import numpy as np + +import pandas as pd + from scipy.sparse import issparse -from sklearn.preprocessing import OrdinalEncoder from sklearn.base import BaseEstimator +from sklearn.preprocessing import OrdinalEncoder + + from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator from autoPyTorch.utils.logging_ import PicklableClientLogger @@ -14,14 +20,16 @@ def df2index(df: Union[pd.DataFrame, pd.Series]) -> np.ndarray: if isinstance(df, pd.Series): seq_lengths = df.value_counts().values else: - seq_lengths = np.unique(OrdinalEncoder().fit_transform(df), axis=0, return_counts=True)[1] + seq_lengths = np.unique( + OrdinalEncoder().fit_transform(df), axis=0, return_counts=True + )[1] return np.arange(len(seq_lengths)).repeat(seq_lengths) class TimeSeriesFeatureValidator(TabularFeatureValidator): def __init__( - self, - logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None, + self, + logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None, ): super().__init__(logger) self.only_contain_series_idx = False @@ -29,13 +37,17 @@ def __init__( self.series_idx: Optional[List[Union[str, int]]] = None def get_reordered_columns(self) -> List[str]: - return self.transformed_columns + [col for col in self.column_order if col not in set(self.transformed_columns)] - - def fit(self, - X_train: Union[pd.DataFrame, np.ndarray], - X_test: Union[pd.DataFrame, np.ndarray] = None, - series_idx: Optional[Union[List[Union[str, int]]]] = None, - sequence_lengths: Optional[List[int]] = None) -> BaseEstimator: + return self.transformed_columns + [ + col for col in self.column_order if col not in set(self.transformed_columns) + ] + + def fit( + self, + X_train: Union[pd.DataFrame, np.ndarray], + X_test: Union[pd.DataFrame, np.ndarray] = None, + series_idx: Optional[Union[List[Union[str, int]]]] = None, + sequence_lengths: Optional[List[int]] = None, + ) -> BaseEstimator: """ Arguments: @@ -57,7 +69,9 @@ def fit(self, The fitted base estimator """ if issparse(X_train): - raise NotImplementedError('Sparse matrix is currently unsupported for Forecasting tasks') + raise NotImplementedError( + "Sparse matrix is currently unsupported for Forecasting tasks" + ) index = None if series_idx is not None: @@ -68,10 +82,12 @@ def fit(self, if isinstance(X_train, pd.DataFrame): for series_id in series_idx: if series_id not in X_train.columns: - raise ValueError(f"All Series ID must be contained in the training column, however, {series_id}" - f"is not part of {X_train.columns.tolist()}") + raise ValueError( + f"All Series ID must be contained in the training column, however, {series_id}" + f"is not part of {X_train.columns.tolist()}" + ) if X_train[list(series_idx)].isnull().values.any(): - raise ValueError('NaN should not exit in Series ID!') + raise ValueError("NaN should not exit in Series ID!") index = df2index(df=X_train[series_idx]) self.only_contain_series_idx = len(X_train.columns) == len(series_idx) @@ -95,8 +111,10 @@ def 
fit(self, super().fit(X_train, X_test) else: - raise NotImplementedError(f"series idx only works with pandas.DataFrame but the type of " - f"X_train is {type(X_train)} ") + raise NotImplementedError( + f"series idx only works with pandas.DataFrame but the type of " + f"X_train is {type(X_train)} " + ) else: super().fit(X_train, X_test) @@ -110,18 +128,24 @@ def fit(self, index = X_train.index else: if np.sum(sequence_lengths) != len(X_train): - raise ValueError("The Sum of Sequence length must equal to the length of hte dataset") + raise ValueError( + "The Sum of Sequence length must equal to the length of hte dataset" + ) index = np.arange(len(sequence_lengths)).repeat(sequence_lengths) X_train.index = index - static_features: pd.Series = (X_train.groupby(X_train.index).nunique() <= 1).all() - self.static_features = tuple(idx for idx in static_features.index if static_features[idx]) + static_features: pd.Series = ( + X_train.groupby(X_train.index).nunique() <= 1 + ).all() + self.static_features = tuple( + idx for idx in static_features.index if static_features[idx] + ) return self def transform( - self, - X: Union[pd.DataFrame, np.ndarray], - index: Optional[Union[pd.Index, np.ndarray]] = None, + self, + X: Union[pd.DataFrame, np.ndarray], + index: Optional[Union[pd.Index, np.ndarray]] = None, ) -> Optional[pd.DataFrame]: if self.only_contain_series_idx: return None @@ -129,20 +153,22 @@ def transform( if isinstance(X, pd.DataFrame): X = X.drop(self.series_idx, axis=1) else: - raise NotImplementedError(f"series idx only works with pandas.DataFrame but the type of " - f"X_train is {type(X)} ") + raise NotImplementedError( + f"series idx only works with pandas.DataFrame but the type of " + f"X_train is {type(X)} " + ) X_has_idx = isinstance(X, pd.DataFrame) if X_has_idx and index is None: index = X.index X = super(TimeSeriesFeatureValidator, self).transform(X) if X.ndim == 1: - X = np.expand_dims(X, -1) # type:ignore[no-redef] + X = np.expand_dims(X, -1) # type:ignore[no-redef] X: pd.DataFrame = pd.DataFrame(X, columns=self.get_reordered_columns()) if index is None: if not X_has_idx: index = np.array([0] * len(X)) else: if len(index) != X.shape[0]: - raise ValueError('Given index must have length as the input features!') + raise ValueError("Given index must have length as the input features!") X.index = index return X diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index d3695022d..234f4ed94 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -1,14 +1,19 @@ # -*- encoding: utf-8 -*- -from typing import Optional, Tuple, List, Union, Dict, Iterable +from typing import Dict, Iterable, List, Optional, Tuple, Union + + import numpy as np + import pandas as pd + from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError -from autoPyTorch.data.utils import DatasetCompressionSpec + from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.time_series_feature_validator import TimeSeriesFeatureValidator, df2index from autoPyTorch.data.time_series_target_validator import TimeSeriesTargetValidator +from autoPyTorch.data.utils import DatasetCompressionSpec class TimeSeriesForecastingInputValidator(TabularInputValidator): @@ -19,13 +24,18 @@ class TimeSeriesForecastingInputValidator(TabularInputValidator): TODO for multiple output: target names and shapes """ - def __init__(self, - 
is_classification: bool = False, - logger_port: Optional[int] = None, - dataset_compression: Optional[DatasetCompressionSpec] = None, - ) -> None: - super(TimeSeriesForecastingInputValidator, self).__init__(is_classification, logger_port, dataset_compression) - self.feature_validator: TimeSeriesFeatureValidator = TimeSeriesFeatureValidator(logger=self.logger) + def __init__( + self, + is_classification: bool = False, + logger_port: Optional[int] = None, + dataset_compression: Optional[DatasetCompressionSpec] = None, + ) -> None: + super(TimeSeriesForecastingInputValidator, self).__init__( + is_classification, logger_port, dataset_compression + ) + self.feature_validator: TimeSeriesFeatureValidator = TimeSeriesFeatureValidator( + logger=self.logger + ) self.target_validator: TimeSeriesTargetValidator = TimeSeriesTargetValidator( is_classification=self.is_classification, logger=self.logger ) @@ -35,14 +45,15 @@ def __init__(self, self.feature_names: List[str] = [] self.series_idx: Optional[Union[List[Union[str, int]], str, int]] = None - def fit(self, - X_train: Optional[Union[List, pd.DataFrame]], - y_train: Union[List, pd.DataFrame], - series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, - X_test: Optional[Union[List, pd.DataFrame]] = None, - y_test: Optional[Union[List, pd.DataFrame]] = None, - start_times: Optional[List[pd.DatetimeIndex]] = None, - ) -> BaseEstimator: + def fit( + self, + X_train: Optional[Union[List, pd.DataFrame]], + y_train: Union[List, pd.DataFrame], + series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, + X_test: Optional[Union[List, pd.DataFrame]] = None, + y_test: Optional[Union[List, pd.DataFrame]] = None, + start_times: Optional[List[pd.DatetimeIndex]] = None, + ) -> BaseEstimator: """ fit the validator with the training data, (optionally) start times and other information Args: @@ -86,8 +97,13 @@ def fit(self, if isinstance(y_train, List): # Check that the data is valid if len(X_train) != len(y_train): - raise ValueError("Inconsistent number of sequences for features and targets," - " {} for features and {} for targets".format(len(X_train), len(y_train), )) + raise ValueError( + "Inconsistent number of sequences for features and targets," + " {} for features and {} for targets".format( + len(X_train), + len(y_train), + ) + ) n_seqs = len(y_train) # X_train and y_train are stored as lists @@ -95,12 +111,19 @@ def fit(self, if y_test is not None: y_test = self.join_series(y_test, return_seq_lengths=False) - X_train, sequence_lengths = self.join_series(X_train, return_seq_lengths=True) + X_train, sequence_lengths = self.join_series( + X_train, return_seq_lengths=True + ) X_test = self.join_series(X_test) if X_test is not None else None if X_test is not None and y_test is not None: if len(X_test) != len(y_test): - raise ValueError("Inconsistent number of test datapoints for features and targets," - " {} for features and {} for targets".format(len(X_test), len(y_test), )) + raise ValueError( + "Inconsistent number of test datapoints for features and targets," + " {} for features and {} for targets".format( + len(X_test), + len(y_test), + ) + ) elif isinstance(y_train, (pd.DataFrame, pd.Series)): sequence_lengths = None assert isinstance(X_train, pd.DataFrame) @@ -111,8 +134,12 @@ def fit(self, else: raise NotImplementedError - self.feature_validator.fit(X_train, X_test, - series_idx=series_idx, sequence_lengths=sequence_lengths) + self.feature_validator.fit( + X_train, + X_test, + series_idx=series_idx, + 
sequence_lengths=sequence_lengths, + ) self.target_validator.fit(y_train, y_test) if self.feature_validator.only_contain_series_idx: @@ -121,22 +148,27 @@ def fit(self, self._is_fitted = True self.feature_names = self.feature_validator.get_reordered_columns() - self.feature_shapes = {feature_name: 1 for feature_name in self.feature_names} + self.feature_shapes = { + feature_name: 1 for feature_name in self.feature_names + } if start_times is None: - start_times = [pd.Timestamp('1900-01-01')] * n_seqs + start_times = [pd.Timestamp("1900-01-01")] * n_seqs else: - assert len(start_times) == n_seqs, 'start_times_train must have the same length as y_train!' + assert ( + len(start_times) == n_seqs + ), "start_times_train must have the same length as y_train!" self.start_times = start_times return self - def transform(self, - X: Optional[Union[List, pd.DataFrame]], - y: Optional[Union[List, pd.DataFrame]] = None, - validate_for_future_features: bool = False - ) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame], np.ndarray]: + def transform( + self, + X: Optional[Union[List, pd.DataFrame]], + y: Optional[Union[List, pd.DataFrame]] = None, + validate_for_future_features: bool = False, + ) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame], np.ndarray]: """ transform the data with the fitted validator Args: @@ -149,7 +181,9 @@ def transform(self, X """ if not self._is_fitted: - raise NotFittedError("Cannot call transform on a validator that is not fitted") + raise NotFittedError( + "Cannot call transform on a validator that is not fitted" + ) if validate_for_future_features and y is None: if X is None: return None, None, np.asarray([]) @@ -171,17 +205,19 @@ def transform(self, raise NotImplementedError else: if y is None: - raise ValueError('Targets must be given!') + raise ValueError("Targets must be given!") if isinstance(y, List): num_sequences = len(y) sequence_lengths = [0] * num_sequences if not self._is_uni_variant: if X is None: - raise ValueError('Multi Variant dataset requires X as input!') - assert len(X) == len(y), "Length of features must equal to length of targets!" + raise ValueError("Multi Variant dataset requires X as input!") + assert len(X) == len( + y + ), "Length of features must equal to length of targets!" 
if self.series_idx is not None and X is None: - raise ValueError('X must be given as series_idx!') + raise ValueError("X must be given as series_idx!") for seq_idx in range(num_sequences): sequence_lengths[seq_idx] = len(y[seq_idx]) @@ -189,8 +225,12 @@ def transform(self, y_stacked = self.join_series(y) - x_transformed, series_number = self._transform_X(X, npa_sequence_lengths) - y_transformed: pd.DataFrame = self.target_validator.transform(y_stacked, index=series_number) + x_transformed, series_number = self._transform_X( + X, npa_sequence_lengths + ) + y_transformed: pd.DataFrame = self.target_validator.transform( + y_stacked, index=series_number + ) if self._is_uni_variant: return None, y_transformed, npa_sequence_lengths @@ -202,7 +242,7 @@ def transform(self, y_columns = [y.name] else: if isinstance(y.columns, pd.RangeIndex): - y_columns = [f'target_{i}' for i in y.columns] + y_columns = [f"target_{i}" for i in y.columns] y.columns = y_columns y_columns = y.columns xy = pd.concat([X, y], axis=1) @@ -215,33 +255,51 @@ def transform(self, x_transformed, series_number = self._transform_X(X, None) if self._is_uni_variant: - y_transformed: pd.DataFrame = self.target_validator.transform(y, series_number) - return None, y_transformed, y_transformed.index.value_counts(sort=False).values - - y_transformed: pd.DataFrame = self.target_validator.transform(y, x_transformed.index) - return x_transformed, y_transformed, y_transformed.index.value_counts(sort=False).values + y_transformed: pd.DataFrame = self.target_validator.transform( + y, series_number + ) + return ( + None, + y_transformed, + y_transformed.index.value_counts(sort=False).values, + ) + + y_transformed: pd.DataFrame = self.target_validator.transform( + y, x_transformed.index + ) + return ( + x_transformed, + y_transformed, + y_transformed.index.value_counts(sort=False).values, + ) else: raise NotImplementedError - def _transform_X(self, - X: Optional[Union[List, pd.DataFrame]], - sequence_lengths: Optional[np.ndarray] = None) -> Tuple[pd.DataFrame, - Optional[Union[np.ndarray, pd.Index]]]: + def _transform_X( + self, + X: Optional[Union[List, pd.DataFrame]], + sequence_lengths: Optional[np.ndarray] = None, + ) -> Tuple[pd.DataFrame, Optional[Union[np.ndarray, pd.Index]]]: if self.series_idx is None: if self._is_uni_variant: x_transformed = None if sequence_lengths is not None: - series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) + series_number = np.arange(len(sequence_lengths)).repeat( + sequence_lengths + ) else: series_number = None else: if isinstance(X, List): assert sequence_lengths is not None - series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) + series_number = np.arange(len(sequence_lengths)).repeat( + sequence_lengths + ) x_stacked = self.join_series(X) - x_transformed = self.feature_validator.transform(x_stacked, - index=series_number) + x_transformed = self.feature_validator.transform( + x_stacked, index=series_number + ) elif isinstance(X, pd.DataFrame): series_number = X.index x_transformed = self.feature_validator.transform(X) @@ -258,17 +316,18 @@ def _transform_X(self, series_number = df2index(x_stacked[self.series_idx]) if not self._is_uni_variant: - x_transformed = self.feature_validator.transform(x_stacked, - index=series_number) + x_transformed = self.feature_validator.transform( + x_stacked, index=series_number + ) else: x_transformed = None return x_transformed, series_number @staticmethod - def join_series(X: List[Union[pd.DataFrame, np.ndarray]], - 
return_seq_lengths: bool = False) -> Union[pd.DataFrame, - Tuple[pd.DataFrame, List[int]]]: + def join_series( + X: List[Union[pd.DataFrame, np.ndarray]], return_seq_lengths: bool = False + ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, List[int]]]: """ join the series into one single value """ @@ -278,13 +337,13 @@ def join_series(X: List[Union[pd.DataFrame, np.ndarray]], sequence_lengths[seq_idx] = len(X[seq_idx]) series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) if not isinstance(X, List): - raise ValueError(f'Input must be a list, but it is {type(X)}') + raise ValueError(f"Input must be a list, but it is {type(X)}") if isinstance(X[0], (pd.DataFrame, pd.Series)): joint_input = pd.concat(X) elif isinstance(X[0], (List, np.ndarray)): joint_input = np.concatenate(X) else: - raise NotImplementedError(f'Unsupported input type: List[{type(X[0])}]') + raise NotImplementedError(f"Unsupported input type: List[{type(X[0])}]") joint_input = pd.DataFrame(joint_input) joint_input.index = series_number diff --git a/autoPyTorch/data/time_series_target_validator.py b/autoPyTorch/data/time_series_target_validator.py index 3962d9029..1ade4c361 100644 --- a/autoPyTorch/data/time_series_target_validator.py +++ b/autoPyTorch/data/time_series_target_validator.py @@ -1,41 +1,50 @@ import logging from typing import Optional, Union -import pandas as pd + import numpy as np + +import pandas as pd + from scipy.sparse import issparse + from sklearn.base import BaseEstimator -from autoPyTorch.utils.logging_ import PicklableClientLogger + from autoPyTorch.data.base_target_validator import SupportedTargetTypes -from autoPyTorch.data.tabular_target_validator import TabularTargetValidator, ArrayType +from autoPyTorch.data.tabular_target_validator import ArrayType, TabularTargetValidator +from autoPyTorch.utils.logging_ import PicklableClientLogger class TimeSeriesTargetValidator(TabularTargetValidator): - def __init__(self, - is_classification: bool = False, - logger: Optional[ - Union[PicklableClientLogger, logging.Logger] - ] = None, - ): + def __init__( + self, + is_classification: bool = False, + logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None, + ): if is_classification: - raise NotImplementedError("Classification is currently not supported for forecasting tasks!") + raise NotImplementedError( + "Classification is currently not supported for forecasting tasks!" + ) super().__init__(is_classification, logger) def fit( - self, - y_train: SupportedTargetTypes, - y_test: Optional[SupportedTargetTypes] = None, + self, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: if issparse(y_train): # TODO fix this - raise NotImplementedError("Sparse Target is unsupported for forecasting task!") + raise NotImplementedError( + "Sparse Target is unsupported for forecasting task!" + ) return super().fit(y_train, y_test) - def transform(self, - y: SupportedTargetTypes, - index: Optional[Union[pd.Index, np.ndarray]] = None, - ) -> pd.DataFrame: + def transform( + self, + y: SupportedTargetTypes, + index: Optional[Union[pd.Index, np.ndarray]] = None, + ) -> pd.DataFrame: """ Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. 
@@ -61,11 +70,11 @@ def transform(self, index = np.array([0] * y.shape[0]) # type: ignore[union-attr] else: if len(index) != y.shape[0]: # type: ignore[union-attr] - raise ValueError('Index must have length as the input targets!') + raise ValueError("Index must have length as the input targets!") if y.ndim == 1: # type: ignore[union-attr] y = np.expand_dims(y, -1) y: pd.DataFrame = pd.DataFrame(y) # type: ignore[no-redef] - y.index = index # type: ignore + y.index = index # type: ignore return y @property diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index fdf174b75..3b163ce34 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -1,28 +1,36 @@ -import os -from typing import Any, Dict, List, Optional, Tuple, Union, cast -from numbers import Real -import uuid import bisect import copy +import os +import uuid import warnings +from numbers import Real +from typing import Any, Dict, List, Optional, Tuple, Union, cast + +from gluonts.time_feature import Constant as ConstantTransform +from gluonts.time_feature import TimeFeature, time_features_from_frequency_str +from gluonts.time_feature.lag import get_lags_for_frequency import numpy as np import pandas as pd from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime + from scipy.sparse import issparse import torch -from torch.utils.data.dataset import Dataset, ConcatDataset +from torch.utils.data.dataset import ConcatDataset, Dataset import torchvision.transforms -from autoPyTorch.constants import ( - CLASSIFICATION_OUTPUTS, - STRING_TO_OUTPUT_TYPES, - TASK_TYPES_TO_STRING, - TIMESERIES_FORECASTING, -) + +from autoPyTorch.constants import (CLASSIFICATION_OUTPUTS, + STRING_TO_OUTPUT_TYPES, + TASK_TYPES_TO_STRING, + TIMESERIES_FORECASTING) +from autoPyTorch.constants_forecasting import (MAX_WINDOW_SIZE_BASE, + SEASONALITY_MAP) +from autoPyTorch.data.time_series_forecasting_validator import \ + TimeSeriesForecastingInputValidator from autoPyTorch.datasets.base_dataset import BaseDataset, type_of_target from autoPyTorch.datasets.resampling_strategy import ( CrossValFuncs, @@ -31,18 +39,9 @@ HoldOutFuncs, HoldoutValTypes ) - -from gluonts.time_feature.lag import get_lags_for_frequency -from gluonts.time_feature import ( - Constant as ConstantTransform, - TimeFeature, - time_features_from_frequency_str, -) - -from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator +from autoPyTorch.pipeline.components.training.metrics.metrics import \ + compute_mase_coefficient from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.constants_forecasting import SEASONALITY_MAP, MAX_WINDOW_SIZE_BASE -from autoPyTorch.pipeline.components.training.metrics.metrics import compute_mase_coefficient TIME_SERIES_FORECASTING_INPUT = Tuple[np.ndarray, np.ndarray] # currently only numpy arrays are supported TIME_SERIES_REGRESSION_INPUT = Tuple[np.ndarray, np.ndarray] @@ -560,7 +559,6 @@ def __init__(self, self.seq_length_median = int(np.median(self.sequence_lengths_train)) self.seq_length_max = int(np.max(self.sequence_lengths_train)) - if int(freq_value) > self.seq_length_median: self.base_window_size = self.seq_length_median else: diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index af39c64fe..aeb48d5b3 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -19,39 +19,32 @@ import 
autoPyTorch.pipeline.image_classification import autoPyTorch.pipeline.tabular_classification import autoPyTorch.pipeline.tabular_regression -import autoPyTorch.pipeline.traditional_tabular_classification import autoPyTorch.pipeline.time_series_forecasting +import autoPyTorch.pipeline.traditional_tabular_classification import autoPyTorch.pipeline.traditional_tabular_regression from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.constants import ( - CLASSIFICATION_TASKS, - IMAGE_TASKS, - MULTICLASS, - REGRESSION_TASKS, - STRING_TO_OUTPUT_TYPES, - STRING_TO_TASK_TYPES, - TABULAR_TASKS, - FORECASTING_TASKS, -) -from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType +from autoPyTorch.constants import (CLASSIFICATION_TASKS, FORECASTING_TASKS, + IMAGE_TASKS, MULTICLASS, REGRESSION_TASKS, + STRING_TO_OUTPUT_TYPES, + STRING_TO_TASK_TYPES, TABULAR_TASKS) +from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE +from autoPyTorch.datasets.base_dataset import (BaseDataset, + BaseDatasetPropertiesType) from autoPyTorch.datasets.time_series_dataset import TimeSeriesSequence - from autoPyTorch.evaluation.utils import ( - DisableFileOutputParameters, - VotingRegressorWrapper, - convert_multioutput_multiclass_to_multilabel, -) + DisableFileOutputParameters, VotingRegressorWrapper, + convert_multioutput_multiclass_to_multilabel) from autoPyTorch.pipeline.base_pipeline import BasePipeline -from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.base import \ + autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.utils import ( - calculate_loss, - get_metrics, -) + calculate_loss, get_metrics) from autoPyTorch.utils.common import dict_repr, subsampler -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates -from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger +from autoPyTorch.utils.hyperparameter_search_space_update import \ + HyperparameterSearchSpaceUpdates +from autoPyTorch.utils.logging_ import (PicklableClientLogger, + get_named_client_logger) from autoPyTorch.utils.pipeline import get_dataset_requirements -from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE __all__ = [ 'AbstractEvaluator', diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 705c6cb76..f4fda1a94 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -23,6 +23,7 @@ from smac.tae.execute_func import AbstractTAFunc from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, @@ -40,7 +41,6 @@ from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger from autoPyTorch.utils.parallel import preload_modules -from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE def fit_predict_try_except_decorator( diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 2e0fb4d3c..5ebd4688d 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ 
-1,7 +1,7 @@ import copy import warnings from multiprocessing.queues import Queue -from typing import Any, Dict, List, Optional, Tuple, Union, Sequence +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union from ConfigSpace.configuration_space import Configuration @@ -11,14 +11,18 @@ from smac.tae import StatusType -from autoPyTorch.evaluation.train_evaluator import TrainEvaluator -from autoPyTorch.evaluation.utils import DisableFileOutputParameters -from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric -from autoPyTorch.pipeline.components.training.metrics.metrics import MASE_LOSSES from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.evaluation.abstract_evaluator import DummyTimeSeriesForecastingPipeline -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.constants_forecasting import SEASONALITY_MAP +from autoPyTorch.evaluation.abstract_evaluator import \ + DummyTimeSeriesForecastingPipeline +from autoPyTorch.evaluation.train_evaluator import TrainEvaluator +from autoPyTorch.evaluation.utils import DisableFileOutputParameters +from autoPyTorch.pipeline.components.training.metrics.base import \ + autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.metrics import \ + MASE_LOSSES +from autoPyTorch.utils.hyperparameter_search_space_update import \ + HyperparameterSearchSpaceUpdates class TimeSeriesForecastingTrainEvaluator(TrainEvaluator): diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index cff9a2776..3ce1aed9c 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -10,19 +10,17 @@ from smac.tae import StatusType from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.constants import ( - CLASSIFICATION_TASKS, - MULTICLASSMULTIOUTPUT, -) -from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes +from autoPyTorch.constants import CLASSIFICATION_TASKS, MULTICLASSMULTIOUTPUT +from autoPyTorch.datasets.resampling_strategy import (CrossValTypes, + HoldoutValTypes) from autoPyTorch.evaluation.abstract_evaluator import ( - AbstractEvaluator, - fit_and_suppress_warnings -) + AbstractEvaluator, fit_and_suppress_warnings) from autoPyTorch.evaluation.utils import DisableFileOutputParameters -from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.base import \ + autoPyTorchMetric from autoPyTorch.utils.common import dict_repr, subsampler -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates +from autoPyTorch.utils.hyperparameter_search_space_update import \ + HyperparameterSearchSpaceUpdates __all__ = ['TrainEvaluator', 'eval_train_function'] diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 68661e224..0070daa17 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -20,6 +20,7 @@ from smac.utils.io.traj_logging import TrajEntry from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, DEFAULT_RESAMPLING_PARAMETERS, @@ -29,15 +30,12 @@ from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager from autoPyTorch.evaluation.tae import 
ExecuteTaFuncWithQueue, get_cost_of_crash from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator -from autoPyTorch.optimizer.utils import read_return_initial_configurations, read_forecasting_init_configurations - +from autoPyTorch.optimizer.utils import read_forecasting_init_configurations, read_return_initial_configurations from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.logging_ import get_named_client_logger from autoPyTorch.utils.stopwatch import StopWatch -from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE - def get_smac_object( scenario_dict: Dict[str, Any], diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py index 78d4b4dc4..65c1cf1f3 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py @@ -1,20 +1,18 @@ -from typing import Any, Dict, List, Optional, Union, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np + import pandas as pd from sklearn.base import BaseEstimator -from sklearn.pipeline import make_pipeline from sklearn.compose import ColumnTransformer +from sklearn.pipeline import make_pipeline from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( autoPyTorchTimeSeriesPreprocessingComponent, - autoPyTorchTimeSeriesTargetPreprocessingComponent, -) + autoPyTorchTimeSeriesTargetPreprocessingComponent) from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.utils import ( - get_time_series_preprocessers, - get_time_series_target_preprocessers, -) + get_time_series_preprocessers, get_time_series_target_preprocessers) from autoPyTorch.utils.common import FitRequirement diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py index f00cb95b5..ad327d14c 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py @@ -3,9 +3,7 @@ from sklearn.base import BaseEstimator from autoPyTorch.pipeline.components.preprocessing.base_preprocessing import ( - autoPyTorchPreprocessingComponent, - autoPyTorchTargetPreprocessingComponent -) + autoPyTorchPreprocessingComponent, autoPyTorchTargetPreprocessingComponent) class autoPyTorchTimeSeriesPreprocessingComponent(autoPyTorchPreprocessingComponent): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py index 6eb270a97..0bcdecc39 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py @@ -3,10 +3,10 @@ import numpy as np from autoPyTorch.datasets.base_dataset import 
BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.NoEncoder import NoEncoder -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.time_series_base_encoder import ( - TimeSeriesBaseEncoder, -) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.NoEncoder import \ + NoEncoder +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.time_series_base_encoder import \ + TimeSeriesBaseEncoder class TimeSeriesNoEncoder(TimeSeriesBaseEncoder): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py index c69008b86..5ac5e2550 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py @@ -3,10 +3,10 @@ import numpy as np from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.OneHotEncoder import OneHotEncoder -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.time_series_base_encoder import ( - TimeSeriesBaseEncoder, -) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.OneHotEncoder import \ + OneHotEncoder +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.time_series_base_encoder import \ + TimeSeriesBaseEncoder class TimeSeriesOneHotEncoder(TimeSeriesBaseEncoder): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/__init__.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/__init__.py index 884ea0df8..4170fff8e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/__init__.py @@ -3,15 +3,11 @@ from typing import Dict from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import EncoderChoice -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.time_series_base_encoder import ( + ThirdPartyComponents, autoPyTorchComponent, find_components) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import \ + EncoderChoice +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.time_series_base_encoder import \ TimeSeriesBaseEncoder -) - encoding_directory = os.path.split(__file__)[0] _encoders = find_components(__package__, diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py index 381ebb22d..da9ad016f 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py @@ -1,9 +1,9 @@ from typing import Any, Dict, List, Tuple, Union -from 
autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder import BaseEncoder -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( - autoPyTorchTimeSeriesPreprocessingComponent, -) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder import \ + BaseEncoder +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import \ + autoPyTorchTimeSeriesPreprocessingComponent from autoPyTorch.utils.common import FitRequirement diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py index 822aa8ff3..2dc2891d0 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py @@ -1,19 +1,21 @@ from typing import Any, Dict, List, Optional +from ConfigSpace import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter + import numpy as np + from sklearn.base import BaseEstimator -from sktime.transformations.series.impute import Imputer -from ConfigSpace import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter -from autoPyTorch.utils.common import FitRequirement +from sktime.transformations.series.impute import Imputer +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( autoPyTorchTimeSeriesPreprocessingComponent, - autoPyTorchTimeSeriesTargetPreprocessingComponent -) -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter + autoPyTorchTimeSeriesTargetPreprocessingComponent) +from autoPyTorch.utils.common import (FitRequirement, + HyperparameterSearchSpace, + add_hyperparameter) class TimeSeriesFeatureImputer(autoPyTorchTimeSeriesPreprocessingComponent): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py index bfc9aab2e..ec782abdc 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py @@ -1,17 +1,18 @@ -from typing import Any, Dict, Optional, List, Union - -import numpy as np +from typing import Any, Dict, List, Optional, Union from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +import numpy as np + from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import ( +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.base_time_series_preprocessing import \ autoPyTorchTimeSeriesPreprocessingComponent -) -from autoPyTorch.utils.common import FitRequirement -from 
autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import TimeSeriesScaler +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.utils import \ + TimeSeriesScaler +from autoPyTorch.utils.common import (FitRequirement, + HyperparameterSearchSpace, + add_hyperparameter) class BaseScaler(autoPyTorchTimeSeriesPreprocessingComponent): diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py index f80140683..abd246072 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/utils.py @@ -1,6 +1,7 @@ -from typing import Any, Union, Tuple +from typing import Any, Tuple, Union import numpy as np + import pandas as pd from sklearn.base import BaseEstimator diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py index 5a4a88f8f..d68a1cb85 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py @@ -1,15 +1,16 @@ -from typing import Any, Dict, Optional, Union, Tuple, List +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np + import pandas as pd from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.preprocessing.base_preprocessing import autoPyTorchTargetPreprocessingComponent -from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing +from autoPyTorch.pipeline.components.preprocessing.base_preprocessing import \ + autoPyTorchTargetPreprocessingComponent +from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import \ + EarlyPreprocessing from autoPyTorch.pipeline.components.setup.early_preprocessor.utils import ( - get_preprocess_transforms, - time_series_preprocess -) + get_preprocess_transforms, time_series_preprocess) from autoPyTorch.utils.common import FitRequirement diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py index c4c214fb5..a47270c7d 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py @@ -1,7 +1,8 @@ import copy -from typing import Any, Dict, List, Type, Optional, Union +from typing import Any, Dict, List, Optional, Type, Union import numpy as np + import pandas as pd from sklearn.utils import check_array diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMaxAbsScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMaxAbsScaler.py index 8245c01bd..7e913621e 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMaxAbsScaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMaxAbsScaler.py @@ -1,6 +1,7 @@ from typing import Any, Dict, Optional, Union -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import \ + 
BaseTargetScaler class TargetMaxAbsScaler(BaseTargetScaler): diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMeanAbsScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMeanAbsScaler.py index 32add9b32..38ec595ba 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMeanAbsScaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMeanAbsScaler.py @@ -1,6 +1,7 @@ from typing import Any, Dict, Optional, Union -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import \ + BaseTargetScaler class TargetMeanAbsScaler(BaseTargetScaler): diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMinMaxScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMinMaxScaler.py index 924e960b0..6e0319fb8 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMinMaxScaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMinMaxScaler.py @@ -1,6 +1,7 @@ from typing import Any, Dict, Optional, Union -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import \ + BaseTargetScaler class TargetMinMaxScaler(BaseTargetScaler): diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetNoScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetNoScaler.py index 760dd2afe..b8eef62ce 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetNoScaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetNoScaler.py @@ -1,6 +1,7 @@ from typing import Any, Dict, Optional, Union -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import \ + BaseTargetScaler class TargetNoScaler(BaseTargetScaler): diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py index 2b6f9a743..56ed89f48 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py @@ -1,6 +1,7 @@ from typing import Any, Dict, Optional, Union -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import \ + BaseTargetScaler class TargetStandardScaler(BaseTargetScaler): diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py index fce48a271..4bf851717 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py @@ -1,18 +1,16 @@ import os from collections import OrderedDict -from typing import Any, Dict, List, Optional +from typing import Dict, List, Optional import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace -from 
autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler + ThirdPartyComponents, autoPyTorchComponent, find_components) +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import \ + BaseTargetScaler scaling_directory = os.path.split(__file__)[0] _scalers = find_components(__package__, @@ -55,7 +53,7 @@ def get_hyperparameter_search_space(self, cs = ConfigurationSpace() if dataset_properties is None: - dataset_properties: Dict[str, BaseDatasetPropertiesType] = self.dataset_properties # type: ignore + dataset_properties: Dict[str, BaseDatasetPropertiesType] = self.dataset_properties # type: ignore available_scalers = self.get_available_components(dataset_properties=dataset_properties, include=include, diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py index 5318c1ca4..fb9ca3dfc 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py @@ -1,5 +1,7 @@ from typing import Any, Dict, Optional, Union +from ConfigSpace import ConfigurationSpace + import numpy as np from sklearn.base import BaseEstimator @@ -7,11 +9,10 @@ import torch -from ConfigSpace import ConfigurationSpace - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.utils import TargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.utils import \ + TargetScaler class BaseTargetScaler(autoPyTorchComponent): diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py index c8e499979..7b4782206 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/utils.py @@ -1,8 +1,9 @@ from typing import Any, Dict, Optional, Tuple -import torch from sklearn.base import BaseEstimator +import torch + # Similar to / inspired by # https://github.com/tslearn-team/tslearn/blob/a3cf3bf/tslearn/preprocessing/preprocessing.py diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py index 12fcd4410..5039a09d5 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/DistributionLoss.py @@ -1,18 +1,20 @@ -from typing import Optional, Dict, Union, Any -import numpy as np +from typing import Any, Dict, Optional, Union from ConfigSpace import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter from ConfigSpace.conditions import EqualsCondition +from 
ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter + +import numpy as np + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import ( + ForecastingLossComponents +) from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( ALL_DISTRIBUTIONS, DisForecastingStrategy ) -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import ( - ForecastingLossComponents, -) from autoPyTorch.pipeline.components.training.losses import LogProbLoss from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py index 34d0e576f..581fc8828 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/QuantileLoss.py @@ -1,15 +1,19 @@ -from typing import Optional, Dict, Union, Any from functools import partial -import numpy as np +from typing import Any, Dict, Optional, Union from ConfigSpace import ConfigurationSpace from ConfigSpace.hyperparameters import UniformFloatHyperparameter +import numpy as np + from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import \ ForecastingLossComponents from autoPyTorch.pipeline.components.training.losses import QuantileLoss -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter +from autoPyTorch.utils.common import ( + HyperparameterSearchSpace, + add_hyperparameter +) class NetworkQuantileLoss(ForecastingLossComponents): diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py index 7aa32f285..a77cd9cb9 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/RegressionLoss.py @@ -1,17 +1,19 @@ -from typing import Optional, Dict, Union +from typing import Dict, Optional, Union -import numpy as np -from ConfigSpace import ConfigurationSpace, CategoricalHyperparameter +from ConfigSpace import CategoricalHyperparameter, ConfigurationSpace -from autoPyTorch.utils.common import ( - HyperparameterSearchSpace, - add_hyperparameter -) +import numpy as np from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import \ ForecastingLossComponents -from autoPyTorch.pipeline.components.training.losses import L1Loss, MSELoss, MAPELoss, MASELoss +from autoPyTorch.pipeline.components.training.losses import ( + L1Loss, + MAPELoss, + MASELoss, + MSELoss +) +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter class RegressionLoss(ForecastingLossComponents): diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py index 32e5c0443..c6b9c50a8 100644 --- 
a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py @@ -1,21 +1,17 @@ import os -from typing import Dict, List, Optional, Any from collections import OrderedDict +from typing import Any, Dict, List, Optional -from ConfigSpace.configuration_space import ConfigurationSpace import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace +from autoPyTorch.constants import (CLASSIFICATION_TASKS, FORECASTING_TASKS, + REGRESSION_TASKS, STRING_TO_TASK_TYPES) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) - -from autoPyTorch.constants import REGRESSION_TASKS, CLASSIFICATION_TASKS, FORECASTING_TASKS, STRING_TO_TASK_TYPES - -from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import\ + ThirdPartyComponents, autoPyTorchComponent, find_components) +from autoPyTorch.pipeline.components.setup.forecasting_training_loss.base_forecasting_loss import \ ForecastingLossComponents directory = os.path.split(__file__)[0] diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py index 6133ceae1..d5d4b36a2 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/base_forecasting_loss.py @@ -1,7 +1,6 @@ -from typing import Dict, Any, Optional, Callable +from typing import Any, Callable, Dict, Optional from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent - from autoPyTorch.utils.common import FitRequirement diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py index e191d3f76..28fe83902 100644 --- a/autoPyTorch/pipeline/components/setup/network/base_network.py +++ b/autoPyTorch/pipeline/components/setup/network/base_network.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Union, List +from typing import Any, Dict, List, Optional, Union from ConfigSpace.configuration_space import ConfigurationSpace diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 61ac6c6d1..9d0c8c1f1 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -1,31 +1,27 @@ -from typing import Dict, Optional, Union, Tuple, List, Any, TypeVar - +import warnings from abc import abstractmethod +from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union import torch from torch import nn -import warnings - -from torch.distributions import ( - AffineTransform, - TransformedDistribution, -) +from torch.distributions import AffineTransform, TransformedDistribution -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler -from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util 
import NetworkStructure -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( - EncoderBlockInfo, -) +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import \ + BaseTargetScaler from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import ( - VariableSelector, - StackedEncoder, StackedDecoder, - TemporalFusionLayer + StackedEncoder, + TemporalFusionLayer, + VariableSelector ) -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ + NetworkStructure +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import \ DecoderBlockInfo -) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import \ + EncoderBlockInfo +from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import \ + _NoEmbedding ALL_NET_OUTPUT = TypeVar('ALL_NET_OUTPUT', torch.Tensor, List[torch.Tensor], torch.distributions.Distribution) @@ -1150,7 +1146,7 @@ def predict(self, class NBEATSNet(ForecastingNet): future_target_required = False - def forward(self, # type: ignore[override] + def forward(self, # type: ignore[override] past_targets: torch.Tensor, future_targets: Optional[torch.Tensor] = None, past_features: Optional[torch.Tensor] = None, diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 7b627214f..eaa90f065 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -1,6 +1,4 @@ -from typing import Any, Dict, Optional, Iterable, Tuple, List - -from ConfigSpace.configuration_space import ConfigurationSpace +from typing import Any, Dict, Iterable, List, Optional, Tuple import numpy as np @@ -8,21 +6,20 @@ from torch import nn from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler - - -from autoPyTorch.utils.common import FitRequirement, get_device_from_fit_dictionary from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent -from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent from autoPyTorch.pipeline.components.setup.network.forecasting_architecture import ( + ForecastingDeepARNet, ForecastingNet, ForecastingSeq2SeqNet, - ForecastingDeepARNet, - NBEATSNet, + NBEATSNet ) -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import \ DisForecastingStrategy +from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent +from autoPyTorch.utils.common import ( + FitRequirement, + get_device_from_fit_dictionary ) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py index ce1667320..07c4b6382 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py 
+++ b/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py @@ -1,12 +1,10 @@ import os from collections import OrderedDict -from typing import Dict, List, Optional, Any +from typing import Any, Dict, List, Optional import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace -import numpy as np - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index 25ff46db3..49a1a8956 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -1,21 +1,21 @@ from collections import OrderedDict -import numpy as np -from typing import Dict, Optional, List, Any +from typing import Any, Dict, List, Optional import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace, Configuration +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +import numpy as np -from autoPyTorch.pipeline.components.base_component import ( - autoPyTorchComponent, -) + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder \ - import FlatForecastingEncoderChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder import\ - SeqForecastingEncoderChoice -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdate +from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + flat_encoder import FlatForecastingEncoderChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + seq_encoder import SeqForecastingEncoderChoice +from autoPyTorch.utils.hyperparameter_search_space_update import \ + HyperparameterSearchSpaceUpdate class ForecastingNetworkChoice(autoPyTorchChoice): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index bc44d918b..7f382aa27 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -1,22 +1,22 @@ -from typing import Any, Dict, Optional, List, Tuple, Union, Set +from typing import Any, Dict, List, Optional, Tuple, Union + +from pytorch_forecasting.models.temporal_fusion_transformer.sub_modules import ( + GateAddNorm, + GatedResidualNetwork, + InterpretableMultiHeadAttention, + VariableSelectionNetwork +) import torch from torch import nn -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( - EncoderBlockInfo, EncoderOutputForm -) -from 
autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( - DecoderBlockInfo -) -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( - NetworkStructure, - AddLayer -) -from pytorch_forecasting.models.temporal_fusion_transformer.sub_modules import ( - GateAddNorm, GatedResidualNetwork, VariableSelectionNetwork, InterpretableMultiHeadAttention -) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( + AddLayer, NetworkStructure) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import \ + DecoderBlockInfo +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( + EncoderBlockInfo, EncoderOutputForm) class TemporalFusionLayer(nn.Module): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py index 471c067ef..e6b75277d 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py @@ -1,11 +1,9 @@ import math +from typing import Any, Dict, NamedTuple, Optional, Tuple from sklearn.base import BaseEstimator -from typing import Any, Dict, NamedTuple, Optional, Tuple - import torch - from torch import nn diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index 7c9bbb131..49de1c050 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -1,22 +1,22 @@ -from typing import Dict, Optional, Tuple, Union, Any, List +from typing import Any, Dict, List, Optional, Tuple, Union + +from ConfigSpace.conditions import EqualsCondition, GreaterThanCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter import numpy as np + import torch from torch import nn -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter -from ConfigSpace.conditions import GreaterThanCondition, EqualsCondition - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_head.utils import _activations -from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter - -from autoPyTorch.pipeline.components.setup.network_backbone. 
\ - forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ + base_forecasting_decoder import BaseForecastingDecoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import \ DecoderNetwork -) +from autoPyTorch.pipeline.components.setup.network_head.utils import \ + _activations +from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter class MLPDecoderModule(DecoderNetwork): @@ -31,7 +31,7 @@ def __init__(self, self.auto_regressive = auto_regressive def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor, - pos_idx: Optional[Tuple[int]] = None) ->torch.Tensor: + pos_idx: Optional[Tuple[int]] = None) -> torch.Tensor: if not self.auto_regressive: if len(encoder_output.shape) == 3: encoder_output = encoder_output.squeeze(1) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 0ee90110a..7db3eb245 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -1,22 +1,30 @@ -from typing import List -import torch -from ConfigSpace import ConfigurationSpace -from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformIntegerHyperparameter, \ - UniformFloatHyperparameter -from ConfigSpace.conditions import GreaterThanCondition, EqualsCondition, AndConjunction +from typing import Any, Dict, List, Optional, Tuple, Union -from typing import Dict, Optional, Tuple, Union, Any +from ConfigSpace import ConfigurationSpace +from ConfigSpace.conditions import ( + AndConjunction, + EqualsCondition, + GreaterThanCondition +) +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter +) +import torch from torch import nn from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_head.utils import _activations -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter - -from autoPyTorch.pipeline.components.setup.network_backbone. 
\ - forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder, DecoderProperties -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ + base_forecasting_decoder import BaseForecastingDecoder, DecoderProperties +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import \ DecoderNetwork +from autoPyTorch.pipeline.components.setup.network_head.utils import _activations +from autoPyTorch.utils.common import ( + HyperparameterSearchSpace, + add_hyperparameter, + get_hyperparameter ) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index f63edaa64..58b19e202 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -1,22 +1,20 @@ -from typing import Any, Dict, Optional, Tuple, List, Union +from typing import Any, Dict, List, Optional, Tuple, Union import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import Constant +import numpy as np + import torch from torch import nn -import numpy as np - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ base_forecasting_decoder import BaseForecastingDecoder, DecoderProperties -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import \ DecoderNetwork -) - from autoPyTorch.utils.common import FitRequirement @@ -60,7 +58,7 @@ class ForecastingRNNDecoder(BaseForecastingDecoder): def __init__(self, **kwargs: Any): super().__init__(**kwargs) # RNN is naturally auto-regressive. 
However, we will not consider it as a decoder for deep AR model - self.rnn_kwargs:Optional[Dict] = None + self.rnn_kwargs: Optional[Dict] = None self.lagged_value = [1, 2, 3, 4, 5, 6, 7] @property diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index 9654e5071..f85e76bb0 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -1,33 +1,32 @@ -from typing import Any, Dict, Optional, Tuple, List, Union - -import torch -from torch import nn -import numpy as np +from typing import Any, Dict, List, Optional, Tuple, Union import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, - UniformIntegerHyperparameter, - UniformFloatHyperparameter + UniformFloatHyperparameter, + UniformIntegerHyperparameter ) +import numpy as np + +import torch +from torch import nn + from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.utils.common import add_hyperparameter - +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( + PositionalEncoding, build_transformer_layers) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ base_forecasting_decoder import BaseForecastingDecoder, DecoderProperties - -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import \ DecoderNetwork +from autoPyTorch.utils.common import ( + FitRequirement, + HyperparameterSearchSpace, + add_hyperparameter, + get_hyperparameter ) -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( - PositionalEncoding, - build_transformer_layers -) - -from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter, FitRequirement class _TransformerDecoder(DecoderNetwork): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py index 9d8ca3d5c..345dd39da 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py @@ -1,12 +1,9 @@ import os -from autoPyTorch.pipeline.components.setup.network_backbone. 
\ - forecasting_backbone.forecasting_decoder.base_forecasting_decoder import BaseForecastingDecoder - from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - find_components, -) + ThirdPartyComponents, find_components) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ + base_forecasting_decoder import BaseForecastingDecoder directory = os.path.split(__file__)[0] decoders = find_components(__package__, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 9f25ea2dc..02c0038a6 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -1,15 +1,16 @@ from abc import abstractmethod -from typing import Any, Dict, Iterable, Tuple, List, Optional from collections import OrderedDict +from typing import Any, Dict, Iterable, List, Optional, Tuple from torch import nn -from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.pipeline.components.base_component import BaseEstimator, autoPyTorchComponent -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure +from autoPyTorch.pipeline.components.base_component import ( + BaseEstimator, autoPyTorchComponent) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ + NetworkStructure from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( - DecoderBlockInfo, DecoderProperties -) + DecoderBlockInfo, DecoderProperties) +from autoPyTorch.utils.common import FitRequirement class BaseForecastingDecoder(autoPyTorchComponent): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py index 33ed93f69..72350ce7d 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py @@ -1,4 +1,4 @@ -from typing import Optional, Tuple, NamedTuple +from typing import NamedTuple, Optional, Tuple import torch from torch import nn diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index b409a273a..cc43c73e9 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -1,32 +1,31 @@ import os import warnings -from collections import OrderedDict -from typing import Dict, Optional, List, Any, Type, Callable from abc import abstractmethod -from sklearn.pipeline import Pipeline +from collections import OrderedDict +from typing import Any, Callable, Dict, List, Optional, Type import 
ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace, Configuration from ConfigSpace.conditions import EqualsCondition, OrConjunction +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from sklearn.pipeline import Pipeline +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, autoPyTorchComponent, - find_components, + find_components ) -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.setup.network_backbone import NetworkBackboneChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone. \ - forecasting_encoder.base_forecasting_encoder import BaseForecastingEncoder -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ ForecastingNetworkStructure -) -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import \ - decoders, decoder_addons +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder import ( + decoder_addons, decoders) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ base_forecasting_decoder import BaseForecastingDecoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + base_forecasting_encoder import BaseForecastingEncoder directory = os.path.split(__file__)[0] _encoders = find_components(__package__, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index e2da1c876..596231cec 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -1,19 +1,21 @@ -import numpy as np +from abc import abstractmethod from collections import OrderedDict +from typing import Any, Dict, Iterable, List, Optional, Tuple + +import numpy as np -import torchvision -from autoPyTorch.utils.common import FitRequirement from torch import nn -from abc import abstractmethod -from typing import Any, Dict, Iterable, Optional, Tuple, List -from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape -from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent +import torchvision + +from autoPyTorch.pipeline.components.base_component import BaseEstimator, autoPyTorchComponent +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( - EncoderProperties, EncoderBlockInfo + EncoderBlockInfo, + EncoderProperties ) -from 
autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure +from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape +from autoPyTorch.utils.common import FitRequirement class BaseForecastingEncoder(autoPyTorchComponent): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py index 9fba83862..7f7b7e761 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py @@ -1,8 +1,8 @@ from enum import Enum +from typing import NamedTuple, Tuple import torch from torch import nn -from typing import Tuple, NamedTuple class EncoderProperties(NamedTuple): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py index fcbb5955f..eff88d11c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py @@ -1,21 +1,24 @@ -from typing import Any, Dict, List, Optional, Union, Tuple - -import torch -from torch import nn +from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +import torch +from torch import nn + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.MLPBackbone import MLPBackbone from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ base_forecasting_encoder import BaseForecastingEncoder, EncoderProperties -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import \ EncoderNetwork -) -from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_backbone.utils import _activations -from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter +from autoPyTorch.utils.common import ( + FitRequirement, + HyperparameterSearchSpace, + add_hyperparameter +) class TimeSeriesMLP(EncoderNetwork): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py index 23896ef7f..cbcbbc39c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py +++ 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py @@ -1,17 +1,17 @@ -from typing import Any, Dict, List, Optional, Union, Tuple - -from torch import nn +from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace import ConfigurationSpace +from torch import nn + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ base_forecasting_encoder import BaseForecastingEncoder, EncoderProperties -from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape -from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.utils.common import FitRequirement from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder.\ MLPEncoder import TimeSeriesMLP +from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape +from autoPyTorch.utils.common import FitRequirement class NBEATSEncoder(BaseForecastingEncoder): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py index fe44e1a7e..808209836 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py @@ -1,17 +1,15 @@ -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import ( - AbstractForecastingEncoderChoice -) - import os from collections import OrderedDict -from typing import Dict, Union, Optional, Type +from typing import Dict, Optional, Type, Union +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, autoPyTorchComponent, - find_components, + find_components ) -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import \ + AbstractForecastingEncoderChoice from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ base_forecasting_encoder import BaseForecastingEncoder diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py index 094f26ab8..7ad06b515 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py @@ -1,9 +1,7 @@ -from typing import Any, Dict, Optional, Tuple, List +from typing import Any, Dict, List, Optional, Tuple from ConfigSpace.configuration_space import ConfigurationSpace -from 
ConfigSpace.hyperparameters import ( - UniformIntegerHyperparameter -) +from ConfigSpace.hyperparameters import UniformIntegerHyperparameter import torch from torch import nn @@ -11,7 +9,8 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ base_forecasting_encoder import BaseForecastingEncoder -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter +from autoPyTorch.utils.common import (HyperparameterSearchSpace, + add_hyperparameter) # Code inspired by https://github.com/hfawaz/InceptionTime diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py index 1fd73507d..cd7a914a7 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py @@ -1,12 +1,10 @@ -from typing import Any, Dict, Optional, Tuple, List +from typing import Any, Dict, List, Optional, Tuple import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter, - UniformFloatHyperparameter, - UniformIntegerHyperparameter -) +from ConfigSpace.hyperparameters import (CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter) import torch from torch import nn @@ -14,11 +12,15 @@ from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ base_forecasting_encoder import BaseForecastingEncoder - from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( - EncoderNetwork, EncoderProperties + EncoderNetwork, + EncoderProperties +) +from autoPyTorch.utils.common import ( + HyperparameterSearchSpace, + add_hyperparameter, + get_hyperparameter ) -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter class _RNN(EncoderNetwork): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py index 666b89478..b97970d49 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py @@ -2,22 +2,23 @@ import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter, - UniformFloatHyperparameter, - UniformIntegerHyperparameter -) +from ConfigSpace.hyperparameters import (CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter) import torch from torch import nn from torch.nn.utils import weight_norm -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. 
\ + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ base_forecasting_encoder import BaseForecastingEncoder -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import \ EncoderNetwork +from autoPyTorch.utils.common import ( + HyperparameterSearchSpace, + get_hyperparameter ) -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter # _Chomp1d, _TemporalBlock and _TemporalConvNet copied from diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index f1c817411..c8348bea6 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -1,25 +1,26 @@ -from typing import Any, Dict, Optional, Tuple, List +from typing import Any, Dict, List, Optional, Tuple import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter, - UniformFloatHyperparameter, - UniformIntegerHyperparameter -) +from ConfigSpace.hyperparameters import (CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter) import torch from torch import nn from autoPyTorch.pipeline.components.base_component import BaseEstimator +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( + PositionalEncoding, build_transformer_layers) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ base_forecasting_encoder import BaseForecastingEncoder, EncoderProperties -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import \ EncoderNetwork +from autoPyTorch.utils.common import ( + HyperparameterSearchSpace, + add_hyperparameter, + get_hyperparameter ) -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ - PositionalEncoding, build_transformer_layers class _TransformerEncoder(EncoderNetwork): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index 60a31b309..3c1abf8c9 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -1,44 +1,48 @@ +import inspect 
import os from collections import OrderedDict -from typing import Dict, Optional, List, Any, Union, Type -from sklearn.pipeline import Pipeline -import inspect +from typing import Any, Dict, List, Optional, Type, Union import ConfigSpace as CS +from ConfigSpace.conditions import ( + EqualsCondition, + GreaterThanCondition, + OrConjunction +) +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +from ConfigSpace.forbidden import ( + ForbiddenAndConjunction, + ForbiddenEqualsClause, + ForbiddenInClause +) from ConfigSpace.hyperparameters import ( - Hyperparameter, - Constant, CategoricalHyperparameter, - UniformFloatHyperparameter, + Constant, + Hyperparameter, OrdinalHyperparameter, + UniformFloatHyperparameter ) -from ConfigSpace.configuration_space import ConfigurationSpace, Configuration -from ConfigSpace.conditions import ( - EqualsCondition, OrConjunction, GreaterThanCondition -) -from ConfigSpace.forbidden import ForbiddenInClause, ForbiddenEqualsClause, ForbiddenAndConjunction -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from sklearn.pipeline import Pipeline -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, - autoPyTorchComponent, - find_components, -) +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.utils.common import HyperparameterSearchSpace, get_hyperparameter - +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, autoPyTorchComponent, find_components) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ + ForecastingNetworkStructure +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ + base_forecasting_decoder import BaseForecastingDecoder from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import \ AbstractForecastingEncoderChoice - -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. \ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ base_forecasting_encoder import BaseForecastingEncoder -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ - base_forecasting_decoder import BaseForecastingDecoder -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ - ForecastingNetworkStructure -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.other_components. 
\ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.other_components.\ TemporalFusion import TemporalFusion +from autoPyTorch.utils.common import ( + HyperparameterSearchSpace, + get_hyperparameter +) directory = os.path.split(__file__)[0] _encoders = find_components(__package__, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py index 5649227b1..8b3040c75 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py @@ -1,22 +1,26 @@ -import numpy as np -import torch +from typing import Any, Dict, Iterable, List, Optional from ConfigSpace import ConfigurationSpace +from ConfigSpace.conditions import EqualsCondition from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, UniformFloatHyperparameter, UniformIntegerHyperparameter ) -from ConfigSpace.conditions import EqualsCondition -from autoPyTorch.utils.common import FitRequirement -from typing import Any, Dict, Iterable, Optional, List + +import numpy as np + +import torch from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent - -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import TemporalFusionLayer -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import \ + TemporalFusionLayer +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ + NetworkStructure +from autoPyTorch.utils.common import (FitRequirement, + HyperparameterSearchSpace, + add_hyperparameter, get_hyperparameter) class TemporalFusion(autoPyTorchComponent): diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index 66ed8013b..4a915d4f7 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Union, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py index 0c2fe2d4d..52c56bc00 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional, Union, Tuple, List +from typing import Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import ConfigurationSpace diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py 
b/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py index 2add25b3f..756e9e961 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py @@ -1,6 +1,6 @@ import os from collections import OrderedDict -from typing import Dict, List, Optional, Any +from typing import Any, Dict, List, Optional import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 5091424d7..36b8204b7 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,5 +1,5 @@ import copy -from typing import Any, Dict, Optional, Tuple, Union, List +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np diff --git a/autoPyTorch/pipeline/components/setup/network_head/__init__.py b/autoPyTorch/pipeline/components/setup/network_head/__init__.py index a6556afbe..67bbd8019 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_head/__init__.py @@ -1,12 +1,10 @@ import os from collections import OrderedDict -from typing import Dict, List, Optional, Any +from typing import Any, Dict, List, Optional import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace -import numpy as np - from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py index 6fda054df..b23929175 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py @@ -1,13 +1,15 @@ # This part of implementation follows pytorch-forecasting: # https://github.com/jdb78/pytorch-forecasting/blob/master/pytorch_forecasting/models/nbeats/sub_modules.py -import torch -from typing import Tuple, List +from typing import List, Tuple + import numpy as np + +import torch from torch import nn -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.\ - forecasting_decoder.NBEATSDecoder import NBEATSBLock +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ + NBEATSDecoder import NBEATSBLock class TransposeLinear(nn.Module): diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py index fb601ce39..6c82eacfd 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py @@ -16,23 +16,16 @@ # However, we don't simply follow their implementation mainly due to the different network backbone. 
# Additionally, scale information is not presented here to avoid - -from typing import Dict, Tuple, NamedTuple, Any, Type - from abc import abstractmethod +from typing import Any, Dict, NamedTuple, Tuple, Type import numpy as np + import torch import torch.nn as nn import torch.nn.functional as F -from torch.distributions import ( - Beta, - Distribution, - Gamma, - Normal, - Poisson, - StudentT, -) +from torch.distributions import (Beta, Distribution, Gamma, Normal, Poisson, + StudentT) class ProjectionLayer(nn.Module): @@ -44,13 +37,15 @@ class ProjectionLayer(nn.Module): # https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/torch/modules/distribution_output.py - def __init__(self, - num_in_features: int, - output_shape: Tuple[int, ...], - n_prediction_heads: int, - auto_regressive: bool, - decoder_has_local_layer: bool, - **kwargs: Any, ): + def __init__( + self, + num_in_features: int, + output_shape: Tuple[int, ...], + n_prediction_heads: int, + auto_regressive: bool, + decoder_has_local_layer: bool, + **kwargs: Any, + ): super().__init__(**kwargs) # we consider all the prediction steps holistically. thus, the output of the poj layer is @@ -68,12 +63,18 @@ def build_single_proj_layer(arg_dim: int) -> nn.Module: proj_layer (nn.Module): projection layer that maps the decoder output to parameterize distributions """ if decoder_has_local_layer: - return nn.Sequential(nn.Linear(num_in_features, np.prod(output_shape).item() * arg_dim), - nn.Unflatten(-1, (*output_shape, arg_dim))) + return nn.Sequential( + nn.Linear(num_in_features, np.prod(output_shape).item() * arg_dim), + nn.Unflatten(-1, (*output_shape, arg_dim)), + ) else: return nn.Sequential( - nn.Linear(num_in_features, n_prediction_heads * np.prod(output_shape).item() * arg_dim), - nn.Unflatten(-1, (n_prediction_heads, *output_shape, arg_dim))) + nn.Linear( + num_in_features, + n_prediction_heads * np.prod(output_shape).item() * arg_dim, + ), + nn.Unflatten(-1, (n_prediction_heads, *output_shape, arg_dim)), + ) self.proj = nn.ModuleList( [build_single_proj_layer(dim) for dim in self.arg_dims.values()] @@ -128,10 +129,8 @@ def arg_dims(self) -> Dict[str, int]: return {"df": 1, "loc": 1, "scale": 1} def domain_map( # type: ignore[override] - self, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor - ) -> Tuple[torch.Tensor, - torch.Tensor, - torch.Tensor]: + self, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: scale = F.softplus(scale) + 1e-10 df = 2.0 + F.softplus(df) return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1) @@ -149,7 +148,7 @@ def arg_dims(self) -> Dict[str, int]: return {"concentration1": 1, "concentration0": 1} def domain_map( # type: ignore[override] - self, concentration1: torch.Tensor, concentration0: torch.Tensor + self, concentration1: torch.Tensor, concentration0: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor]: # TODO we need to adapt epsilon value given the datatype of this module epsilon = 1e-10 @@ -171,7 +170,7 @@ def arg_dims(self) -> Dict[str, int]: return {"concentration": 1, "rate": 1} def domain_map( # type: ignore[override] - self, concentration: torch.Tensor, rate: torch.Tensor + self, concentration: torch.Tensor, rate: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor]: # TODO we need to adapt epsilon value given the datatype of this module epsilon = 1e-10 @@ -191,19 +190,20 @@ def arg_dims(self) -> Dict[str, int]: def domain_map(self, rate: torch.Tensor) -> Tuple[torch.Tensor]: # type: ignore[override] 
rate_pos = F.softplus(rate).clone() - return rate_pos.squeeze(-1), + return (rate_pos.squeeze(-1),) @property def dist_cls(self) -> Type[Distribution]: return Poisson -ALL_DISTRIBUTIONS = {'studentT': StudentTOutput, - 'normal': NormalOutput, - # 'beta': BetaOutput, - # 'gamma': GammaOutput, - # 'poisson': PoissonOutput - } # type: Dict[str, Type[ProjectionLayer]] +ALL_DISTRIBUTIONS = { + "studentT": StudentTOutput, + "normal": NormalOutput, + # 'beta': BetaOutput, + # 'gamma': GammaOutput, + # 'poisson': PoissonOutput +} # type: Dict[str, Type[ProjectionLayer]] class DisForecastingStrategy(NamedTuple): @@ -212,6 +212,7 @@ class DisForecastingStrategy(NamedTuple): num_samples: int = 100 aggregation: str = "mean" + # TODO find components that are compatible with beta, gamma and poisson distribution! # TODO consider how to implement NegativeBinomialOutput without scale information diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 279a3084f..3fbc0afeb 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -1,20 +1,23 @@ -from typing import Any, Dict, Iterable, Tuple, List, Optional, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +from ConfigSpace import ConfigurationSpace import numpy as np + import torch from torch import nn -from ConfigSpace import ConfigurationSpace from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import \ DecoderBlockInfo -) from autoPyTorch.pipeline.components.setup.network_head.base_network_head import NetworkHeadComponent -from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import \ - ALL_DISTRIBUTIONS, DisForecastingStrategy from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.NBEATS_head import build_NBEATS_network +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( + ALL_DISTRIBUTIONS, + DisForecastingStrategy +) +from autoPyTorch.utils.common import FitRequirement class QuantileHead(nn.Module): diff --git a/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py b/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py index 100a9c5e0..a2d382999 100644 --- a/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py @@ -1,6 +1,6 @@ import os from collections import OrderedDict -from typing import Dict, List, Optional, Any +from typing import Any, Dict, List, Optional import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace diff --git a/autoPyTorch/pipeline/components/setup/optimizer/__init__.py b/autoPyTorch/pipeline/components/setup/optimizer/__init__.py index ae31a58af..f7254f6dc 100644 --- a/autoPyTorch/pipeline/components/setup/optimizer/__init__.py +++ 
b/autoPyTorch/pipeline/components/setup/optimizer/__init__.py @@ -1,6 +1,6 @@ import os from collections import OrderedDict -from typing import Dict, List, Optional, Any +from typing import Any, Dict, List, Optional import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 0bd1f580f..0c1bae6a9 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -1,42 +1,46 @@ -from typing import Any, Dict, Optional, Union, Tuple, List, Callable, Iterator import warnings from functools import partial +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, CategoricalHyperparameter from ConfigSpace.conditions import EqualsCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import (CategoricalHyperparameter, + UniformIntegerHyperparameter) + +from gluonts.time_feature import TimeFeature import numpy as np + import pandas as pd + from sklearn.compose import ColumnTransformer + import torch import torchvision -from gluonts.time_feature import TimeFeature from autoPyTorch.datasets.time_series_dataset import ( TimeSeriesForecastingDataset, TimeSeriesSequence, extract_feature_index ) +from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader +from autoPyTorch.pipeline.components.training.data_loader.time_series_util import ( + ExpandTransformTimeSeries, + PadSequenceCollector, + SequentialSubSetSampler, + TestSequenceDataset, + TimeSeriesSampler +) from autoPyTorch.utils.common import ( FitRequirement, HyperparameterSearchSpace, - custom_collate_fn, add_hyperparameter, + custom_collate_fn, get_hyperparameter ) -from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader -from autoPyTorch.pipeline.components.training.data_loader.time_series_util import ( - TestSequenceDataset, - PadSequenceCollector, - TimeSeriesSampler, - SequentialSubSetSampler, - ExpandTransformTimeSeries -) - class TimeSeriesForecastingDataLoader(FeatureDataLoader): """This class is an interface to read time sequence data diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py index 67729a97a..bae840ef7 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py @@ -1,12 +1,12 @@ -from typing import Optional, Sequence, List, Iterator, Sized, Union, Mapping +import collections +from typing import Iterator, List, Mapping, Optional, Sequence, Sized, Union import numpy as np import torch -import collections -from torch.utils.data.sampler import SubsetRandomSampler, SequentialSampler from torch._six import string_classes -from torch.utils.data._utils.collate import np_str_obj_array_pattern, default_collate_err_msg_format, default_collate +from torch.utils.data._utils.collate import default_collate, default_collate_err_msg_format, 
np_str_obj_array_pattern +from torch.utils.data.sampler import SequentialSampler, SubsetRandomSampler from autoPyTorch.datasets.base_dataset import TransformSubset from autoPyTorch.datasets.time_series_dataset import TimeSeriesSequence diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py index 5c2c73d86..37b3e2bcf 100644 --- a/autoPyTorch/pipeline/components/training/losses.py +++ b/autoPyTorch/pipeline/components/training/losses.py @@ -16,7 +16,7 @@ MASELoss: supports continuous output types L1Loss: supports continuous output types """ -from typing import Any, Dict, Optional, Type, List, Union +from typing import Any, Dict, List, Optional, Type, Union import torch from torch.nn.modules.loss import ( @@ -27,8 +27,8 @@ ) from torch.nn.modules.loss import _Loss as Loss -from autoPyTorch.constants import BINARY, CLASSIFICATION_TASKS, CONTINUOUS, MULTICLASS, REGRESSION_TASKS, \ - FORECASTING_TASKS, STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, TASK_TYPES_TO_STRING +from autoPyTorch.constants import BINARY, CLASSIFICATION_TASKS, CONTINUOUS, FORECASTING_TASKS, MULTICLASS, \ + REGRESSION_TASKS, STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, TASK_TYPES_TO_STRING class AbstractForecastingLoss(Loss): diff --git a/autoPyTorch/pipeline/components/training/metrics/base.py b/autoPyTorch/pipeline/components/training/metrics/base.py index 233b3d33d..d19d6618f 100644 --- a/autoPyTorch/pipeline/components/training/metrics/base.py +++ b/autoPyTorch/pipeline/components/training/metrics/base.py @@ -1,5 +1,5 @@ from abc import ABCMeta -from typing import Any, Callable, List, Optional, Dict, Union +from typing import Any, Callable, List, Optional, Union import numpy as np diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index 683a6d56a..2313fc82a 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -1,11 +1,12 @@ from functools import partial +from typing import List, Union import numpy as np -from typing import List, Union -import sktime.performance_metrics.forecasting as forecasting_metrics import sklearn.metrics +import sktime.performance_metrics.forecasting as forecasting_metrics + from smac.utils.constants import MAXINT from autoPyTorch.pipeline.components.training.metrics.base import make_metric diff --git a/autoPyTorch/pipeline/components/training/metrics/utils.py b/autoPyTorch/pipeline/components/training/metrics/utils.py index 7b869c7da..080862555 100644 --- a/autoPyTorch/pipeline/components/training/metrics/utils.py +++ b/autoPyTorch/pipeline/components/training/metrics/utils.py @@ -5,17 +5,17 @@ from autoPyTorch.constants import ( CLASSIFICATION_TASKS, - REGRESSION_TASKS, FORECASTING_TASKS, + REGRESSION_TASKS, STRING_TO_TASK_TYPES, TASK_TYPES, ) from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.metrics import ( CLASSIFICATION_METRICS, - REGRESSION_METRICS, FORECASTING_METRICS, - MASE_LOSSES + MASE_LOSSES, + REGRESSION_METRICS, ) diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py index 2cd86a26d..04adce688 100644 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py @@ -12,11 +12,14 @@ from 
torch.optim.lr_scheduler import _LRScheduler from torch.utils.tensorboard.writer import SummaryWriter -from autoPyTorch.constants import REGRESSION_TASKS, FORECASTING_TASKS +from autoPyTorch.constants import FORECASTING_TASKS, REGRESSION_TASKS from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent -from autoPyTorch.pipeline.components.training.metrics.metrics import CLASSIFICATION_METRICS, REGRESSION_METRICS, \ - FORECASTING_METRICS +from autoPyTorch.pipeline.components.training.metrics.metrics import ( + CLASSIFICATION_METRICS, + FORECASTING_METRICS, + REGRESSION_METRICS, +) from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score from autoPyTorch.utils.implementations import get_loss_weight_strategy diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingMixUpTrainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingMixUpTrainer.py index aaf3fe3d7..197887339 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingMixUpTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingMixUpTrainer.py @@ -1,10 +1,9 @@ from typing import Dict, Optional, Union from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType - +from autoPyTorch.pipeline.components.training.trainer.MixUpTrainer import MixUpTrainer from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer.forecasting_base_trainer import \ ForecastingBaseTrainerComponent -from autoPyTorch.pipeline.components.training.trainer.MixUpTrainer import MixUpTrainer class ForecastingMixUpTrainer(ForecastingBaseTrainerComponent, MixUpTrainer): diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingStandardTrainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingStandardTrainer.py index 81e3bc1b7..9235565fe 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingStandardTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/ForecastingStandardTrainer.py @@ -1,10 +1,9 @@ from typing import Dict, Optional, Union from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType - +from autoPyTorch.pipeline.components.training.trainer.StandardTrainer import StandardTrainer from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer.forecasting_base_trainer import \ ForecastingBaseTrainerComponent -from autoPyTorch.pipeline.components.training.trainer.StandardTrainer import StandardTrainer class ForecastingStandardTrainer(ForecastingBaseTrainerComponent, StandardTrainer): diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index eb266fd06..cc6f0cf2a 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -1,27 +1,25 @@ import collections import os - -from typing import Dict, List, Optional - -from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer.forecasting_base_trainer import ( - ForecastingBaseTrainerComponent, -) +from typing import Dict, List from autoPyTorch.constants import STRING_TO_TASK_TYPES - 
+from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, autoPyTorchComponent, - find_components, + find_components ) -from autoPyTorch.pipeline.components.training.trainer.base_trainer import BudgetTracker - -from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler - -from autoPyTorch.utils.common import get_device_from_fit_dictionary -from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE +from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics +from autoPyTorch.pipeline.components.training.trainer import TrainerChoice +from autoPyTorch.pipeline.components.training.trainer.base_trainer import BudgetTracker +from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer.forecasting_base_trainer import ( + ForecastingBaseTrainerComponent +) +from autoPyTorch.utils.common import ( + FitRequirement, + get_device_from_fit_dictionary +) trainer_directory = os.path.split(__file__)[0] _trainers = find_components(__package__, @@ -34,9 +32,6 @@ def add_trainer(trainer: ForecastingBaseTrainerComponent) -> None: _addons.add_component(trainer) -from autoPyTorch.pipeline.components.training.trainer import TrainerChoice - - class ForecastingTrainerChoice(TrainerChoice): @property def _fit_requirements(self) -> List[FitRequirement]: diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 4948851da..17aca5ca2 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -10,17 +10,22 @@ from torch.optim.lr_scheduler import _LRScheduler from torch.utils.tensorboard.writer import SummaryWriter -from autoPyTorch.constants import REGRESSION_TASKS, FORECASTING_TASKS +from autoPyTorch.constants import FORECASTING_TASKS, REGRESSION_TASKS from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetNoScaler import TargetNoScaler from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit -from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNet, ForecastingDeepARNet, \ - NBEATSNet, ForecastingSeq2SeqNet +from autoPyTorch.pipeline.components.setup.network.forecasting_network import ( + ForecastingDeepARNet, + ForecastingNet, + ForecastingSeq2SeqNet, + NBEATSNet +) from autoPyTorch.pipeline.components.training.losses import MASELoss - from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score - -from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker +from autoPyTorch.pipeline.components.training.trainer.base_trainer import ( + BaseTrainerComponent, + BudgetTracker +) class ForecastingBaseTrainerComponent(BaseTrainerComponent, ABC): diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 4d9d9f535..7dbea4dcb 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ 
b/autoPyTorch/pipeline/time_series_forecasting.py @@ -3,9 +3,14 @@ from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration, ConfigurationSpace -from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause, ForbiddenInClause +from ConfigSpace.forbidden import ( + ForbiddenAndConjunction, + ForbiddenEqualsClause, + ForbiddenInClause +) import numpy as np + import pandas as pd from sklearn.base import RegressorMixin @@ -20,31 +25,27 @@ from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( TimeSeriesFeatureTransformer ) -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding import ( - TimeSeriesEncoderChoice -) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding import TimeSeriesEncoderChoice from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.TimeSeriesImputer import ( TimeSeriesFeatureImputer, - TimeSeriesTargetImputer, + TimeSeriesTargetImputer +) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import ( + BaseScaler ) -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler from autoPyTorch.pipeline.components.setup.early_preprocessor.TimeSeriesEarlyPreProcessing import ( TimeSeriesEarlyPreprocessing, TimeSeriesTargetEarlyPreprocessing ) +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import TargetScalerChoice +from autoPyTorch.pipeline.components.setup.forecasting_training_loss import ForecastingLossChoices from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent -from autoPyTorch.pipeline.components.setup.network_embedding import NetworkEmbeddingChoice from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone import ForecastingNetworkChoice +from autoPyTorch.pipeline.components.setup.network_embedding import NetworkEmbeddingChoice from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead -from autoPyTorch.pipeline.components.setup.network_initializer import ( - NetworkInitializerChoice -) -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import ( - TargetScalerChoice -) +from autoPyTorch.pipeline.components.setup.network_initializer import NetworkInitializerChoice from autoPyTorch.pipeline.components.setup.optimizer import OptimizerChoice -from autoPyTorch.pipeline.components.setup.forecasting_training_loss import ForecastingLossChoices from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import ( TimeSeriesForecastingDataLoader ) @@ -106,7 +107,8 @@ def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None, Returns: np.ndarray: coefficient of determination R^2 of the prediction """ - from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics, calculate_score + from autoPyTorch.pipeline.components.training.metrics.utils import ( + calculate_score, get_metrics) metrics = get_metrics(self.dataset_properties, ['mean_MAPE_forecasting']) y_pred = self.predict(X, batch_size=batch_size) r2 = calculate_score(y, y_pred, task_type=STRING_TO_TASK_TYPES[self.dataset_properties['task_type']], diff --git 
a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py index a0f29b95b..77f250164 100644 --- a/autoPyTorch/utils/common.py +++ b/autoPyTorch/utils/common.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import Any, Dict, Iterable, List, NamedTuple, Optional, Sequence, Type, Union, Callable +from typing import Any, Callable, Dict, Iterable, List, NamedTuple, Optional, Sequence, Type, Union from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( diff --git a/autoPyTorch/utils/pipeline.py b/autoPyTorch/utils/pipeline.py index 59d747363..570dbc0e8 100644 --- a/autoPyTorch/utils/pipeline.py +++ b/autoPyTorch/utils/pipeline.py @@ -4,10 +4,10 @@ from ConfigSpace.configuration_space import ConfigurationSpace from autoPyTorch.constants import ( - IMAGE_TASKS, - REGRESSION_TASKS, CLASSIFICATION_TASKS, FORECASTING_TASKS, + IMAGE_TASKS, + REGRESSION_TASKS, STRING_TO_TASK_TYPES, TABULAR_TASKS, ) diff --git a/test/conftest.py b/test/conftest.py index 908c0b6f3..e686b4904 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,3 +1,4 @@ +import datetime import logging.handlers import os import re @@ -12,7 +13,7 @@ import openml import pandas as pd -import datetime + import pytest @@ -25,8 +26,8 @@ from autoPyTorch.automl_common.common.utils.backend import create from autoPyTorch.data.tabular_validator import TabularInputValidator -from autoPyTorch.datasets.tabular_dataset import TabularDataset from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator +from autoPyTorch.datasets.tabular_dataset import TabularDataset from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.pipeline import get_dataset_requirements diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index bf8ab2eb0..49aca1c37 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -15,7 +15,6 @@ import pytest - import sklearn import sklearn.datasets from sklearn.base import BaseEstimator, clone diff --git a/test/test_api/utils.py b/test/test_api/utils.py index beff5a2c9..f4cd1c2d9 100644 --- a/test/test_api/utils.py +++ b/test/test_api/utils.py @@ -2,15 +2,15 @@ from smac.runhistory.runhistory import DataOrigin, RunHistory, RunKey, RunValue, StatusType -from autoPyTorch.constants import REGRESSION_TASKS, FORECASTING_TASKS +from autoPyTorch.constants import FORECASTING_TASKS, REGRESSION_TASKS from autoPyTorch.evaluation.abstract_evaluator import ( DummyClassificationPipeline, DummyRegressionPipeline, DummyTimeSeriesForecastingPipeline, fit_and_suppress_warnings ) -from autoPyTorch.evaluation.train_evaluator import TrainEvaluator from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator +from autoPyTorch.evaluation.train_evaluator import TrainEvaluator from autoPyTorch.pipeline.traditional_tabular_classification import TraditionalTabularClassificationPipeline diff --git a/test/test_data/test_forecasting_input_validator.py b/test/test_data/test_forecasting_input_validator.py index e5419198f..8148f1f1d 100644 --- a/test/test_data/test_forecasting_input_validator.py +++ b/test/test_data/test_forecasting_input_validator.py @@ -1,6 +1,9 @@ import numpy as np -import pytest + import pandas as pd + +import pytest + from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator diff --git 
a/test/test_data/test_forecasting_target_validator.py b/test/test_data/test_forecasting_target_validator.py index 901a15caa..0d4288cbc 100644 --- a/test/test_data/test_forecasting_target_validator.py +++ b/test/test_data/test_forecasting_target_validator.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd + import pytest from scipy import sparse diff --git a/test/test_datasets/test_resampling_strategies.py b/test/test_datasets/test_resampling_strategies.py index 1f046afb5..c37467433 100644 --- a/test/test_datasets/test_resampling_strategies.py +++ b/test/test_datasets/test_resampling_strategies.py @@ -1,4 +1,5 @@ import numpy as np + from autoPyTorch.datasets.resampling_strategy import CrossValFuncs, HoldOutFuncs diff --git a/test/test_datasets/test_time_series_datasets.py b/test/test_datasets/test_time_series_datasets.py index 4b1a6bea9..fa8faa625 100644 --- a/test/test_datasets/test_time_series_datasets.py +++ b/test/test_datasets/test_time_series_datasets.py @@ -1,20 +1,24 @@ -from typing import List, Callable, Tuple +import unittest +from typing import Callable, List, Tuple + +from gluonts.time_feature import Constant as ConstantTransform +from gluonts.time_feature import DayOfMonth import numpy as np -import torch + import pandas as pd + import pytest -import unittest -from gluonts.time_feature import Constant as ConstantTransform, DayOfMonth + +import torch + + +from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes from autoPyTorch.datasets.time_series_dataset import ( TimeSeriesForecastingDataset, TimeSeriesSequence, extract_feature_index ) -from autoPyTorch.datasets.resampling_strategy import ( - CrossValTypes, - HoldoutValTypes -) from autoPyTorch.utils.pipeline import get_dataset_requirements diff --git a/test/test_evaluation/evaluation_util.py b/test/test_evaluation/evaluation_util.py index c44782d06..a40b0e112 100644 --- a/test/test_evaluation/evaluation_util.py +++ b/test/test_evaluation/evaluation_util.py @@ -16,7 +16,6 @@ from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes from autoPyTorch.datasets.tabular_dataset import TabularDataset from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset - from autoPyTorch.pipeline.components.training.metrics.metrics import ( accuracy, balanced_accuracy, diff --git a/test/test_evaluation/test_forecasting_evaluators.py b/test/test_evaluation/test_forecasting_evaluators.py index ba25c0740..677a268e1 100644 --- a/test/test_evaluation/test_forecasting_evaluators.py +++ b/test/test_evaluation/test_forecasting_evaluators.py @@ -8,6 +8,7 @@ from ConfigSpace import Configuration import numpy as np + from smac.tae import StatusType from autoPyTorch.automl_common.common.utils.backend import create @@ -19,12 +20,9 @@ this_directory = os.path.dirname(__file__) sys.path.append(this_directory) from evaluation_util import ( # noqa (E402: module level import not at top of file) - BaseEvaluatorTest, - get_binary_classification_datamanager, - get_multiclass_classification_datamanager, - get_regression_datamanager, - get_forecasting_dataset -) # noqa (E402: module level import not at top of file) + BaseEvaluatorTest, get_binary_classification_datamanager, + get_forecasting_dataset, get_multiclass_classification_datamanager, + get_regression_datamanager) from test_evaluators import TestTrainEvaluator diff --git a/test/test_pipeline/components/preprocessing/forecasting/base.py b/test/test_pipeline/components/preprocessing/forecasting/base.py index dd7936c98..eed947113 100644 --- 
a/test/test_pipeline/components/preprocessing/forecasting/base.py +++ b/test/test_pipeline/components/preprocessing/forecasting/base.py @@ -3,14 +3,13 @@ from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.TimeSeriesTransformer import ( - TimeSeriesFeatureTransformer, TimeSeriesTargetTransformer -) -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding import ( - TimeSeriesEncoderChoice + TimeSeriesFeatureTransformer, + TimeSeriesTargetTransformer ) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding import TimeSeriesEncoderChoice from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.TimeSeriesImputer import ( TimeSeriesFeatureImputer, - TimeSeriesTargetImputer, + TimeSeriesTargetImputer ) from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.scaling.base_scaler import BaseScaler from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_encoder_choice.py b/test/test_pipeline/components/preprocessing/forecasting/test_encoder_choice.py index b0015a7fd..9079c6bec 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/test_encoder_choice.py +++ b/test/test_pipeline/components/preprocessing/forecasting/test_encoder_choice.py @@ -1,8 +1,6 @@ import unittest -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding import ( - TimeSeriesEncoderChoice -) +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding import TimeSeriesEncoderChoice class TestEncoderChoice(unittest.TestCase): diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_encoders.py b/test/test_pipeline/components/preprocessing/forecasting/test_encoders.py index e5134b890..5769650f2 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/test_encoders.py +++ b/test/test_pipeline/components/preprocessing/forecasting/test_encoders.py @@ -1,18 +1,17 @@ import unittest -import pandas as pd import numpy as np from numpy.testing import assert_array_equal +import pandas as pd + from sklearn.base import BaseEstimator from sklearn.compose import make_column_transformer -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.NoEncoder import ( +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.NoEncoder import \ TimeSeriesNoEncoder -) -from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.OneHotEncoder import ( +from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.encoding.OneHotEncoder import \ TimeSeriesOneHotEncoder -) class TestEncoders(unittest.TestCase): diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py b/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py index 9abee07ab..e219f2dca 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py +++ b/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py @@ -1,16 +1,18 @@ import unittest import numpy as np -import pandas as pd from numpy.testing import assert_array_equal +import pandas as pd + import pytest from sklearn.base import BaseEstimator, clone from 
sklearn.compose import make_column_transformer + from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.TimeSeriesImputer import ( TimeSeriesFeatureImputer, - TimeSeriesTargetImputer, + TimeSeriesTargetImputer ) diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py b/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py index eabd99e4d..047806bc5 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py +++ b/test/test_pipeline/components/preprocessing/forecasting/test_scaling.py @@ -1,6 +1,7 @@ import unittest import numpy as np + import pandas as pd from sklearn.base import BaseEstimator @@ -108,7 +109,6 @@ def test_min_max(self): transformed_test = np.concatenate([scaler.transform(raw_data) for raw_data in self.raw_data]) self.assertTrue(np.allclose(transformed_test[:, [0, -1]], transformed_test[:, [0, -1]])) - def test_max_abs_scaler(self): scaler = TimeSeriesScaler(mode='max_abs', static_features=self.static_features diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py b/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py index 842c994ea..d65c9070f 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py +++ b/test/test_pipeline/components/preprocessing/forecasting/test_time_series_transformer.py @@ -1,6 +1,8 @@ -from test.test_pipeline.components.preprocessing.forecasting.base import ForecastingPipeline +from test.test_pipeline.components.preprocessing.forecasting.base import \ + ForecastingPipeline import numpy as np + import pytest from sklearn.compose import ColumnTransformer diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py index 237b77c3f..a32117b0c 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_base_components.py @@ -3,30 +3,33 @@ import unittest from ConfigSpace import Configuration -import pandas as pd + import numpy as np + +import pandas as pd + import torch -from autoPyTorch.constants import ( - TASK_TYPES_TO_STRING, - TIMESERIES_FORECASTING, -) +from autoPyTorch.constants import TASK_TYPES_TO_STRING, TIMESERIES_FORECASTING from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone import ForecastingNetworkChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. \ - base_forecasting_encoder import BaseForecastingEncoder -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. 
\ - MLPDecoder import ForecastingMLPDecoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.MLPDecoder import ( + ForecastingMLPDecoder +) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import ( DecoderBlockInfo ) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + base_forecasting_encoder import BaseForecastingEncoder from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import ( - EncoderBlockInfo, EncoderNetwork + EncoderBlockInfo, + EncoderNetwork ) -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( - ALL_DISTRIBUTIONS, DisForecastingStrategy + ALL_DISTRIBUTIONS, + DisForecastingStrategy ) +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdate diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py index cfb0cda9f..7f1c225e0 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_flat_backbones.py @@ -1,31 +1,32 @@ import copy import unittest - -from test.test_pipeline.components.setup.forecasting.forecasting_networks.test_base_components import ( +from test.test_pipeline.components.setup.forecasting.forecasting_networks.test_base_components import \ generate_fit_dict_and_dataset_property -) + from ConfigSpace import Configuration -import torch from sklearn.pipeline import Pipeline -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder \ - import FlatForecastingEncoderChoice + +import torch + +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import ( + StackedDecoder, + StackedEncoder +) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure -from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder. \ - MLPEncoder import MLPEncoder -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder. 
\ - NBEATSEncoder import NBEATSEncoder from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.MLPDecoder import ( ForecastingMLPDecoder ) -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.NBEATSDecoder \ - import NBEATSDecoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ + NBEATSDecoder import NBEATSDecoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + flat_encoder import FlatForecastingEncoderChoice +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder.\ + MLPEncoder import MLPEncoder +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.flat_encoder.\ + NBEATSEncoder import NBEATSEncoder +from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import ( - StackedEncoder, - StackedDecoder -) class TestFlatEncoder(unittest.TestCase): diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py index 2e8ef547e..812a40e89 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py @@ -1,33 +1,28 @@ import copy import unittest +from test.test_pipeline.components.setup.forecasting.forecasting_networks.test_base_components import \ + generate_fit_dict_and_dataset_property import pytest -import torch -from test.test_pipeline.components.setup.forecasting.forecasting_networks.test_base_components import ( - generate_fit_dict_and_dataset_property -) +import torch from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetStandardScaler import TargetStandardScaler -from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding - -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone import ForecastingNetworkChoice -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ( - ForecastingHead +from autoPyTorch.pipeline.components.setup.network.forecasting_architecture import ( + AbstractForecastingNet, + get_lagged_subsequences, + get_lagged_subsequences_inference ) +from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone import ForecastingNetworkChoice +from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( - DisForecastingStrategy, ALL_DISTRIBUTIONS, + DisForecastingStrategy ) +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdate -from 
autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent -from autoPyTorch.pipeline.components.setup.network.forecasting_architecture import ( - get_lagged_subsequences, - get_lagged_subsequences_inference, - AbstractForecastingNet -) - class ReducedEmbedding(torch.nn.Module): # a dummy reduced embedding, it simply cut row for each categorical features diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py index 496a88829..67b6f85f6 100644 --- a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_seq_encoder.py @@ -1,28 +1,25 @@ import copy import unittest -import torch from itertools import product -from sklearn.pipeline import Pipeline - -from test.test_pipeline.components.setup.forecasting.forecasting_networks.test_base_components import ( +from test.test_pipeline.components.setup.forecasting.forecasting_networks.test_base_components import \ generate_fit_dict_and_dataset_property -) - -from autoPyTorch.utils.common import HyperparameterSearchSpace -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.seq_encoder \ - import SeqForecastingEncoderChoice -from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead +from sklearn.pipeline import Pipeline -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdate +import torch from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import ( - StackedEncoder, StackedDecoder, - TemporalFusionLayer, + StackedEncoder, + TemporalFusionLayer ) +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import NetworkStructure +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ + seq_encoder import SeqForecastingEncoderChoice +from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding import _NoEmbedding +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.forecasting_head import ForecastingHead +from autoPyTorch.utils.common import HyperparameterSearchSpace +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdate class TestSeqEncoder(unittest.TestCase): diff --git a/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py b/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py index 3998f518a..d094e1933 100644 --- a/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py +++ b/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py @@ -1,14 +1,15 @@ -import torch - import copy import unittest + +import torch + from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import TargetScalerChoice -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler -from 
autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetNoScaler import TargetNoScaler from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetMaxAbsScaler import TargetMaxAbsScaler from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetMeanAbsScaler import TargetMeanAbsScaler from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetMinMaxScaler import TargetMinMaxScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetNoScaler import TargetNoScaler from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetStandardScaler import TargetStandardScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler class TestTargetScalar(unittest.TestCase): diff --git a/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py b/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py index 664f1e802..1b795a0cd 100644 --- a/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py +++ b/test/test_pipeline/components/setup/forecasting/test_forecasting_training_losses.py @@ -1,21 +1,23 @@ import copy import unittest -from autoPyTorch.constants import ( - TASK_TYPES_TO_STRING, - TIMESERIES_FORECASTING, -) -from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( - ALL_DISTRIBUTIONS, - DisForecastingStrategy -) -from autoPyTorch.pipeline.components.training.losses import LogProbLoss, QuantileLoss -from autoPyTorch.pipeline.components.training.losses import L1Loss, MSELoss, MAPELoss, MASELoss - +from autoPyTorch.constants import TASK_TYPES_TO_STRING, TIMESERIES_FORECASTING from autoPyTorch.pipeline.components.setup.forecasting_training_loss import ForecastingLossChoices from autoPyTorch.pipeline.components.setup.forecasting_training_loss.DistributionLoss import DistributionLoss from autoPyTorch.pipeline.components.setup.forecasting_training_loss.QuantileLoss import NetworkQuantileLoss from autoPyTorch.pipeline.components.setup.forecasting_training_loss.RegressionLoss import RegressionLoss +from autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head.distribution import ( + ALL_DISTRIBUTIONS, + DisForecastingStrategy +) +from autoPyTorch.pipeline.components.training.losses import ( + L1Loss, + LogProbLoss, + MAPELoss, + MASELoss, + MSELoss, + QuantileLoss +) class TestForecastingTrainingLoss(unittest.TestCase): diff --git a/test/test_pipeline/components/training/base.py b/test/test_pipeline/components/training/base.py index 71f871765..b5feba9f1 100644 --- a/test/test_pipeline/components/training/base.py +++ b/test/test_pipeline/components/training/base.py @@ -12,7 +12,6 @@ OUTPUT_TYPES_TO_STRING, REGRESSION_TASKS, TASK_TYPES_TO_STRING) - from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker diff --git a/test/test_pipeline/components/training/test_feature_data_loader.py b/test/test_pipeline/components/training/test_feature_data_loader.py index 7d4c9d80d..77cf82152 100644 --- a/test/test_pipeline/components/training/test_feature_data_loader.py +++ b/test/test_pipeline/components/training/test_feature_data_loader.py @@ -3,9 +3,7 @@ import torchvision -from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import ( - FeatureDataLoader -) 
+from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader class TestFeatureDataLoader(unittest.TestCase): diff --git a/test/test_pipeline/components/training/test_forecasting_training.py b/test/test_pipeline/components/training/test_forecasting_training.py index 4734c6ab0..4c5f21517 100644 --- a/test/test_pipeline/components/training/test_forecasting_training.py +++ b/test/test_pipeline/components/training/test_forecasting_training.py @@ -1,7 +1,7 @@ import unittest -from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer import ForecastingTrainerChoice from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE +from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer import ForecastingTrainerChoice class TestGetBudgetTracker(unittest.TestCase): diff --git a/test/test_pipeline/components/training/test_time_series_data_loader.py b/test/test_pipeline/components/training/test_time_series_data_loader.py index d109be5df..f17c4e10e 100644 --- a/test/test_pipeline/components/training/test_time_series_data_loader.py +++ b/test/test_pipeline/components/training/test_time_series_data_loader.py @@ -1,30 +1,31 @@ -from typing import List import copy import unittest +import unittest.mock +from typing import List from unittest import mock + import numpy as np -import unittest.mock import pandas as pd -from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes, HoldOutFuncs -from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence import torch + import torchvision + +from autoPyTorch.datasets.resampling_strategy import HoldOutFuncs, HoldoutValTypes +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence +from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import ( + TimeSeriesForecastingDataLoader +) from autoPyTorch.pipeline.components.training.data_loader.time_series_util import ( - TestSequenceDataset, - pad_sequence_with_minimal_length, PadSequenceCollector, + SequentialSubSetSampler, + TestSequenceDataset, TimeSeriesSampler, - SequentialSubSetSampler + pad_sequence_with_minimal_length ) - from autoPyTorch.utils.common import HyperparameterSearchSpace -from autoPyTorch.pipeline.components.training.data_loader.time_series_forecasting_data_loader import ( - TimeSeriesForecastingDataLoader -) - class TestTimeSeriesForecastingDataLoader(unittest.TestCase): def setUp(self) -> None: diff --git a/test/test_pipeline/test_losses.py b/test/test_pipeline/test_losses.py index 43bbcb587..0929dc1fe 100644 --- a/test/test_pipeline/test_losses.py +++ b/test/test_pipeline/test_losses.py @@ -7,17 +7,17 @@ from torch.nn.modules.loss import _Loss as Loss from autoPyTorch.pipeline.components.training.losses import ( - get_loss, - losses, LogProbLoss, MAPELoss, MASELoss, - QuantileLoss + QuantileLoss, + get_loss, + losses ) from autoPyTorch.utils.implementations import ( LossWeightStrategyWeighted, LossWeightStrategyWeightedBinary, - get_loss_weight_strategy, + get_loss_weight_strategy ) diff --git a/test/test_pipeline/test_metrics.py b/test/test_pipeline/test_metrics.py index 0772239c5..0a40d84bb 100644 --- a/test/test_pipeline/test_metrics.py +++ b/test/test_pipeline/test_metrics.py @@ -3,6 +3,7 @@ import pytest import sklearn.metrics + import sktime.performance_metrics.forecasting as forecasting_metrics from autoPyTorch.constants import ( @@ -12,26 +13,24 @@ STRING_TO_TASK_TYPES, 
     TABULAR_CLASSIFICATION,
     TABULAR_REGRESSION,
-    TIMESERIES_FORECASTING,
-    TASK_TYPES_TO_STRING
+    TASK_TYPES_TO_STRING,
+    TIMESERIES_FORECASTING
+)
+from autoPyTorch.metrics import (
+    accuracy,
+    balanced_accuracy,
+    compute_mase_coefficient,
+    mean_squared_error
 )
-from autoPyTorch.metrics import (accuracy,
-                                 balanced_accuracy,
-                                 mean_squared_error,
-                                 compute_mase_coefficient)
 from autoPyTorch.pipeline.components.training.metrics.base import (
+    ForecastingMetricMixin,
+    _ForecastingMetric,
     _PredictMetric,
     _ThresholdMetric,
-    _ForecastingMetric,
     autoPyTorchMetric,
-    ForecastingMetricMixin,
-    make_metric,
-)
-from autoPyTorch.pipeline.components.training.metrics.utils import (
-    calculate_loss,
-    calculate_score,
-    get_metrics,
+    make_metric
 )
+from autoPyTorch.pipeline.components.training.metrics.utils import calculate_loss, calculate_score, get_metrics
 
 
 @pytest.mark.parametrize('output_type', ['multiclass',

From 1cf31b2df3e9975576fa58a015591e0076b76a47 Mon Sep 17 00:00:00 2001
From: dengdifan
Date: Tue, 31 May 2022 11:20:50 +0200
Subject: [PATCH 296/347] maint dataloader

---
 .../data_loader/time_series_forecasting_data_loader.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py
index 0c1bae6a9..5ad9fad36 100644
--- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py
+++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py
@@ -416,7 +416,8 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd
             x_all_test.index = series_number_test
 
             x_all = x_all.groupby(x_all.index)
-            x_all_test = x_all_test.groupby(x_all_test.index)
+            if len(self.known_future_features_index) > 0:
+                x_all_test = x_all_test.groupby(x_all_test.index)
 
         for i, x_seq in enumerate(X):
             if not isinstance(x_seq, TimeSeriesSequence):

From a8fa53c30ba9a81d6a6e65f1b74d017c42edeefc Mon Sep 17 00:00:00 2001
From: dengdifan
Date: Tue, 31 May 2022 11:21:44 +0200
Subject: [PATCH 297/347] remove unused auto-regressive arguments

---
 .../network_head/forecasting_network_head/distribution.py | 4 +---
 .../forecasting_network_head/forecasting_head.py          | 6 ------
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py
index 6c82eacfd..4e0b61455 100644
--- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py
+++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py
@@ -24,8 +24,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.distributions import (Beta, Distribution, Gamma, Normal, Poisson,
-                                 StudentT)
+from torch.distributions import Beta, Distribution, Gamma, Normal, Poisson, StudentT
 
 
 class ProjectionLayer(nn.Module):
@@ -42,7 +41,6 @@ def __init__(
             num_in_features: int,
             output_shape: Tuple[int, ...],
             n_prediction_heads: int,
-            auto_regressive: bool,
             decoder_has_local_layer: bool,
             **kwargs: Any,
     ):
diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py
index 3fbc0afeb..f1601af85 100644
---
a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -94,8 +94,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: raise ValueError("For Quantile losses, quantiles must be given in X!") num_quantiles = len(X['quantile_values']) - auto_regressive = X.get('auto_regressive', False) - head_n_in_features: int = X["n_decoder_output_features"] n_prediction_heads = X["n_prediction_heads"] @@ -104,7 +102,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: head_components = self.build_head( head_n_in_features=head_n_in_features, output_shape=output_shape, - auto_regressive=auto_regressive, decoder_has_local_layer=decoder_has_local_layer, net_output_type=net_output_type, dist_cls=dist_cls, @@ -163,7 +160,6 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT def build_head(self, # type: ignore[override] head_n_in_features: int, output_shape: Tuple[int, ...], - auto_regressive: bool = False, decoder_has_local_layer: bool = True, net_output_type: str = "distribution", dist_cls: Optional[str] = None, @@ -176,7 +172,6 @@ def build_head(self, # type: ignore[override] Args: head_n_in_features (int): shape of the input to the head (usually the shape of the backbone output) output_shape (Tuple[int, ...]): shape of the output of the head - auto_regressive (bool): if the network is auto-regressive decoder_has_local_layer (bool): if the decoder has local layer net_output_type (str): network output type dist_cls (Optional[str]): output distribution, only works if required_net_out_put_type is 'distribution' @@ -191,7 +186,6 @@ def build_head(self, # type: ignore[override] proj_layer = ALL_DISTRIBUTIONS[dist_cls](num_in_features=head_n_in_features, output_shape=output_shape[1:], n_prediction_heads=n_prediction_heads, - auto_regressive=auto_regressive, decoder_has_local_layer=decoder_has_local_layer ) return proj_layer From a8bd54d32e58e3f35e08629e8aa6ea9262b407f9 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 31 May 2022 17:57:58 +0200 Subject: [PATCH 298/347] fix pre-commit --- autoPyTorch/api/base_task.py | 4 +- autoPyTorch/api/time_series_forecasting.py | 29 ++-- autoPyTorch/data/tabular_target_validator.py | 22 ++- .../data/time_series_feature_validator.py | 4 +- .../data/time_series_forecasting_validator.py | 14 +- autoPyTorch/datasets/time_series_dataset.py | 49 +++--- autoPyTorch/evaluation/tae.py | 6 +- ...time_series_forecasting_train_evaluator.py | 52 +++--- autoPyTorch/evaluation/train_evaluator.py | 14 +- autoPyTorch/optimizer/smbo.py | 30 ++-- .../TimeSeriesEarlyPreProcessing.py | 2 +- .../forecasting_training_loss/__init__.py | 12 +- .../setup/network/forecasting_architecture.py | 12 +- .../setup/network/forecasting_network.py | 2 +- .../setup/network_backbone/__init__.py | 2 +- .../forecasting_backbone/__init__.py | 15 +- .../forecasting_backbone/cells.py | 5 +- .../forecasting_decoder/MLPDecoder.py | 10 +- .../forecasting_decoder/NBEATSDecoder.py | 4 +- .../base_forecasting_decoder.py | 2 +- .../forecasting_encoder/__init__.py | 15 +- .../flat_encoder/MLPEncoder.py | 6 +- .../flat_encoder/NBEATSEncoder.py | 9 +- .../flat_encoder/__init__.py | 2 +- .../seq_encoder/InceptionTimeEncoder.py | 2 +- .../seq_encoder/RNNEncoder.py | 11 +- .../seq_encoder/TCNEncoder.py | 8 +- .../seq_encoder/TransformerEncoder.py | 8 +- .../seq_encoder/__init__.py | 164 ++++++++++-------- 
.../setup/network_embedding/__init__.py | 2 +- .../components/setup/network_head/__init__.py | 2 +- .../forecasting_network_head/distribution.py | 10 +- .../forecasting_head.py | 3 +- .../components/setup/optimizer/__init__.py | 2 +- .../time_series_forecasting_data_loader.py | 17 +- .../components/training/metrics/base.py | 6 +- .../components/training/trainer/__init__.py | 2 + .../trainer/forecasting_trainer/__init__.py | 1 + .../pipeline/time_series_forecasting.py | 13 +- test/test_pipeline/test_losses.py | 60 +++---- 40 files changed, 335 insertions(+), 298 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index ccfceb389..cba22da38 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -933,7 +933,7 @@ def _search( load_models: bool = True, portfolio_selection: Optional[str] = None, dask_client: Optional[dask.distributed.Client] = None, - **kwargs: Dict[str, Any] + **kwargs: Any ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -1063,7 +1063,7 @@ def _search( `AutoPyTorch Tabular `_ time_series_forecasting: bool if time series forecasting task is implemented. - kwargs: Dict + kwargs: Any additional arguments Returns: diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 87754e1a5..46a75833a 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -165,10 +165,10 @@ def _get_dataset_input_validator( dataset_name: Optional[str] = None, dataset_compression: Optional[DatasetCompressionSpec] = None, freq: Optional[Union[str, int, List[int]]] = None, - start_times: List[pd.DatetimeIndex] = [], + start_times: Optional[List[pd.DatetimeIndex]] = None, series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, n_prediction_steps: int = 1, - known_future_features: Tuple[Union[int, str]] = (), + known_future_features: Union[Tuple[Union[int, str]], Tuple[()]] = (), **forecasting_dataset_kwargs: Any, ) -> Tuple[TimeSeriesForecastingDataset, TimeSeriesForecastingInputValidator]: """ @@ -271,8 +271,8 @@ def search( series_idx: Optional[Union[List[Union[str, int]], str, int]] = None, dataset_name: Optional[str] = None, budget_type: str = "epochs", - min_budget: Union[int, str] = 5, - max_budget: Union[int, str] = 50, + min_budget: Union[int, float] = 5, + max_budget: Union[int, float] = 50, total_walltime_limit: int = 100, func_eval_time_limit_secs: Optional[int] = None, enable_traditional_pipeline: bool = False, @@ -338,7 +338,7 @@ def search( min_budget will refer to seconds. + 'resolution': The sample resolution of time series, for instance, if a time series sequence is [0, 1, 2, 3, 4] with resolution 0.5, the sequence fed to the network is [0, 2, 4] - min_budget Union[int, str]: + min_budget Union[int, float]: Auto-PyTorch uses `Hyperband `_ to trade-off resources between running many pipelines at min_budget and running the top performing pipelines on max_budget. @@ -346,7 +346,7 @@ def search( so that we can compare and quickly discard bad performing models. For example, if the budget_type is epochs, and min_budget=5, then we will run every pipeline to a minimum of 5 epochs before performance comparison. - max_budget Union[int, str]: + max_budget Union[int, float]: Auto-PyTorch uses `Hyperband `_ to trade-off resources between running many pipelines at min_budget and running the top performing pipelines on max_budget. 
@@ -407,10 +407,10 @@ def search( self """ - - self._dataset_compression = get_dataset_compression_mapping( - memory_limit, dataset_compression - ) + if memory_limit is not None: + self._dataset_compression = get_dataset_compression_mapping( + memory_limit, dataset_compression + ) self.dataset, self.input_validator = self._get_dataset_input_validator( X_train=X_train, @@ -428,7 +428,7 @@ def search( **forecasting_dataset_kwargs, ) - if self.dataset.base_window_size is not None or not self.customized_window_size: + if self.dataset.base_window_size is not None and not self.customized_window_size: base_window_size = int(np.ceil(self.dataset.base_window_size)) # we don't want base window size to large, which might cause a too long computation time, in which case # we will use n_prediction_step instead (which is normally smaller than base_window_size) @@ -487,9 +487,7 @@ def search( def predict( self, - X_test: Optional[ - List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]] - ] = None, + X_test: List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]] = None, batch_size: Optional[int] = None, n_jobs: int = 1, past_targets: Optional[List[np.ndarray]] = None, @@ -503,8 +501,9 @@ def predict( (used for multi-variable prediction), indicates which value needs to be predicted """ if not isinstance(X_test[0], TimeSeriesSequence): + assert past_targets is not None # Validate and construct TimeSeriesSequence - X_test, _, _ = self.dataset.transform_data_into_time_series_sequence( + X_test, _, _, _ = self.dataset.transform_data_into_time_series_sequence( X=X_test, Y=past_targets, X_test=future_targets, diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py index b0a6a7019..5cd66eaa4 100644 --- a/autoPyTorch/data/tabular_target_validator.py +++ b/autoPyTorch/data/tabular_target_validator.py @@ -20,7 +20,7 @@ ArrayType = Union[np.ndarray, spmatrix] -def _check_and_to_array(y: SupportedTargetTypes, allow_nan=False) -> ArrayType: +def _check_and_to_array(y: SupportedTargetTypes, allow_nan: bool = False) -> ArrayType: """ sklearn check array will make sure we have the correct numerical features for the array """ if allow_nan: return sklearn.utils.check_array(y, force_all_finite=False, accept_sparse='csr', ensure_2d=False) @@ -28,7 +28,7 @@ def _check_and_to_array(y: SupportedTargetTypes, allow_nan=False) -> ArrayType: return sklearn.utils.check_array(y, force_all_finite=True, accept_sparse='csr', ensure_2d=False) -def _modify_regression_target(y: ArrayType, allow_nan=False) -> ArrayType: +def _modify_regression_target(y: ArrayType, allow_nan: bool = False) -> ArrayType: # Regression targets must have numbers after a decimal point. 
# Ref: https://github.com/scikit-learn/scikit-learn/issues/8952 if allow_nan: @@ -173,11 +173,11 @@ def transform(self, y: SupportedTargetTypes) -> np.ndarray: y = np.ravel(y) if self.allow_missing_values: - func_fill_na = np.nan_to_num + y_filled = np.nan_to_num(y) else: - func_fill_na = lambda x: x + y_filled = y - if not self.is_classification and "continuous" not in type_of_target(func_fill_na(y)): + if not self.is_classification and "continuous" not in type_of_target(y_filled): y = _modify_regression_target(y, self.allow_missing_values) return y @@ -229,9 +229,8 @@ def _check_data(self, y: SupportedTargetTypes) -> None: and not issparse(y): # type: ignore[misc] raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames," " pd.Series, sparse data and Python Lists as targets, yet, " - "the provided input is of type {}".format( - type(y) - )) + "the provided input is of type {}".format(type(y)) + ) # Sparse data muss be numerical # Type ignore on attribute because sparse targets have a dtype @@ -298,7 +297,6 @@ def _check_data(self, y: SupportedTargetTypes) -> None: ) if self.type_of_target not in supported_output_types: raise ValueError("Provided targets are not supported by AutoPyTorch. " - "Provided type is {} whereas supported types are {}.".format( - self.type_of_target, - supported_output_types - )) + "Provided type is {} whereas supported types are {}.".format(self.type_of_target, + supported_output_types) + ) diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index a503f85ea..462d49a0b 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -33,7 +33,7 @@ def __init__( ): super().__init__(logger) self.only_contain_series_idx = False - self.static_features: Union[Tuple[()], Tuple[int]] = () + self.static_features: Union[Tuple[()], Tuple[Union[int, str]]] = () self.series_idx: Optional[List[Union[str, int]]] = None def get_reordered_columns(self) -> List[str]: @@ -163,7 +163,7 @@ def transform( X = super(TimeSeriesFeatureValidator, self).transform(X) if X.ndim == 1: X = np.expand_dims(X, -1) # type:ignore[no-redef] - X: pd.DataFrame = pd.DataFrame(X, columns=self.get_reordered_columns()) + X: pd.DataFrame = pd.DataFrame(X, columns=self.get_reordered_columns()) # type:ignore[no-redef] if index is None: if not X_has_idx: index = np.array([0] * len(X)) diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 234f4ed94..ca6df2eef 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -45,7 +45,7 @@ def __init__( self.feature_names: List[str] = [] self.series_idx: Optional[Union[List[Union[str, int]], str, int]] = None - def fit( + def fit( # type: ignore[override] self, X_train: Optional[Union[List, pd.DataFrame]], y_train: Union[List, pd.DataFrame], @@ -69,7 +69,7 @@ def fit( """ if series_idx is not None and not isinstance(series_idx, Iterable): - series_idx: Optional[List[Union[str, int]]] = [series_idx] + series_idx: Optional[List[Union[str, int]]] = [series_idx] # type: ignore[no-redef] self.series_idx = series_idx @@ -137,7 +137,7 @@ def fit( self.feature_validator.fit( X_train, X_test, - series_idx=series_idx, + series_idx=series_idx, # type: ignore[arg-type] sequence_lengths=sequence_lengths, ) self.target_validator.fit(y_train, y_test) @@ -163,7 +163,7 @@ def fit( return self - def 
transform( + def transform( # type: ignore[override] self, X: Optional[Union[List, pd.DataFrame]], y: Optional[Union[List, pd.DataFrame]] = None, @@ -228,7 +228,7 @@ def transform( x_transformed, series_number = self._transform_X( X, npa_sequence_lengths ) - y_transformed: pd.DataFrame = self.target_validator.transform( + y_transformed = self.target_validator.transform( y_stacked, index=series_number ) @@ -255,7 +255,7 @@ def transform( x_transformed, series_number = self._transform_X(X, None) if self._is_uni_variant: - y_transformed: pd.DataFrame = self.target_validator.transform( + y_transformed = self.target_validator.transform( y, series_number ) return ( @@ -264,7 +264,7 @@ def transform( y_transformed.index.value_counts(sort=False).values, ) - y_transformed: pd.DataFrame = self.target_validator.transform( + y_transformed = self.target_validator.transform( y, x_transformed.index ) return ( diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 3b163ce34..0fd33dd81 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -22,7 +22,6 @@ import torchvision.transforms - from autoPyTorch.constants import (CLASSIFICATION_OUTPUTS, STRING_TO_OUTPUT_TYPES, TASK_TYPES_TO_STRING, @@ -37,7 +36,8 @@ CrossValTypes, DEFAULT_RESAMPLING_PARAMETERS, HoldOutFuncs, - HoldoutValTypes + HoldoutValTypes, + NoResamplingStrategyTypes, ) from autoPyTorch.pipeline.components.training.metrics.metrics import \ compute_mase_coefficient @@ -247,7 +247,7 @@ def __getitem__(self, index: int, train: bool = True) \ [past_features, self._cached_time_features[:index + 1]] # type: ignore[index] ) else: - past_features = self._cached_time_features[:index + 1] # type: ignore[index] + past_features = self._cached_time_features[:index + 1] # type: ignore[index] if future_features is not None: future_features = np.hstack([ future_features, @@ -264,17 +264,17 @@ def __getitem__(self, index: int, train: bool = True) \ targets = self.Y if self.is_test_set: if self.Y_test is not None: - future_targets = { + future_targets: Optional[Dict[str, torch.Tensor]] = { 'future_targets': torch.from_numpy(self.Y_test), 'future_observed_targets': torch.from_numpy(self.future_observed_target) } else: future_targets = None else: - future_targets = targets[index + 1: index + self.n_prediction_steps + 1] - future_targets = torch.from_numpy(future_targets) + future_targets_np = targets[index + 1: index + self.n_prediction_steps + 1] + future_targets_tt = torch.from_numpy(future_targets_np) future_targets = { - 'future_targets': future_targets, + 'future_targets': future_targets_tt, 'future_observed_targets': torch.from_numpy( self.observed_target[index + 1: index + self.n_prediction_steps + 1] ) @@ -298,7 +298,7 @@ def __getitem__(self, index: int, train: bool = True) \ 0]}, future_targets def __len__(self) -> int: - return self.Y.shape[0] if self.is_test_set else self.Y.shape[0] - self.n_prediction_steps + return int(self.Y.shape[0]) if self.is_test_set else int(self.Y.shape[0]) - self.n_prediction_steps def get_target_values(self, index: int) -> np.ndarray: """ @@ -437,7 +437,8 @@ def __init__(self, time_feature_transform: Optional[List[TimeFeature]] = None, freq: Optional[Union[str, int, List[int]]] = None, resampling_strategy: Optional[ - Union[CrossValTypes, HoldoutValTypes]] = HoldoutValTypes.time_series_hold_out_validation, + Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes] + ] = 
HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, shuffle: Optional[bool] = True, seed: Optional[int] = 42, @@ -495,14 +496,15 @@ def __init__(self, self.numerical_columns = self.validator.feature_validator.numerical_columns self.categorical_columns = self.validator.feature_validator.categorical_columns - self.num_features: int = self.validator.feature_validator.num_features - self.num_targets: int = self.validator.target_validator.out_dimensionality + self.num_features: int = self.validator.feature_validator.num_features # type: ignore[assignment] + self.num_targets: int = self.validator.target_validator.out_dimensionality # type: ignore[assignment] self.categories = self.validator.feature_validator.categories self.feature_shapes = self.validator.feature_shapes self.feature_names = tuple(self.validator.feature_names) + assert self.validator.start_times is not None self.start_times = self.validator.start_times self.static_features = self.validator.feature_validator.static_features @@ -643,7 +645,7 @@ def __init__(self, @staticmethod def compute_freq_values(freq: Optional[Union[str, int, List[int]]], - n_prediction_steps: int) -> Tuple[Real, str, Real]: + n_prediction_steps: int) -> Tuple[Union[int, float], str, Union[int, float]]: """ Compute frequency related values """ @@ -672,7 +674,7 @@ def compute_freq_values(freq: Optional[Union[str, int, List[int]]], if isinstance(seasonality, list): seasonality = min(seasonality) # Use to calculate MASE - return seasonality, freq, freq_value + return seasonality, freq, freq_value # type: ignore[return-value] @staticmethod def compute_time_features(start_times: List[pd.DatetimeIndex], @@ -708,7 +710,7 @@ def _get_dataset_indices(self, idx: int, only_dataset_idx: bool = False) -> Unio return dataset_idx, sample_idx def __len__(self) -> int: - return ConcatDataset.__len__(self) + return ConcatDataset.__len__(self) # type: ignore[no-any-return] def __getitem__(self, idx: int, # type: ignore[override] train: bool = True) -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, torch.Tensor]]]: @@ -748,7 +750,7 @@ def transform_data_into_time_series_sequence(self, Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, Y_test: Optional[ Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None, - is_test_set: bool = False,) -> Tuple[ + is_test_set: bool = False, ) -> Tuple[ List[TimeSeriesSequence], Tuple[Optional[pd.DataFrame], pd.DataFrame], Optional[Tuple[Optional[pd.DataFrame], pd.DataFrame]], @@ -896,7 +898,7 @@ def make_sequences_datasets(X: Optional[pd.DataFrame], def replace_data(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame], - known_future_features_index: List[int] = []) -> 'BaseDataset': + known_future_features_index: Union[Tuple[int], Tuple[()]] = tuple()) -> 'BaseDataset': super(TimeSeriesForecastingDataset, self).replace_data(X_train=X_train, X_test=X_test) if X_train is None: return self @@ -1025,24 +1027,26 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> @staticmethod def get_split_strategy(sequence_lengths: List[int], n_prediction_steps: int, - freq_value: Real, - resampling_strategy: Union[ - CrossValTypes, HoldoutValTypes] = HoldoutValTypes.time_series_hold_out_validation, + freq_value: Union[float, int], + resampling_strategy: Optional[Union[ + CrossValTypes, HoldoutValTypes]] = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, ) -> \ - 
Tuple[Union[CrossValTypes, HoldoutValTypes], Optional[Dict[str, Any]]]: + Tuple[Optional[Union[CrossValTypes, HoldoutValTypes]], Optional[Dict[str, Any]]]: """ Determines the most possible sampling strategy for the datasets: the lengths of each sequence might not be long enough to support cross-validation split, thus we need to carefully compute the number of folds Args: sequence_lengths (List[int]): lengths of each sequence n_prediction_steps (int): forecasting horizon - freq_value (Real): period of the dataset, determined by its sampling frequency + freq_value (Union[float, int]): period of the dataset, determined by its sampling frequency resampling_strategy(Optional[Union[CrossValTypes, HoldoutValTypes]]): resampling strategy to be checked resampling_strategy_args (Optional[Dict[str, Any]]): resampling strategy arguments to be checked Returns: resampling_strategy(Optional[Union[CrossValTypes, HoldoutValTypes]]): resampling strategy resampling_strategy_args (Optional[Dict[str, Any]]): resampling strategy arguments """ + if resampling_strategy is None: + return None, None # check if dataset could be split with cross validation minimal_seq_length = np.min(sequence_lengths) - n_prediction_steps if isinstance(resampling_strategy, CrossValTypes): @@ -1068,7 +1072,8 @@ def get_split_strategy(sequence_lengths: List[int], resampling_strategy_args = None else: seasonality_h_value = int( - np.round((n_prediction_steps // int(freq_value) + 1) * freq_value)) + np.round((n_prediction_steps // int(freq_value) + 1) * freq_value) + ) while minimal_seq_length < (num_splits - 1) * seasonality_h_value: if num_splits <= 2: diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index f4fda1a94..7f7013eb2 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -202,15 +202,15 @@ def __init__( self.search_space_updates = search_space_updates def _check_and_get_default_budget(self) -> float: - budget_type_choices = ('epochs', 'runtime') + budget_type_choices_tabular = ('epochs', 'runtime') budget_choices = { budget_type: float(self.pipeline_config.get(budget_type, np.inf)) - for budget_type in budget_type_choices + for budget_type in budget_type_choices_tabular } budget_choices_forecasting = {budget_type: 1.0 for budget_type in FORECASTING_BUDGET_TYPE} budget_choices.update(budget_choices_forecasting) - budget_type_choices = budget_type_choices + FORECASTING_BUDGET_TYPE + budget_type_choices = budget_type_choices_tabular + FORECASTING_BUDGET_TYPE # budget is defined by epochs by default budget_type = str(self.pipeline_config.get('budget_type', 'epochs')) diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 5ebd4688d..2b0c3b220 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -83,7 +83,7 @@ def __init__(self, backend: Backend, queue: Queue, seasonality = SEASONALITY_MAP.get(self.datamanager.freq, 1) if isinstance(seasonality, list): seasonality = min(seasonality) # Use to calculate MASE - self.seasonality = int(seasonality) + self.seasonality = int(seasonality) # type: ignore[call-overload] self.max_budget = max_budget self.min_num_test_instances = min_num_test_instances @@ -264,7 +264,7 @@ def fit_predict_and_loss(self) -> None: self.logger.debug("In train evaluator fit_predict_and_loss, loss:{}".format(opt_loss)) self.finish_up( loss=opt_loss, - 
train_loss=train_loss, + train_loss=train_loss, # type: ignore[arg-type] opt_pred=Y_optimization_preds.flatten(), valid_pred=Y_valid_preds, test_pred=Y_test_preds, @@ -274,7 +274,7 @@ def fit_predict_and_loss(self) -> None: **forecasting_kwargs, ) - def generate_mase_coefficient_for_validation(self, test_split: Sequence) -> np.ndarray: + def generate_mase_coefficient_for_validation(self, test_split: Sequence[int]) -> np.ndarray: """ Compute the denominator for Mean Absolute Scaled Losses, For detail, please check sktime.performance_metrics.forecasting._functions.mean_absolute_scaled_error @@ -289,9 +289,10 @@ def generate_mase_coefficient_for_validation(self, test_split: Sequence) -> np.n inverse of the mase_denominator """ mase_coefficient = np.ones([len(test_split), self.num_targets]) - if any(mase_loss in self.additional_metrics for mase_loss in MASE_LOSSES) or self.metric in MASE_LOSSES: - for seq_idx, test_idx in enumerate(test_split): - mase_coefficient[seq_idx] = self.datamanager.get_time_series_seq(test_idx).mase_coefficient + if self.additional_metrics is not None: + if any(mase_loss in self.additional_metrics for mase_loss in MASE_LOSSES) or self.metric in MASE_LOSSES: + for seq_idx, test_idx in enumerate(test_split): + mase_coefficient[seq_idx] = self.datamanager.get_time_series_seq(test_idx).mase_coefficient mase_coefficient = np.repeat(mase_coefficient, self.n_prediction_steps, axis=0) return mase_coefficient @@ -311,35 +312,34 @@ def generate_mase_coefficient_for_test_set(self) -> np.ndarray: inverse of the mase_denominator """ mase_coefficient = np.ones([len(self.datamanager.datasets), self.num_targets]) - if any(mase_loss in self.additional_metrics for mase_loss in MASE_LOSSES) or self.metric in MASE_LOSSES: - for seq_idx, test_idx in enumerate(self.datamanager.datasets): - mase_coefficient[seq_idx] = self.datamanager.datasets[seq_idx].mase_coefficient + if self.additional_metrics is not None: + if any(mase_loss in self.additional_metrics for mase_loss in MASE_LOSSES) or self.metric in MASE_LOSSES: + for seq_idx, test_idx in enumerate(self.datamanager.datasets): + mase_coefficient[seq_idx] = self.datamanager.datasets[seq_idx].mase_coefficient mase_coefficient = np.repeat(mase_coefficient, self.n_prediction_steps, axis=0) return mase_coefficient def create_validation_sub_set(self, test_indices: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]: - num_test_instances = len(test_indices) - - if num_test_instances < self.min_num_test_instances or self.budget >= self.max_budget: - # if the length of test indices is smaller than the + if self.min_num_test_instances is not None: + num_test_instances = len(test_indices) + + if num_test_instances < self.min_num_test_instances or self.budget >= self.max_budget: + # if the length of test indices is smaller than the + return test_indices, None + num_val_instance = min(num_test_instances, + max(self.min_num_test_instances, + int(num_test_instances * self.budget / self.max_budget) + )) + test_subset_indices = np.linspace(0, num_test_instances, num_val_instance, endpoint=False, dtype=np.int) + return test_indices[test_subset_indices], test_subset_indices + else: return test_indices, None - num_val_instance = min(num_test_instances, - max(self.min_num_test_instances, - int(num_test_instances * self.budget / self.max_budget) - )) - test_subset_indices = np.linspace(0, num_test_instances, num_val_instance, endpoint=False, dtype=np.int) - return test_indices[test_subset_indices], test_subset_indices def _predict(self, pipeline: 
BaseEstimator, - train_indices: Union[np.ndarray, List], test_indices: Union[np.ndarray, List], + train_indices: Union[np.ndarray, List], ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: - - if self.min_num_test_instances is not None: - test_indices_subset, test_split_subset_idx = self.create_validation_sub_set(test_indices) - else: - test_indices_subset = test_indices - test_split_subset_idx = None + test_indices_subset, test_split_subset_idx = self.create_validation_sub_set(test_indices) val_sets = [] diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index 3ce1aed9c..a7bb9af86 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -1,5 +1,5 @@ from multiprocessing.queues import Queue -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Type, Union from ConfigSpace.configuration_space import Configuration @@ -425,8 +425,8 @@ def eval_train_function( all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, instance: str = None, - evaluator_class: Optional[AbstractEvaluator] = None, - **evaluator_kwargs, + evaluator_class: Type[TrainEvaluator] = TrainEvaluator, + **evaluator_kwargs: Any, ) -> None: """ This closure allows the communication between the ExecuteTaFuncWithQueue and the @@ -489,11 +489,11 @@ def eval_train_function( with a single instance, being the provided X_train, y_train of a single dataset. This instance is a compatibility argument for SMAC, that is capable of working with multiple datasets at the same time. - evaluator_class (Optional[AbstractEvaluator]): + evaluator_class (Type[AbstractEvaluator]): the class name of evaluator, when not specified, it is set as vanilla TrainEvaluator + evaluator_kwargs: Any + additionally evaluation kwargs """ - if evaluator_class is None: - evaluator_class = TrainEvaluator evaluator = evaluator_class( backend=backend, queue=queue, @@ -512,6 +512,6 @@ def eval_train_function( all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, search_space_updates=search_space_updates, - **evaluator_kwargs + **evaluator_kwargs # type: ignore ) evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 0070daa17..174dd19c4 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -21,6 +21,7 @@ from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE +from autoPyTorch.datasets.base_dataset import BaseDataset from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, DEFAULT_RESAMPLING_PARAMETERS, @@ -71,7 +72,7 @@ def get_smac_object( """ if initial_budget == max_budget: intensifier = Intensifier - intensifier_kwargs = {'deterministic': True, } + intensifier_kwargs: Dict[str, Any] = {'deterministic': True, } rh2EPM = RunHistory2EPM4LogScaledCost else: @@ -126,8 +127,8 @@ def __init__(self, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, portfolio_selection: Optional[str] = None, pynisher_context: str = 'spawn', - min_budget: int = 5, - max_budget: int = 50, + min_budget: Union[int, float] = 5, + max_budget: Union[int, float] = 50, time_series_forecasting: bool = False, **kwargs: Dict[str, Any] ): @@ -207,8 +208,14 @@ def __init__(self, time_series_forecasting (bool): If we want to apply this optimizer 
to optimize time series prediction tasks (which has a different tae) - kwargs (Dict): - Additional Arguments for forecasting intialization tasks + kwargs (Any): + Additional Arguments for forecasting tasks. It includes: + min_num_test_instances (int): minimal number of instances used to initialize a proxy validation set + suggested_init_models (List[str]): A set of initial models suggested by the users. + Their hyperparameters are still determined by the default configurations + custom_init_setting_path (str): the path to the initial hyperparameter configurations set by the + users + """ super(AutoMLSMBO, self).__init__() # data related @@ -267,12 +274,14 @@ def __init__(self, initial_configurations = [] if self.time_series_forecasting: - suggested_init_models: Optional[List[str]] = kwargs.get('suggested_init_models', None) - custom_init_setting_path: Optional[str] = kwargs.get('custom_init_setting_path', None) + suggested_init_models: Optional[List[str]] = kwargs.get('suggested_init_models', # type:ignore[assignment] + None) + custom_init_setting_path: Optional[str] = kwargs.get('custom_init_setting_path', # type:ignore[assignment] + None) # if suggested_init_models is an empty list, and custom_init_setting_path is not provided, we # do not provide any initial configurations if suggested_init_models is None or suggested_init_models or custom_init_setting_path is not None: - datamanager = self.backend.load_datamanager() + datamanager: BaseDataset = self.backend.load_datamanager() dataset_properties = datamanager.get_dataset_properties([]) initial_configurations = read_forecasting_init_configurations( config_space=config_space, @@ -281,7 +290,8 @@ def __init__(self, dataset_properties=dataset_properties ) # proxy-validation sets - self.min_num_test_instances = kwargs.get('min_num_test_instances', None) + self.min_num_test_instances: Optional[int] = kwargs.get('min_num_test_instances', # type:ignore[assignment] + None) else: if portfolio_selection is not None: initial_configurations = read_return_initial_configurations(config_space=config_space, @@ -391,7 +401,7 @@ def run_smbo(self, func: Optional[Callable] = None budget_type = self.pipeline_config['budget_type'] if budget_type in FORECASTING_BUDGET_TYPE: if self.min_budget > 1. 
or self.max_budget > 1.: - self.min_budget = self.min_budget / self.max_budget + self.min_budget = float(self.min_budget) / float(self.max_budget) self.max_budget = 1.0 if self.time_series_forecasting: diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py index d68a1cb85..b052ce5fc 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py @@ -22,7 +22,7 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), FitRequirement('X_train', (pd.DataFrame, ), user_defined=True, dataset_property=False), - FitRequirement('feature_names', (Tuple,), user_defined=True, dataset_property=True), + FitRequirement('feature_names', (tuple,), user_defined=True, dataset_property=True), FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), ]) diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py index c6b9c50a8..b15ec761a 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py @@ -94,16 +94,16 @@ def get_available_components( if entry == ForecastingLossChoices or hasattr(entry, 'get_components'): continue - task_type = str(dataset_properties['task_type']) + task_type_name = str(dataset_properties['task_type']) properties = entry.get_properties() - if 'tabular' in task_type and not bool(properties['handles_tabular']): + if 'tabular' in task_type_name and not bool(properties['handles_tabular']): continue - elif 'image' in task_type and not bool(properties['handles_image']): + elif 'image' in task_type_name and not bool(properties['handles_image']): continue - elif 'time_series' in task_type and not bool(properties['handles_time_series']): + elif 'time_series' in task_type_name and not bool(properties['handles_time_series']): continue - task_type = STRING_TO_TASK_TYPES[task_type] + task_type = STRING_TO_TASK_TYPES[task_type_name] if task_type in CLASSIFICATION_TASKS and not bool(properties['handles_classification']): continue @@ -190,4 +190,4 @@ def get_hyperparameter_search_space( def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) + return self.choice.transform(X) # ignore[no-any-return] diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 9d0c8c1f1..b51cdcd0f 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -1,6 +1,6 @@ import warnings from abc import abstractmethod -from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union +from typing import Any, Dict, List, Optional, Tuple, Union import torch from torch import nn @@ -23,7 +23,7 @@ from autoPyTorch.pipeline.components.setup.network_embedding.NoEmbedding 
import \ _NoEmbedding -ALL_NET_OUTPUT = TypeVar('ALL_NET_OUTPUT', torch.Tensor, List[torch.Tensor], torch.distributions.Distribution) +ALL_NET_OUTPUT = Union[torch.Tensor, List[torch.Tensor], torch.distributions.Distribution] class TransformedDistribution_(TransformedDistribution): @@ -423,7 +423,7 @@ def pre_processing(self, dtype=past_targets.dtype, device=self.device) else: - feat_dict_past = None + feat_dict_past = None # type: ignore[assignment] if length_future > 0: if future_features is not None: future_features = self.decoder_embedding(future_features.to(self.device)) @@ -434,8 +434,6 @@ def pre_processing(self, length_future, 1), dtype=past_targets.dtype, device=self.device) - else: - feat_dict_future = {} if future_features is not None: for feature_name in self.variable_selector.known_future_features: tensor_idx = self.variable_selector.future_feature_name2tensor_idx[feature_name] @@ -448,7 +446,7 @@ def pre_processing(self, feat_dict_static[feature_name] = static_feature else: - feat_dict_future = None + feat_dict_future = None # type: ignore[assignment] x_past, x_future, x_static, static_context_initial_hidden = self.variable_selector( x_past=feat_dict_past, @@ -651,7 +649,7 @@ def forward(self, encoder2decoder, encoder_output = self.encoder(encoder_input=x_past, additional_input=encoder_additional) if self.has_temporal_fusion: - decoder_output_all = None + decoder_output_all: Optional[torch.Tensor] = None if self.forecast_strategy != 'sample': all_predictions = [] diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index eaa90f065..586a479e7 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -49,7 +49,7 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement("feature_names", (Iterable,), user_defined=False, dataset_property=True), FitRequirement("feature_shapes", (Iterable,), user_defined=False, dataset_property=True), FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), - FitRequirement('static_features', (Tuple,), user_defined=True, dataset_property=True), + FitRequirement('static_features', (tuple,), user_defined=True, dataset_property=True), FitRequirement('time_feature_names', (Iterable,), user_defined=True, dataset_property=True), ] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py index 07c4b6382..67e877960 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/__init__.py @@ -195,4 +195,4 @@ def _defaults_network(self) -> List[str]: def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) + return self.choice.transform(X) # type: ignore[no-any-return] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index 49a1a8956..b6e9f85c2 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -10,6 +10,8 @@ from 
autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import \ + AbstractForecastingEncoderChoice from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ flat_encoder import FlatForecastingEncoderChoice from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ @@ -51,13 +53,13 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: """ return self.default_components # type: ignore[return-value] - def get_available_components( + def get_available_components( # type: ignore[override] self, dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, include: List[str] = None, exclude: List[str] = None, - components: Optional[Dict[str, autoPyTorchComponent]] = None - ) -> Dict[str, autoPyTorchComponent]: + components: Optional[Dict[str, AbstractForecastingEncoderChoice]] = None + ) -> Dict[str, AbstractForecastingEncoderChoice]: """Filters out components based on user provided include/exclude directives, as well as the dataset properties @@ -220,7 +222,8 @@ def get_hyperparameter_search_space( dataset_properties=dataset_properties, # type: ignore include=include_encoder, exclude=exclude_encoder, - **updates) + **updates # type: ignore[call-args] + ) parent_hyperparameter = {'parent': hp_encoder, 'value': name} cs.add_configuration_space( name, @@ -259,7 +262,7 @@ def set_hyperparameters(self, self.new_params = new_params sub_configuration_space = choice_component.get_hyperparameter_search_space( self.dataset_properties, - **updates + **updates # type: ignore[call-args] ) sub_configuration = Configuration(sub_configuration_space, @@ -305,4 +308,4 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchComponent: def transform(self, X: Dict) -> Dict: assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) + return self.choice.transform(X) # type: ignore[no-any-return] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 7f382aa27..26b8eca30 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -421,8 +421,9 @@ def forward(self, ) static_context_variable_selection = self.static_context_variable_selection(static_embedding)[:, None] - static_context_initial_hidden = tuple(init_hidden(static_embedding) for init_hidden in - self.static_context_initial_hidden) + static_context_initial_hidden: Optional[Tuple[torch.Tensor, ...]] = tuple( + init_hidden(static_embedding) for init_hidden in self.static_context_initial_hidden + ) if cache_static_contex: self.cached_static_contex = static_context_variable_selection self.cached_static_embedding = static_embedding diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index 49de1c050..472093576 100644 --- 
a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -10,7 +10,7 @@ from torch import nn from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ base_forecasting_decoder import BaseForecastingDecoder from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import \ DecoderNetwork @@ -79,11 +79,9 @@ def _build_decoder(self, local_layers.append(_activations[self.config["activation"]]()) local_layers.append(nn.Unflatten(-1, (n_prediction_heads, self.config['units_local_layer']))) num_decoder_output_features = self.config['units_local_layer'] + future_variable_input[-1] - else: - local_layers = None return MLPDecoderModule(global_layers=nn.Sequential(*global_layers), - local_layers=nn.Sequential(*local_layers) if local_layers is not None else None, + local_layers=nn.Sequential(*local_layers) if has_local_layer else None, auto_regressive=self.auto_regressive), num_decoder_output_features @staticmethod @@ -214,7 +212,9 @@ def get_hyperparameter_search_space( cond_units_local_layer = EqualsCondition(units_local_layer, has_local_layer, True) if can_be_auto_regressive: - auto_regressive = get_hyperparameter(auto_regressive, CategoricalHyperparameter) + auto_regressive: CategoricalHyperparameter = get_hyperparameter( # type:ignore[no-redef] + auto_regressive, CategoricalHyperparameter + ) cs.add_hyperparameters([auto_regressive]) if False in auto_regressive.choices: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 7db3eb245..35c181193 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -62,8 +62,8 @@ def __init__(self, self.backbone = nn.Sequential(*self.build_backbone()) - self.backcast_head = None - self.forecast_head = None + self.backcast_head: Optional[nn.Module] = None + self.forecast_head: Optional[nn.Module] = None def build_backbone(self) -> List[nn.Module]: layers: List[nn.Module] = list() diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 02c0038a6..9c327a5e3 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -38,7 +38,7 @@ def __init__(self, @property def _required_fit_requirements(self) -> List[FitRequirement]: return [ - FitRequirement('known_future_features', (Tuple,), user_defined=False, dataset_property=True), + FitRequirement('known_future_features', (tuple,), user_defined=False, dataset_property=True), 
FitRequirement('feature_shapes', (Dict,), user_defined=False, dataset_property=True), FitRequirement('network_encoder', (OrderedDict,), user_defined=False, dataset_property=False), FitRequirement('n_prediction_steps', (int,), user_defined=False, dataset_property=True), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index cc43c73e9..8f41c541b 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -45,11 +45,11 @@ def __init__(self, **kwargs: Any, ): super().__init__(**kwargs) - self.pipeline = None + self.pipeline: Optional[Pipeline] = None self.decoder_choice: Optional[List[BaseForecastingDecoder]] = None @abstractmethod - def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: + def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: # type: ignore[override] """Returns the available backbone components Args: @@ -72,7 +72,7 @@ def additional_components(self) -> List[Callable]: # This function is deigned to add additional components rather than the components in __choice__ return [self.get_decoder_components] - def get_available_components( + def get_available_components( # type: ignore[override] self, dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, include: List[str] = None, @@ -214,7 +214,7 @@ def get_hyperparameter_search_space( encoder2decoder: Dict[str, List[str]] = {} for encoder_name in hp_encoder.choices: updates = self._get_search_space_updates(prefix=encoder_name) - config_space = available_encoders[encoder_name].get_hyperparameter_search_space( + config_space = available_encoders[encoder_name].get_hyperparameter_search_space( # type: ignore[call-args] dataset_properties, **updates) parent_hyperparameter = {'parent': hp_encoder, 'value': encoder_name} @@ -242,7 +242,7 @@ def get_hyperparameter_search_space( if not decoder2encoder[decoder_name]: continue updates = self._get_search_space_updates(prefix=decoder_name) - config_space = available_decoders[decoder_name].get_hyperparameter_search_space( + config_space = available_decoders[decoder_name].get_hyperparameter_search_space( # type: ignore[call-args] dataset_properties, **updates ) @@ -327,7 +327,7 @@ def set_hyperparameters(self, decoder_components = self.get_decoder_components() - decoder_type = None + decoder_type: Optional[str] = None decoder_params = {} decoder_params_names = [] @@ -344,6 +344,7 @@ def set_hyperparameters(self, decoder_params_names.append(param) param = param.replace(decoder_type + ':', '') decoder_params[param] = value + assert decoder_type is not None, 'Decoder must be given to initialize a network backbone' for param_name in decoder_params_names: del new_params[param_name] @@ -364,7 +365,7 @@ def set_hyperparameters(self, def _defaults_network(self) -> List[str]: return ['MLPEncoder', 'RNNEncoder', 'NBEATSEncoder'] - def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchComponent: + def fit(self, X: Dict[str, Any], y: Any = None) -> Pipeline: # type: ignore[override] """Handy method to check if a component is fitted Args: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py index eff88d11c..70471b12c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py @@ -66,7 +66,7 @@ def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: return x -class MLPEncoder(BaseForecastingEncoder, MLPBackbone): +class MLPEncoder(BaseForecastingEncoder, MLPBackbone): # type: ignore[misc] _fixed_seq_length = True window_size = 1 @@ -100,7 +100,7 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: def n_encoder_output_feature(self) -> int: # This function should never be called!! - return self.config["num_units_%d" % (self.config['num_groups'])] + return self.config["num_units_%d" % (self.config['num_groups'])] # type: int def _add_layer(self, layers: List[nn.Module], in_features: int, out_features: int, layer_id: int) -> None: @@ -134,7 +134,7 @@ def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesT } @staticmethod - def get_hyperparameter_search_space( + def get_hyperparameter_search_space( # type: ignore[override] dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_groups: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_groups", value_range=(1, 5), diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py index cbcbbc39c..b6abe9ffe 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/NBEATSEncoder.py @@ -51,14 +51,15 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: input_shape=output_shape, ) - self.input_shape = [self.window_size, output_shape[-1]] + input_shape = (self.window_size, output_shape[-1]) + self.input_shape = input_shape has_hidden_states = self.encoder_properties().has_hidden_states - self.encoder_output_shape = get_output_shape(self.encoder, self.input_shape, has_hidden_states) + self.encoder_output_shape = get_output_shape(self.encoder, input_shape, has_hidden_states) return self - def n_encoder_output_feature(self) -> None: - # THIS function should never be called!!! + def n_encoder_output_feature(self) -> int: + # This function should never be called!!! 
raise NotImplementedError def build_encoder(self, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py index 808209836..79b1e5cc5 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/__init__.py @@ -25,7 +25,7 @@ def add_encoder(encoder: BaseForecastingEncoder) -> None: class FlatForecastingEncoderChoice(AbstractForecastingEncoderChoice): - def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: + def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: # type: ignore[override] """Returns the available backbone components Args: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py index 7ad06b515..ca1bad4e4 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py @@ -147,7 +147,7 @@ def build_encoder(self, input_shape: Tuple[int, ...] = (0,)) -> nn.Module: def n_encoder_output_feature(self) -> int: # see _InceptionBlock.forward() - return self.config['num_filters'] * 4 + return self.config['num_filters'] * 4 # type: int @staticmethod def allowed_decoders() -> List[str]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py index cd7a914a7..3a6e678d8 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace @@ -9,6 +9,7 @@ import torch from torch import nn +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ base_forecasting_encoder import BaseForecastingEncoder @@ -95,7 +96,10 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: return encoder def n_encoder_output_feature(self) -> int: - return 2 * self.config['hidden_size'] if self.config['bidirectional'] else self.config['hidden_size'] + if self.config['bidirectional']: + return 2 * self.config['hidden_size'] # type: int + else: + return self.config['hidden_size'] # type: int def n_hidden_states(self) -> int: if self.config['cell_type'] == 'lstm': @@ -131,7 +135,8 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: return super().transform(X) 
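# Illustrative note on the width computation above (sketch only, the exact keys are the
# RNNEncoder config entries used in this hunk): a bidirectional RNN concatenates the
# forward and backward hidden states, so the last dimension of its output is
# 2 * hidden_size, e.g.
#   rnn = torch.nn.LSTM(input_size=8, hidden_size=16, bidirectional=True, batch_first=True)
#   out, _ = rnn(torch.zeros(4, 10, 8))   # out.shape == (4, 10, 32)
# which is why n_encoder_output_feature returns 2 * hidden_size in the bidirectional case
# and hidden_size otherwise.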
@staticmethod - def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Union[str, bool]]: return { 'shortname': 'RNNEncoder', 'name': 'RNNEncoder', diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py index b97970d49..5ec427568 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace @@ -142,7 +142,7 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: def n_encoder_output_feature(self) -> int: num_blocks = self.config["num_blocks"] - return self.config[f"num_filters_{num_blocks}"] + return self.config[f"num_filters_{num_blocks}"] # type: int @staticmethod def allowed_decoders() -> List[str]: @@ -152,8 +152,8 @@ def allowed_decoders() -> List[str]: return ['MLPDecoder'] @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Any]: + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Union[str, bool]]: return { "shortname": "TCNBackbone", "name": "TCNBackbone", diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index c8348bea6..0641056d4 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import ConfigSpace as CS from ConfigSpace.configuration_space import ConfigurationSpace @@ -9,6 +9,7 @@ import torch from torch import nn +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( PositionalEncoding, build_transformer_layers) @@ -100,7 +101,7 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: return encoder def n_encoder_output_feature(self) -> int: - return 2 ** self.config['d_model_log'] + return 2 ** self.config['d_model_log'] # type: int @staticmethod def allowed_decoders() -> List[str]: @@ -125,7 +126,8 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: return super().transform(X) @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> 
Dict[str, Union[str, bool]]: return { 'shortname': 'TransformerEncoder', 'name': 'TransformerEncoder', diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index 3c1abf8c9..20319b2bf 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -31,13 +31,13 @@ ThirdPartyComponents, autoPyTorchComponent, find_components) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import \ ForecastingNetworkStructure -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. \ base_forecasting_decoder import BaseForecastingDecoder from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder import \ AbstractForecastingEncoderChoice -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder. \ base_forecasting_encoder import BaseForecastingEncoder -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.other_components.\ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.other_components. \ TemporalFusion import TemporalFusion from autoPyTorch.utils.common import ( HyperparameterSearchSpace, @@ -60,7 +60,7 @@ class SeqForecastingEncoderChoice(AbstractForecastingEncoderChoice): deepAR_decoder_prefix = 'block_1' tf_prefix = "temporal_fusion" - def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: + def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: # type: ignore[override] """Returns the available backbone components Args: @@ -75,7 +75,7 @@ def get_components(self) -> Dict[str, Type[autoPyTorchComponent]]: components.update(_addons.components) return components - def get_hyperparameter_search_space( + def get_hyperparameter_search_space( # type: ignore[override] self, dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", @@ -171,12 +171,16 @@ def get_hyperparameter_search_space( cs = ConfigurationSpace() - min_num_blocks, max_num_blocks = num_blocks.value_range + min_num_blocks: int = num_blocks.value_range[0] + max_num_blocks: int = num_blocks.value_range[1] - variable_selection = get_hyperparameter(variable_selection, CategoricalHyperparameter) + variable_selection_hp: CategoricalHyperparameter = get_hyperparameter( # type: ignore[assignment] + variable_selection, CategoricalHyperparameter) share_single_variable_networks = get_hyperparameter(share_single_variable_networks, CategoricalHyperparameter) - decoder_auto_regressive = get_hyperparameter(decoder_auto_regressive, CategoricalHyperparameter) + decoder_auto_regressive_hp: CategoricalHyperparameter = get_hyperparameter( # type: ignore[assignment] + decoder_auto_regressive, CategoricalHyperparameter + ) if min_num_blocks == max_num_blocks: num_blocks = 
Constant(num_blocks.hyperparameter, num_blocks.value_range[0]) @@ -186,55 +190,62 @@ def get_hyperparameter_search_space( sequence=list(range(min_num_blocks, max_num_blocks + 1)) ) - skip_connection = get_hyperparameter(skip_connection, CategoricalHyperparameter) + skip_connection_hp: CategoricalHyperparameter = get_hyperparameter(skip_connection, # type: ignore[assignment] + CategoricalHyperparameter) - hp_network_structures = [num_blocks, decoder_auto_regressive, variable_selection, - skip_connection] + hp_network_structures = [num_blocks, decoder_auto_regressive_hp, variable_selection_hp, + skip_connection_hp] cond_skip_connections = [] - if True in skip_connection.choices: - skip_connection_type = get_hyperparameter(skip_connection_type, CategoricalHyperparameter) - hp_network_structures.append(skip_connection_type) - cond_skip_connections.append(EqualsCondition(skip_connection_type, skip_connection, True)) - if 'gate_add_norm' in skip_connection_type.choices: - grn_use_dropout = get_hyperparameter(grn_use_dropout, CategoricalHyperparameter) - hp_network_structures.append(grn_use_dropout) - if True in variable_selection.choices: + if True in skip_connection_hp.choices: + skip_connection_type_hp: CategoricalHyperparameter = get_hyperparameter( # type: ignore[assignment] + skip_connection_type, CategoricalHyperparameter + ) + hp_network_structures.append(skip_connection_type_hp) + cond_skip_connections.append(EqualsCondition(skip_connection_type_hp, skip_connection_hp, True)) + if 'gate_add_norm' in skip_connection_type_hp.choices: + grn_use_dropout_hp: CategoricalHyperparameter = get_hyperparameter( # type: ignore[assignment] + grn_use_dropout, CategoricalHyperparameter + ) + hp_network_structures.append(grn_use_dropout_hp) + if True in variable_selection_hp.choices: cond_skip_connections.append( - EqualsCondition(grn_use_dropout, skip_connection_type, "gate_add_norm") + EqualsCondition(grn_use_dropout_hp, skip_connection_type_hp, "gate_add_norm") ) else: cond_skip_connections.append( - EqualsCondition(grn_use_dropout, skip_connection_type, "gate_add_norm")) - if True in grn_use_dropout.choices: - grn_dropout_rate = get_hyperparameter(grn_dropout_rate, UniformFloatHyperparameter) - hp_network_structures.append(grn_dropout_rate) - cond_skip_connections.append(EqualsCondition(grn_dropout_rate, grn_use_dropout, True)) + EqualsCondition(grn_use_dropout_hp, skip_connection_type_hp, "gate_add_norm")) + if True in grn_use_dropout_hp.choices: + grn_dropout_rate_hp = get_hyperparameter(grn_dropout_rate, UniformFloatHyperparameter) + hp_network_structures.append(grn_dropout_rate_hp) + cond_skip_connections.append(EqualsCondition(grn_dropout_rate_hp, grn_use_dropout_hp, True)) cs.add_hyperparameters(hp_network_structures) if cond_skip_connections: cs.add_conditions(cond_skip_connections) - if True in variable_selection.choices: - variable_selection_use_dropout = get_hyperparameter(variable_selection_use_dropout, - CategoricalHyperparameter) - variable_selection_dropout_rate = get_hyperparameter(variable_selection_dropout_rate, - UniformFloatHyperparameter) - cs.add_hyperparameters([variable_selection_use_dropout, variable_selection_dropout_rate]) - - cond_vs_dropout = EqualsCondition(variable_selection_use_dropout, variable_selection, True) - cond_vs_dropoutrate = EqualsCondition(variable_selection_dropout_rate, variable_selection_use_dropout, True) + if True in variable_selection_hp.choices: + variable_selection_use_dropout_hp = get_hyperparameter(variable_selection_use_dropout, + 
CategoricalHyperparameter) + variable_selection_dropout_rate_hp = get_hyperparameter(variable_selection_dropout_rate, + UniformFloatHyperparameter) + cs.add_hyperparameters([variable_selection_use_dropout_hp, variable_selection_dropout_rate_hp]) + + cond_vs_dropout = EqualsCondition(variable_selection_use_dropout_hp, variable_selection_hp, True) + cond_vs_dropoutrate = EqualsCondition(variable_selection_dropout_rate_hp, + variable_selection_use_dropout_hp, + True) cs.add_conditions([cond_vs_dropout, cond_vs_dropoutrate]) - if True in variable_selection.choices: + if True in variable_selection_hp.choices: cs.add_hyperparameter(share_single_variable_networks) - cs.add_condition(EqualsCondition(share_single_variable_networks, variable_selection, True)) + cs.add_condition(EqualsCondition(share_single_variable_networks, variable_selection_hp, True)) # Compile a list of legal preprocessors for this problem - available_encoders: Dict[str, BaseForecastingEncoder] = self.get_available_components( + available_encoders: Dict[str, BaseForecastingEncoder] = self.get_available_components( # type: ignore[call-arg] dataset_properties=dataset_properties, include=include, exclude=exclude) - available_decoders: Dict[str, BaseForecastingDecoder] = self.get_available_components( + available_decoders: Dict[str, BaseForecastingDecoder] = self.get_available_components( # type: ignore[call-arg] dataset_properties=dataset_properties, include=None, exclude=exclude, components=self.get_decoder_components()) @@ -266,17 +277,18 @@ def get_hyperparameter_search_space( # disable the recurrent decoders without auto-regressive or variable selection # this is judged by add_forbidden_for_non_ar_recurrent_decoder - if True in decoder_auto_regressive.choices: - forbidden_decoder_ar: Optional[ForbiddenEqualsClause] = ForbiddenEqualsClause(decoder_auto_regressive, True) + if True in decoder_auto_regressive_hp.choices: + forbidden_decoder_ar: Optional[ForbiddenEqualsClause] = ForbiddenEqualsClause(decoder_auto_regressive_hp, + True) else: forbidden_decoder_ar = None add_forbidden_for_non_ar_recurrent_decoder = False if static_features_shape + future_feature_shapes[-1] == 0: - if False in decoder_auto_regressive.choices and False in variable_selection.choices: + if False in decoder_auto_regressive_hp.choices and False in variable_selection_hp.choices: add_forbidden_for_non_ar_recurrent_decoder = True - if len(decoder_auto_regressive.choices) == 1 and True in decoder_auto_regressive.choices: + if len(decoder_auto_regressive_hp.choices) == 1 and True in decoder_auto_regressive_hp.choices: conds_decoder_ar: Optional[List[CS.conditions.ConditionComponent]] = None else: conds_decoder_ar = [] @@ -391,15 +403,15 @@ def get_hyperparameter_search_space( # type: ignore **updates) compatible_encoders = decoder2encoder[decoder_name] - encoders_with_multi_decoder = [] - encoder_with_single_decoder = [] + encoders_with_multi_decoder_l = [] + encoder_with_single_decoder_l = [] for encoder in compatible_encoders: if len(encoder2decoder[encoder]) > 1: - encoders_with_multi_decoder.append(encoder) + encoders_with_multi_decoder_l.append(encoder) else: - encoder_with_single_decoder.append(encoder) - encoders_with_multi_decoder = set(encoders_with_multi_decoder) - encoder_with_single_decoder = set(encoder_with_single_decoder) + encoder_with_single_decoder_l.append(encoder) + encoders_with_multi_decoder = set(encoders_with_multi_decoder_l) + encoder_with_single_decoder = set(encoder_with_single_decoder_l) cs.add_configuration_space( block_prefix + 
decoder_name, @@ -448,12 +460,12 @@ def get_hyperparameter_search_space( # add_forbidden_for_non_ar_recurrent_decoder is True:False in decoder_auto_regressive if conds_decoder_ar is not None: conds_decoder_ar.append( - EqualsCondition(decoder_auto_regressive, hp_encoder, encoder) + EqualsCondition(decoder_auto_regressive_hp, hp_encoder, encoder) ) if add_forbidden_for_non_ar_recurrent_decoder: forbiddens_decoder_auto_regressive.append( ForbiddenAndConjunction( - ForbiddenEqualsClause(variable_selection, False), + ForbiddenEqualsClause(variable_selection_hp, False), ForbiddenEqualsClause(hp_encoder, encoder) ) ) @@ -462,8 +474,8 @@ def get_hyperparameter_search_space( forbiddens_decoder_auto_regressive.append( ForbiddenAndConjunction( ForbiddenAndConjunction( - ForbiddenEqualsClause(variable_selection, False), - ForbiddenEqualsClause(decoder_auto_regressive, False) + ForbiddenEqualsClause(variable_selection_hp, False), + ForbiddenEqualsClause(decoder_auto_regressive_hp, False) ), ForbiddenEqualsClause(hp_encoder, encoder) ) @@ -489,8 +501,8 @@ def get_hyperparameter_search_space( forbiddens_decoder_auto_regressive.append( ForbiddenAndConjunction( ForbiddenAndConjunction( - ForbiddenEqualsClause(variable_selection, False), - ForbiddenEqualsClause(decoder_auto_regressive, False) + ForbiddenEqualsClause(variable_selection_hp, False), + ForbiddenEqualsClause(decoder_auto_regressive_hp, False) ), ForbiddenEqualsClause(hp_decoder_type, decoder) ) @@ -501,13 +513,13 @@ def get_hyperparameter_search_space( if conds_decoder_ar: cs.add_condition(OrConjunction(*conds_decoder_ar)) - use_temporal_fusion = get_hyperparameter(use_temporal_fusion, CategoricalHyperparameter) - cs.add_hyperparameter(use_temporal_fusion) - if True in use_temporal_fusion.choices: + use_temporal_fusion_hp = get_hyperparameter(use_temporal_fusion, CategoricalHyperparameter) + cs.add_hyperparameter(use_temporal_fusion_hp) + if True in use_temporal_fusion_hp.choices: update = self._get_search_space_updates(prefix=self.tf_prefix) cs_tf = TemporalFusion.get_hyperparameter_search_space(dataset_properties, **update) - parent_hyperparameter = {'parent': use_temporal_fusion, 'value': True} + parent_hyperparameter = {'parent': use_temporal_fusion_hp, 'value': True} cs.add_configuration_space( self.tf_prefix, cs_tf, @@ -551,16 +563,16 @@ def get_hyperparameter_search_space( forbidden_deep_ars = [] - hps_forbidden_deep_ar = [use_temporal_fusion] + hps_forbidden_deep_ar = [use_temporal_fusion_hp] for hp_forbidden_deep_ar in hps_forbidden_deep_ar: if True in hp_forbidden_deep_ar.choices: forbidden_deep_ars.append(ForbiddenAndConjunction( ForbiddenEqualsClause(hp_forbidden_deep_ar, True), forbidden_deep_ar )) - if True in skip_connection.choices: + if True in skip_connection_hp.choices: forbidden_deep_ars.append(ForbiddenAndConjunction( - ForbiddenEqualsClause(skip_connection, True), + ForbiddenEqualsClause(skip_connection_hp, True), forbidden_deep_ar )) if forbidden_deep_ars: @@ -576,26 +588,25 @@ def get_hyperparameter_search_space( ForbiddenEqualsClause(hp_mlp_has_local_layer, False), ForbiddenInClause(num_blocks, list(range(i + 1, max_num_blocks + 1))), )) - c1 = isinstance(skip_connection, CategoricalHyperparameter) and True in skip_connection.choices - c2 = isinstance(skip_connection, Constant) and skip_connection.value + c1 = isinstance(skip_connection_hp, CategoricalHyperparameter) and True in skip_connection_hp.choices + c2 = isinstance(skip_connection_hp, Constant) and skip_connection_hp.value if c1 or c2: - if True in 
skip_connection.choices: + if True in skip_connection_hp.choices: forbidden_mlp_local_layer.append(ForbiddenAndConjunction( ForbiddenEqualsClause(hp_mlp_has_local_layer, False), - ForbiddenEqualsClause(skip_connection, True), + ForbiddenEqualsClause(skip_connection_hp, True), )) - c1 = isinstance(use_temporal_fusion, CategoricalHyperparameter) and True in use_temporal_fusion.choices - c2 = isinstance(use_temporal_fusion, Constant) and skip_connection.value + c1 = isinstance(use_temporal_fusion_hp, CategoricalHyperparameter) \ + and True in use_temporal_fusion_hp.choices + c2 = isinstance(use_temporal_fusion_hp, Constant) and skip_connection_hp.value if c1 or c2: - if True in use_temporal_fusion.choices: + if True in use_temporal_fusion_hp.choices: forbidden_mlp_local_layer.append(ForbiddenAndConjunction( ForbiddenEqualsClause(hp_mlp_has_local_layer, False), - ForbiddenEqualsClause(use_temporal_fusion, True), + ForbiddenEqualsClause(use_temporal_fusion_hp, True), )) cs.add_forbidden_clauses(forbidden_mlp_local_layer) - cs.get_children_of(decoder_auto_regressive) - return cs @property @@ -636,8 +647,8 @@ def set_hyperparameters(self, use_temporal_fusion = forecasting_structure_kwargs['use_temporal_fusion'] pipeline_steps = [('net_structure', ForecastingNetworkStructure(**forecasting_structure_kwargs))] - self.encoder_choice = [] - self.decoder_choice: List[BaseForecastingEncoder] = [] + self.encoder_choice: Union[List[BaseForecastingEncoder], List[()]] = [] + self.decoder_choice: Union[List[BaseForecastingDecoder], List[()]] = [] decoder_components = self.get_decoder_components() @@ -658,7 +669,7 @@ def set_hyperparameters(self, param = param.replace(block_prefix + choice + ':', '') new_params[param] = value - decoder_type = None + decoder_type: Optional[str] = None decoder_params = {} decoder_params_names = [] @@ -666,15 +677,16 @@ def set_hyperparameters(self, if decoder_type is None: for decoder_component in decoder_components.keys(): if param.startswith(block_prefix + decoder_component): - decoder_type = decoder_component + decoder_type: str = decoder_component # type:ignore[no-redef] decoder_params_names.append(param) - param = param.replace(block_prefix + decoder_type + ':', '') + param = param.replace(block_prefix + decoder_type + ':', '') # type:ignore[operator] decoder_params[param] = value else: if param.startswith(block_prefix + decoder_type): decoder_params_names.append(param) param = param.replace(block_prefix + decoder_type + ':', '') decoder_params[param] = value + assert decoder_type is not None, 'Decoder must be given to initialize a forecasting backbone!' 
for param_name in decoder_params_names: del new_params[param_name] diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py b/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py index 756e9e961..452e74cc1 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py @@ -205,4 +205,4 @@ def get_hyperparameter_search_space( def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) + return self.choice.transform(X) # type: ignore[no-any-return] diff --git a/autoPyTorch/pipeline/components/setup/network_head/__init__.py b/autoPyTorch/pipeline/components/setup/network_head/__init__.py index 67bbd8019..f2c15aa2d 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_head/__init__.py @@ -191,4 +191,4 @@ def get_hyperparameter_search_space( def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) + return self.choice.transform(X) # type: ignore[no-any-return] diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py index 4e0b61455..e9734ab8c 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py @@ -118,7 +118,7 @@ def domain_map(self, loc: torch.Tensor, scale: torch.Tensor) -> Tuple[torch.Tens @property def dist_cls(self) -> Type[Distribution]: - return Normal + return Normal # type: ignore[no-any-return] class StudentTOutput(ProjectionLayer): @@ -135,7 +135,7 @@ def domain_map( # type: ignore[override] @property def dist_cls(self) -> Type[Distribution]: - return StudentT + return StudentT # type: ignore[no-any-return] class BetaOutput(ProjectionLayer): @@ -157,7 +157,7 @@ def domain_map( # type: ignore[override] @property def dist_cls(self) -> Type[Distribution]: # TODO consider constraints on Beta!!! 
- return Beta + return Beta # type: ignore[no-any-return] class GammaOutput(ProjectionLayer): @@ -178,7 +178,7 @@ def domain_map( # type: ignore[override] @property def dist_cls(self) -> Type[Distribution]: - return Gamma + return Gamma # type: ignore[no-any-return] class PoissonOutput(ProjectionLayer): @@ -192,7 +192,7 @@ def domain_map(self, rate: torch.Tensor) -> Tuple[torch.Tensor]: # type: ignore @property def dist_cls(self) -> Type[Distribution]: - return Poisson + return Poisson # type: ignore[no-any-return] ALL_DISTRIBUTIONS = { diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index f1601af85..214d47d85 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -209,7 +209,8 @@ def build_head(self, # type: ignore[override] nn.Sequential( nn.Linear(head_n_in_features, n_prediction_heads * np.product(output_shape[1:])), nn.Unflatten(-1, (n_prediction_heads, *output_shape[1:])), - ) for _ in range(num_quantiles)] + ) for _ in range(num_quantiles) + ] proj_layer = QuantileHead(proj_layer) return proj_layer else: diff --git a/autoPyTorch/pipeline/components/setup/optimizer/__init__.py b/autoPyTorch/pipeline/components/setup/optimizer/__init__.py index f7254f6dc..f89b80849 100644 --- a/autoPyTorch/pipeline/components/setup/optimizer/__init__.py +++ b/autoPyTorch/pipeline/components/setup/optimizer/__init__.py @@ -177,4 +177,4 @@ def get_hyperparameter_search_space( def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) + return self.choice.transform(X) # type: ignore[no-any-return] diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 5ad9fad36..cd90c7754 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -92,22 +92,22 @@ def __init__(self, self.num_batches_per_epoch = num_batches_per_epoch if num_batches_per_epoch is not None else np.inf self.padding_collector: Optional[Callable] = None - self.known_future_features_index = None + self.known_future_features_index: Union[Tuple[int], Tuple[()]] = tuple() self._is_uni_variant = False self.transform_time_features = transform_time_features self.freq = "1Y" self.time_feature_transform: List[TimeFeature] = [] - self.dataset_columns: List[Union[int, str]] = [] + self.dataset_columns: Union[Tuple[Union[int, str]], Tuple[()]] = tuple() self.sampler_train: Optional[Union[Iterator, torch.utils.data.sampler.Sampler]] = None # Applied for get loader self.feature_preprocessor: Optional[ColumnTransformer] = None self.add_fit_requirements( - [FitRequirement("known_future_features", (Tuple,), user_defined=True, dataset_property=True), + [FitRequirement("known_future_features", (tuple,), user_defined=True, dataset_property=True), FitRequirement("feature_shapes", (Dict,), user_defined=True, dataset_property=True), - FitRequirement("feature_names", (Tuple,), user_defined=True, dataset_property=True), + 
FitRequirement("feature_names", (tuple,), user_defined=True, dataset_property=True), FitRequirement("sequence_lengths_train", (List,), user_defined=True, dataset_property=True), FitRequirement("freq", (str,), user_defined=True, dataset_property=True), FitRequirement("n_prediction_steps", (int,), user_defined=True, dataset_property=True)]) @@ -376,7 +376,6 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd sequence_lengths[seq_idx] = len(x_seq.X) series_number = np.arange(len(sequence_lengths)).repeat(sequence_lengths) - assert self.known_future_features_index is not None if len(self.known_future_features_index) > 0: sequence_lengths_test = [0] * num_sequences for seq_idx, x_seq in enumerate(X): @@ -415,9 +414,9 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd x_all_test = pd.DataFrame(np.concatenate([x_seq.X_test for x_seq in X])) x_all_test.index = series_number_test - x_all = x_all.groupby(x_all.index) + x_all_grouped = x_all.groupby(x_all.index) if len(self.known_future_features_index) > 0: - x_all_test = x_all_test.groupby(x_all_test.index) + x_all_test_grouped = x_all_test.groupby(x_all_test.index) for i, x_seq in enumerate(X): if not isinstance(x_seq, TimeSeriesSequence): @@ -429,10 +428,10 @@ def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.nd x_seq._cached_time_features = None if self.dataset_small_preprocess and not self._is_uni_variant: - x_seq.X = x_all.get_group(i).transform(np.array).values + x_seq.X = x_all_grouped.get_group(i).transform(np.array).values update_dict: Dict[str, Any] = {"known_future_features_index": self.known_future_features_index} if len(self.known_future_features_index) > 0: - x_seq.X_test = x_all_test.get_group(i).transform(np.array).values + x_seq.X_test = x_all_test_grouped.get_group(i).transform(np.array).values else: update_dict = {} diff --git a/autoPyTorch/pipeline/components/training/metrics/base.py b/autoPyTorch/pipeline/components/training/metrics/base.py index d19d6618f..d58124789 100644 --- a/autoPyTorch/pipeline/components/training/metrics/base.py +++ b/autoPyTorch/pipeline/components/training/metrics/base.py @@ -191,7 +191,7 @@ def __call__( class _ForecastingMetric(ForecastingMetricMixin, autoPyTorchMetric): - def __call__( + def __call__( # type: ignore[override] self, y_true: np.ndarray, y_pred: np.ndarray, @@ -267,9 +267,9 @@ def __call__( losses_all = np.mean(losses_all, -1) if agg == 'mean': - return self._sign * np.mean(losses_all) + return float(self._sign * np.mean(losses_all)) elif agg == 'median': - return self._sign * np.median(losses_all) + return float(self._sign * np.median(losses_all)) else: raise NotImplementedError(f'Unsupported aggregation type {agg}') diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index 4102f949a..4bb8fda5c 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -221,6 +221,8 @@ def prepare_trainer(self, X: Dict) -> None: """ prepare trainer, forecasting tasks require more parameters """ + assert self.choice is not None + # Support additional user metrics metrics = get_metrics(dataset_properties=X['dataset_properties']) if 'additional_metrics' in X: diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index 
cc6f0cf2a..14c0e0a24 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -56,6 +56,7 @@ def get_budget_tracker(self, X: Dict) -> BudgetTracker: ) def prepare_trainer(self, X: Dict) -> None: + assert self.choice is not None # Support additional user metrics metrics = get_metrics(dataset_properties=X['dataset_properties']) if 'additional_metrics' in X: diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 7dbea4dcb..f935e0ed2 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -111,7 +111,7 @@ def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None, calculate_score, get_metrics) metrics = get_metrics(self.dataset_properties, ['mean_MAPE_forecasting']) y_pred = self.predict(X, batch_size=batch_size) - r2 = calculate_score(y, y_pred, task_type=STRING_TO_TASK_TYPES[self.dataset_properties['task_type']], + r2 = calculate_score(y, y_pred, task_type=STRING_TO_TASK_TYPES[str(self.dataset_properties['task_type'])], metrics=metrics, **score_kwargs)['mean_MAPE_forecasting'] return r2 @@ -139,11 +139,10 @@ def _get_hyperparameter_search_space(self, """ cs = ConfigurationSpace() - if dataset_properties is None or not isinstance(dataset_properties, dict): - if not isinstance(dataset_properties, dict): - warnings.warn('The given dataset_properties argument contains an illegal value.' - 'Proceeding with the default value') - dataset_properties = dict() + if not isinstance(dataset_properties, dict): + warnings.warn('The given dataset_properties argument contains an illegal value.' + 'Proceeding with the default value') + dataset_properties = dict() if 'target_type' not in dataset_properties: dataset_properties['target_type'] = 'time_series_forecasting' @@ -336,7 +335,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L """ steps = [] # type: List[Tuple[str, autoPyTorchChoice]] - default_dataset_properties = {'target_type': 'time_series_prediction'} + default_dataset_properties: Dict[str, BaseDatasetPropertiesType] = {'target_type': 'time_series_prediction'} if dataset_properties is not None: default_dataset_properties.update(dataset_properties) diff --git a/test/test_pipeline/test_losses.py b/test/test_pipeline/test_losses.py index 0929dc1fe..d68f030fb 100644 --- a/test/test_pipeline/test_losses.py +++ b/test/test_pipeline/test_losses.py @@ -85,29 +85,29 @@ def test_loss_dict(): @pytest.mark.parametrize('target,expected_weights', [ ( - # Expected 4 classes where first one is majority one - np.array([[1, 0, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]), - # We reduce the contribution of the first class which has double elements - np.array([0.5, 1., 1., 1.]), + # Expected 4 classes where first one is majority one + np.array([[1, 0, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]), + # We reduce the contribution of the first class which has double elements + np.array([0.5, 1., 1., 1.]), ), ( - # Expected 2 classes -- multilable format - np.array([[1, 0], [1, 0], [1, 0], [0, 1]]), - # We reduce the contribution of the first class which 3 to 1 ratio - np.array([2 / 3, 2]), + # Expected 2 classes -- multilable format + np.array([[1, 0], [1, 0], [1, 0], [0, 1]]), + # We reduce the contribution of the first class which 3 to 1 ratio + np.array([2 / 3, 2]), ), ( - # Expected 2 classes -- (-1, 1) 
format - np.array([[1], [1], [1], [0]]), - # We reduce the contribution of the second class, which has a 3 to 1 ratio - np.array([2, 2 / 3]), + # Expected 2 classes -- (-1, 1) format + np.array([[1], [1], [1], [0]]), + # We reduce the contribution of the second class, which has a 3 to 1 ratio + np.array([2, 2 / 3]), ), ( - # Expected 2 classes -- single column - # We have to reduce the contribution of the second class with 5 to 1 ratio - np.array([1, 1, 1, 1, 1, 0]), - # We reduce the contribution of the first class which has double elements - np.array([3, 6 / 10]), + # Expected 2 classes -- single column + # We have to reduce the contribution of the second class with 5 to 1 ratio + np.array([1, 1, 1, 1, 1, 0]), + # We reduce the contribution of the first class which has double elements + np.array([3, 6 / 10]), ), ]) def test_lossweightstrategyweighted(target, expected_weights): @@ -122,23 +122,23 @@ def test_lossweightstrategyweighted(target, expected_weights): @pytest.mark.parametrize('target,expected_weights', [ ( - # Expected 2 classes -- multilable format - np.array([[1, 0], [1, 0], [1, 0], [0, 1]]), - # We reduce the contribution of the first class which 3 to 1 ratio - np.array([1 / 3, 3]), + # Expected 2 classes -- multilable format + np.array([[1, 0], [1, 0], [1, 0], [0, 1]]), + # We reduce the contribution of the first class which 3 to 1 ratio + np.array([1 / 3, 3]), ), ( - # Expected 2 classes -- (-1, 1) format - np.array([[1], [1], [1], [0]]), - # We reduce the contribution of the second class, which has a 3 to 1 ratio - np.array([1 / 3]), + # Expected 2 classes -- (-1, 1) format + np.array([[1], [1], [1], [0]]), + # We reduce the contribution of the second class, which has a 3 to 1 ratio + np.array([1 / 3]), ), ( - # Expected 2 classes -- single column - # We have to reduce the contribution of the second class with 5 to 1 ratio - np.array([1, 1, 1, 1, 1, 0]), - # We reduce the contribution of the first class which has double elements - np.array([0.2]), + # Expected 2 classes -- single column + # We have to reduce the contribution of the second class with 5 to 1 ratio + np.array([1, 1, 1, 1, 1, 0]), + # We reduce the contribution of the first class which has double elements + np.array([0.2]), ), ]) def test_lossweightstrategyweightedbinary(target, expected_weights): From 609ccf1d8f89c03279b5035a87a57484aadfc41d Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 31 May 2022 18:02:30 +0200 Subject: [PATCH 299/347] maint --- .../early_preprocessor/TimeSeriesEarlyPreProcessing.py | 2 +- .../components/setup/network/forecasting_network.py | 2 +- .../forecasting_encoder/seq_encoder/RNNEncoder.py | 6 ++---- .../forecasting_encoder/seq_encoder/__init__.py | 5 +++-- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py index b052ce5fc..e9e421447 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Union import numpy as np diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 586a479e7..244fbeea1 100644 --- 
a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Optional import numpy as np diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py index 3a6e678d8..152936e1b 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/RNNEncoder.py @@ -96,10 +96,8 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: return encoder def n_encoder_output_feature(self) -> int: - if self.config['bidirectional']: - return 2 * self.config['hidden_size'] # type: int - else: - return self.config['hidden_size'] # type: int + hidden_size: int = self.config['hidden_size'] + return 2 * hidden_size if self.config['bidirectional'] else hidden_size def n_hidden_states(self) -> int: if self.config['cell_type'] == 'lstm': diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index 20319b2bf..d7b6ec04e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -596,8 +596,9 @@ def get_hyperparameter_search_space( # type: ignore[override] ForbiddenEqualsClause(hp_mlp_has_local_layer, False), ForbiddenEqualsClause(skip_connection_hp, True), )) - c1 = isinstance(use_temporal_fusion_hp, CategoricalHyperparameter) \ - and True in use_temporal_fusion_hp.choices + c1 = isinstance( + use_temporal_fusion_hp, CategoricalHyperparameter + ) and True in use_temporal_fusion_hp.choices c2 = isinstance(use_temporal_fusion_hp, Constant) and skip_connection_hp.value if c1 or c2: if True in use_temporal_fusion_hp.choices: From 168b7cf282fea63daca4422803c54a42c82dc774 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 31 May 2022 18:11:50 +0200 Subject: [PATCH 300/347] maint mypy --- .../forecasting_encoder/flat_encoder/MLPEncoder.py | 3 ++- .../forecasting_encoder/seq_encoder/InceptionTimeEncoder.py | 3 ++- .../forecasting_encoder/seq_encoder/TCNEncoder.py | 3 ++- .../forecasting_encoder/seq_encoder/TransformerEncoder.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py index 70471b12c..10e67ff8a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/flat_encoder/MLPEncoder.py @@ -100,7 +100,8 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: def 
n_encoder_output_feature(self) -> int: # This function should never be called!! - return self.config["num_units_%d" % (self.config['num_groups'])] # type: int + num_out_features: int = self.config["num_units_%d" % (self.config['num_groups'])] + return num_out_features def _add_layer(self, layers: List[nn.Module], in_features: int, out_features: int, layer_id: int) -> None: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py index ca1bad4e4..f31de69af 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py @@ -147,7 +147,8 @@ def build_encoder(self, input_shape: Tuple[int, ...] = (0,)) -> nn.Module: def n_encoder_output_feature(self) -> int: # see _InceptionBlock.forward() - return self.config['num_filters'] * 4 # type: int + num_filters_out: int = self.config['num_filters'] + return num_filters_out * 4 @staticmethod def allowed_decoders() -> List[str]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py index 5ec427568..ee9293e8d 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TCNEncoder.py @@ -142,7 +142,8 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: def n_encoder_output_feature(self) -> int: num_blocks = self.config["num_blocks"] - return self.config[f"num_filters_{num_blocks}"] # type: int + num_filter_out: int = self.config[f"num_filters_{num_blocks}"] + return num_filter_out @staticmethod def allowed_decoders() -> List[str]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index 0641056d4..fb851acbb 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -101,7 +101,8 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: return encoder def n_encoder_output_feature(self) -> int: - return 2 ** self.config['d_model_log'] # type: int + d_model_log: int = self.config['d_model_log'] + return 2 ** d_model_log @staticmethod def allowed_decoders() -> List[str]: From 88c23548701c43a70355e96f66a0e68da047acd4 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 31 May 2022 19:20:06 +0200 Subject: [PATCH 301/347] mypy!!! 
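A note on the recurring fix in this commit and the previous one: mypy cannot narrow the value type of a lookup into an untyped Dict[str, Any] config, so a statement like `return self.config['hidden_size']  # type: int` is still reported as returning Any when warn_return_any is active (and the trailing type comment on a return has no effect). Binding the lookup to an annotated local variable first removes the warning without a `# type: ignore`. A minimal, self-contained sketch of the pattern follows; the class name and config keys are illustrative only and are not autoPyTorch code:

    from typing import Any, Dict


    class _SketchEncoder:
        """Illustrative stand-in for an encoder component; not part of autoPyTorch."""

        def __init__(self, config: Dict[str, Any]) -> None:
            self.config = config

        def n_encoder_output_feature(self) -> int:
            # Before: `return self.config['hidden_size']  # type: int`
            # mypy (with warn_return_any) flags that as returning Any.
            # After: bind the lookup to an annotated local, then return it.
            hidden_size: int = self.config['hidden_size']
            return 2 * hidden_size if self.config['bidirectional'] else hidden_size


    if __name__ == '__main__':
        enc = _SketchEncoder({'hidden_size': 64, 'bidirectional': True})
        print(enc.n_encoder_output_feature())  # prints 128
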
--- autoPyTorch/api/time_series_forecasting.py | 6 +-- .../data/time_series_feature_validator.py | 2 +- autoPyTorch/datasets/time_series_dataset.py | 19 +++++----- ...time_series_forecasting_train_evaluator.py | 4 +- autoPyTorch/optimizer/smbo.py | 12 +----- .../encoding/time_series_base_encoder.py | 4 +- .../forecasting_training_loss/__init__.py | 2 +- .../setup/network/forecasting_architecture.py | 8 ++-- .../forecasting_backbone/__init__.py | 12 +++--- .../forecasting_decoder/MLPDecoder.py | 8 ++-- .../forecasting_decoder/NBEATSDecoder.py | 2 +- .../forecasting_encoder/__init__.py | 13 ++++--- .../forecasting_encoder/components.py | 2 +- .../seq_encoder/TransformerEncoder.py | 4 +- .../seq_encoder/__init__.py | 38 +++++++++++-------- .../forecasting_head.py | 26 ++++++------- .../setup/network_initializer/__init__.py | 2 +- .../time_series_forecasting_data_loader.py | 8 ++-- 18 files changed, 86 insertions(+), 86 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 46a75833a..4fab5bb2a 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -428,7 +428,7 @@ def search( **forecasting_dataset_kwargs, ) - if self.dataset.base_window_size is not None and not self.customized_window_size: + if not self.customized_window_size: base_window_size = int(np.ceil(self.dataset.base_window_size)) # we don't want base window size to large, which might cause a too long computation time, in which case # we will use n_prediction_step instead (which is normally smaller than base_window_size) @@ -482,7 +482,7 @@ def search( disable_file_output=disable_file_output, load_models=load_models, portfolio_selection=portfolio_selection, - **forecasting_kwargs, + **forecasting_kwargs, # type: ignore[arg-type] ) def predict( @@ -500,7 +500,7 @@ def predict( target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, (used for multi-variable prediction), indicates which value needs to be predicted """ - if not isinstance(X_test[0], TimeSeriesSequence): + if X_test is None or not isinstance(X_test[0], TimeSeriesSequence): assert past_targets is not None # Validate and construct TimeSeriesSequence X_test, _, _, _ = self.dataset.transform_data_into_time_series_sequence( diff --git a/autoPyTorch/data/time_series_feature_validator.py b/autoPyTorch/data/time_series_feature_validator.py index 462d49a0b..962da78a8 100644 --- a/autoPyTorch/data/time_series_feature_validator.py +++ b/autoPyTorch/data/time_series_feature_validator.py @@ -137,7 +137,7 @@ def fit( static_features: pd.Series = ( X_train.groupby(X_train.index).nunique() <= 1 ).all() - self.static_features = tuple( + self.static_features = tuple( # type: ignore[assignment] idx for idx in static_features.index if static_features[idx] ) return self diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 0fd33dd81..f34a13adf 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -106,7 +106,7 @@ def __init__(self, val_transforms: Optional[torchvision.transforms.Compose] = None, n_prediction_steps: int = 1, sp: int = 1, - known_future_features_index: Optional[List[int]] = None, + known_future_features_index: Optional[Tuple[int]] = None, compute_mase_coefficient_value: bool = True, time_features: Optional[np.ndarray] = None, is_test_set: bool = False, @@ -470,7 +470,7 @@ def __init__(self, self.seasonality = int(seasonality) 
self.freq: str = freq - self.freq_value: Real = freq_value + self.freq_value: Union[float, int] = freq_value self.n_prediction_steps = n_prediction_steps @@ -610,7 +610,7 @@ def __init__(self, self.shuffle = shuffle self.random_state = np.random.RandomState(seed=seed) - resampling_strategy, resampling_strategy_args = self.get_split_strategy( + resampling_strategy_opt, resampling_strategy_args_opt = self.get_split_strategy( sequence_lengths=sequence_lengths, n_prediction_steps=n_prediction_steps, freq_value=self.freq_value, @@ -618,8 +618,8 @@ def __init__(self, resampling_strategy_args=resampling_strategy_args ) - self.resampling_strategy = resampling_strategy - self.resampling_strategy_args = resampling_strategy_args + self.resampling_strategy = resampling_strategy_opt + self.resampling_strategy_args = resampling_strategy_args_opt if isinstance(self.resampling_strategy, CrossValTypes): self.cross_validators = CrossValFuncs.get_cross_validators(self.resampling_strategy) @@ -898,7 +898,7 @@ def make_sequences_datasets(X: Optional[pd.DataFrame], def replace_data(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame], - known_future_features_index: Union[Tuple[int], Tuple[()]] = tuple()) -> 'BaseDataset': + known_future_features_index: Union[Tuple[int], Tuple[()]] = ()) -> 'BaseDataset': super(TimeSeriesForecastingDataset, self).replace_data(X_train=X_train, X_test=X_test) if X_train is None: return self @@ -988,7 +988,7 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], Optional[ num_splits=cast(int, num_splits), n_repeats=n_repeats )) - elif self.resampling_strategy is None: + elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): splits.append(self.create_refit_split()) else: raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") @@ -1028,8 +1028,9 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> def get_split_strategy(sequence_lengths: List[int], n_prediction_steps: int, freq_value: Union[float, int], - resampling_strategy: Optional[Union[ - CrossValTypes, HoldoutValTypes]] = HoldoutValTypes.time_series_hold_out_validation, + resampling_strategy: Union[ + CrossValTypes, HoldoutValTypes, + NoResamplingStrategyTypes] = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, ) -> \ Tuple[Optional[Union[CrossValTypes, HoldoutValTypes]], Optional[Dict[str, Any]]]: """ diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 2b0c3b220..e41e69fc6 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -112,7 +112,7 @@ def fit_predict_and_loss(self) -> None: test_indices=test_split, add_pipeline_to_self=True) - mase_coefficient_val = self.generate_mase_coefficient_for_validation(test_split) + mase_coefficient_val = self.generate_mase_coefficient_for_validation(test_split) # type: ignore[arg-type] forecasting_kwargs = {'sp': self.seasonality, 'n_prediction_steps': self.n_prediction_steps, @@ -165,7 +165,7 @@ def fit_predict_and_loss(self) -> None: mase_coefficient_val_all = [] for train_split, test_split in self.splits: - mase_coefficient = self.generate_mase_coefficient_for_validation(test_split) + mase_coefficient = self.generate_mase_coefficient_for_validation(test_split) # type: ignore[arg-type] mase_coefficient_val_all.append(mase_coefficient) 
forecasting_kwargs = {'sp': self.seasonality, diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 174dd19c4..41ec1ae25 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -44,7 +44,7 @@ def get_smac_object( ta: Callable, ta_kwargs: Dict[str, Any], n_jobs: int, - initial_budget: int, + initial_budget: Union[int, float], max_budget: Union[int, float], dask_client: Optional[dask.distributed.Client], initial_configurations: Optional[List[Configuration]] = None, @@ -297,15 +297,7 @@ def __init__(self, initial_configurations = read_return_initial_configurations(config_space=config_space, portfolio_selection=portfolio_selection) - self.initial_configurations = initial_configurations if len(initial_configurations) > 0 else None - - def reset_data_manager(self) -> None: - if self.datamanager is not None: - del self.datamanager - self.datamanager = self.backend.load_datamanager() - - if self.datamanager is not None and self.datamanager.task_type is not None: - self.task = self.datamanager.task_type + self.initial_configurations = initial_configurations if len(initial_configurations) > 0 else Non def run_smbo(self, func: Optional[Callable] = None ) -> Tuple[RunHistory, List[TrajEntry], str]: diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py index da9ad016f..6fa5b69be 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Tuple, Union +from typing import Any, Dict, List, Union from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder import \ BaseEncoder @@ -16,7 +16,7 @@ def __init__(self) -> None: self.add_fit_requirements([ FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), FitRequirement('categories', (List,), user_defined=True, dataset_property=True), - FitRequirement('feature_names', (Tuple,), user_defined=True, dataset_property=True), + FitRequirement('feature_names', (tuple,), user_defined=True, dataset_property=True), FitRequirement('feature_shapes', (Dict, ), user_defined=True, dataset_property=True), ]) self.feature_shapes: Union[Dict[str, int]] = {} diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py index b15ec761a..dc20fe313 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py @@ -190,4 +190,4 @@ def get_hyperparameter_search_space( def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) # ignore[no-any-return] + return self.choice.transform(X) # type: ignore[no-any-return] diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index b51cdcd0f..f514dc3d0 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ 
b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -951,9 +951,9 @@ def forward(self, encoder_input = encoder_input.to(self.device) encoder_input = self.embedding(encoder_input) - static_context_initial_hidden = None + static_context_initial_hidden = None # type: ignore[assignment] - encoder_additional = [static_context_initial_hidden] + encoder_additional: List[Optional[torch.Tensor]] = [static_context_initial_hidden] encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) encoder2decoder, encoder_output = self.encoder(encoder_input=encoder_input, @@ -1024,12 +1024,12 @@ def forward(self, encoder_input = encoder_input.to(self.device) encoder_input = self.embedding(encoder_input) - static_context_initial_hidden = None + static_context_initial_hidden = None # type: ignore[assignment] all_samples = [] batch_size: int = past_targets.shape[0] - encoder_additional = [static_context_initial_hidden] + encoder_additional: List[Optional[torch.Tensor]] = [static_context_initial_hidden] encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) encoder2decoder, encoder_output = self.encoder(encoder_input=encoder_input, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index b6e9f85c2..39a1a5611 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -41,7 +41,7 @@ def __init__(self, "seq_encoder": SeqForecastingEncoderChoice(dataset_properties=self.dataset_properties, random_state=self.random_state)}) - def get_components(self) -> Dict[str, autoPyTorchComponent]: + def get_components(self) -> Dict[str, AbstractForecastingEncoderChoice]: # type: ignore[override] """Returns the available backbone components Args: @@ -51,7 +51,7 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: Dict[str, autoPyTorchComponent]: all basebackbone components available as choices for learning rate scheduling """ - return self.default_components # type: ignore[return-value] + return self.default_components def get_available_components( # type: ignore[override] self, @@ -86,7 +86,7 @@ def get_available_components( # type: ignore[override] if components is None: available_comp = self.get_components() else: - available_comp = components + available_comp = components # type: ignore[assignment] if include is not None: include_top = set() @@ -219,10 +219,10 @@ def get_hyperparameter_search_space( exclude_encoder = self.exclude_components[name] config_space = available_encoders[name].get_hyperparameter_search_space( - dataset_properties=dataset_properties, # type: ignore + dataset_properties=dataset_properties, include=include_encoder, exclude=exclude_encoder, - **updates # type: ignore[call-args] + **updates # type: ignore[call-arg] ) parent_hyperparameter = {'parent': hp_encoder, 'value': name} cs.add_configuration_space( @@ -262,7 +262,7 @@ def set_hyperparameters(self, self.new_params = new_params sub_configuration_space = choice_component.get_hyperparameter_search_space( self.dataset_properties, - **updates # type: ignore[call-args] + **updates # type: ignore[call-arg] ) sub_configuration = Configuration(sub_configuration_space, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py 
b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index 472093576..c67e98b98 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -212,16 +212,16 @@ def get_hyperparameter_search_space( cond_units_local_layer = EqualsCondition(units_local_layer, has_local_layer, True) if can_be_auto_regressive: - auto_regressive: CategoricalHyperparameter = get_hyperparameter( # type:ignore[no-redef] + auto_regressive_hp: CategoricalHyperparameter = get_hyperparameter( # type:ignore[assignment] auto_regressive, CategoricalHyperparameter ) - cs.add_hyperparameters([auto_regressive]) + cs.add_hyperparameters([auto_regressive_hp]) - if False in auto_regressive.choices: + if False in auto_regressive_hp.choices: cs.add_hyperparameters([has_local_layer, units_local_layer]) cs.add_conditions([cond_units_local_layer]) - cond_use_local_layer = EqualsCondition(has_local_layer, auto_regressive, False) + cond_use_local_layer = EqualsCondition(has_local_layer, auto_regressive_hp, False) cs.add_conditions([cond_use_local_layer]) return cs else: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 35c181193..e7bc605e4 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -85,7 +85,7 @@ def _add_layer(self, layers: List[nn.Module], in_features: int) -> None: def forward(self, x_future: Optional[torch.Tensor], encoder_output: torch.Tensor, pos_idx: Optional[Tuple[int]] = None) -> Union[nn.Module, Tuple[nn.Module, nn.Module]]: - if self.backcast_head is None and self.forecast_head is None: + if self.backcast_head is None or self.forecast_head is None: # used to compute head dimensions return self.backbone(encoder_output) else: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index 8f41c541b..e6b7514ed 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -214,9 +214,10 @@ def get_hyperparameter_search_space( encoder2decoder: Dict[str, List[str]] = {} for encoder_name in hp_encoder.choices: updates = self._get_search_space_updates(prefix=encoder_name) - config_space = available_encoders[encoder_name].get_hyperparameter_search_space( # type: ignore[call-args] + config_space = available_encoders[encoder_name].get_hyperparameter_search_space( dataset_properties, - **updates) + **updates # type: ignore[call-args] + ) parent_hyperparameter = {'parent': hp_encoder, 'value': encoder_name} cs.add_configuration_space( encoder_name, @@ -242,9 +243,9 @@ def get_hyperparameter_search_space( if not decoder2encoder[decoder_name]: continue updates = self._get_search_space_updates(prefix=decoder_name) - config_space = 
available_decoders[decoder_name].get_hyperparameter_search_space( # type: ignore[call-args] + config_space = available_decoders[decoder_name].get_hyperparameter_search_space( dataset_properties, - **updates + **updates # type: ignore[call-args] ) compatible_encoders = decoder2encoder[decoder_name] encoders_with_multi_decoder = [] @@ -379,6 +380,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> Pipeline: # type: ignore[ove assert self.pipeline is not None, "Cannot call fit without initializing the component" return self.pipeline.fit(X, y) - def transform(self, X: Dict) -> Dict: + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.pipeline is not None, "Cannot call transform before the object is initialized" - return self.pipeline.transform(X) + return self.pipeline.transform(X) # type: ignore[no-any-return] \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py index 7f7b7e761..65fd5032c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py @@ -10,7 +10,7 @@ class EncoderProperties(NamedTuple): bijective_seq_output: bool = True fixed_input_seq_length: bool = False lagged_input: bool = False - causality: bool = True # this value indicates if the output of the model only depends on the past targets + is_casual: bool = True # this value indicates if the output of the model only depends on the past targets class EncoderBlockInfo(NamedTuple): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index fb851acbb..fb3c1f9c6 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -101,8 +101,8 @@ def build_encoder(self, input_shape: Tuple[int, ...]) -> nn.Module: return encoder def n_encoder_output_feature(self) -> int: - d_model_log: int = self.config['d_model_log'] - return 2 ** d_model_log + d_model: int = 2 ** self.config['d_model_log'] + return d_model @staticmethod def allowed_decoders() -> List[str]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index d7b6ec04e..ec5d94f7f 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -1,7 +1,7 @@ import inspect import os from collections import OrderedDict -from typing import Any, Dict, List, Optional, Type, Union +from typing import Any, Dict, List, Optional, Type, Tuple, Union import ConfigSpace as CS from ConfigSpace.conditions import ( @@ -166,13 +166,14 @@ def 
get_hyperparameter_search_space( # type: ignore[override] if dataset_properties is None: dataset_properties = {} - static_features_shape = dataset_properties.get("static_features_shape", 0) - future_feature_shapes = dataset_properties.get("future_feature_shapes", (0,)) + static_features_shape: int = dataset_properties.get("static_features_shape", 0) # type: ignore[assignment] + future_feature_shapes: Tuple[int] = dataset_properties.get("future_feature_shapes", # type: ignore[assignment] + (0,)) cs = ConfigurationSpace() - min_num_blocks: int = num_blocks.value_range[0] - max_num_blocks: int = num_blocks.value_range[1] + min_num_blocks: int = num_blocks.value_range[0] # type: ignore[assignment] + max_num_blocks: int = num_blocks.value_range[1] # type: ignore[assignment] variable_selection_hp: CategoricalHyperparameter = get_hyperparameter( # type: ignore[assignment] variable_selection, CategoricalHyperparameter) @@ -241,11 +242,11 @@ def get_hyperparameter_search_space( # type: ignore[override] cs.add_condition(EqualsCondition(share_single_variable_networks, variable_selection_hp, True)) # Compile a list of legal preprocessors for this problem - available_encoders: Dict[str, BaseForecastingEncoder] = self.get_available_components( # type: ignore[call-arg] + available_encoders: Dict[str, BaseForecastingEncoder] = self.get_available_components( # type: ignore dataset_properties=dataset_properties, include=include, exclude=exclude) - available_decoders: Dict[str, BaseForecastingDecoder] = self.get_available_components( # type: ignore[call-arg] + available_decoders: Dict[str, BaseForecastingDecoder] = self.get_available_components( # type: ignore dataset_properties=dataset_properties, include=None, exclude=exclude, components=self.get_decoder_components()) @@ -339,8 +340,10 @@ def get_hyperparameter_search_space( # type: ignore[override] encoder2decoder = {} for encoder_name in hp_encoder.choices: updates = self._get_search_space_updates(prefix=block_prefix + encoder_name) - config_space = available_encoders[encoder_name].get_hyperparameter_search_space(dataset_properties, - **updates) + config_space = available_encoders[encoder_name].get_hyperparameter_search_space( + dataset_properties, + **updates # type: ignore[call-args] + ) allowed_decoders = available_encoders[encoder_name].allowed_decoders() if len(allowed_decoders) > 1: if 'decoder_type' not in config_space: @@ -363,7 +366,8 @@ def get_hyperparameter_search_space( # type: ignore[override] ) config_space = available_encoders[encoder_name].get_hyperparameter_search_space( dataset_properties, - **updates) + **updates # type: ignore[call-args] + ) hp_decoder_choice = recurrent_decoders else: cs.add_forbidden_clause(ForbiddenEqualsClause(hp_encoder, encoder_name)) @@ -383,7 +387,8 @@ def get_hyperparameter_search_space( # type: ignore[override] default_value=valid_decoders[0]) config_space = available_encoders[encoder_name].get_hyperparameter_search_space( dataset_properties, - **updates) + **updates # type: ignore[call-args] + ) parent_hyperparameter = {'parent': hp_encoder, 'value': encoder_name} cs.add_configuration_space( block_prefix + encoder_name, @@ -397,11 +402,12 @@ def get_hyperparameter_search_space( # type: ignore[override] updates = self._get_search_space_updates(prefix=block_prefix + decoder_name) if i == 1 and decoder_name == self.deepAR_decoder_name: # TODO this is only a temporary solution, a fix on ConfigSpace needs to be implemented - updates['can_be_auto_regressive'] = True + updates['can_be_auto_regressive'] = True # 
type: ignore[assignment] - config_space = available_decoders[decoder_name].get_hyperparameter_search_space(dataset_properties, - # type: ignore - **updates) + config_space = available_decoders[decoder_name].get_hyperparameter_search_space( + dataset_properties, + **updates # type: ignore + ) compatible_encoders = decoder2encoder[decoder_name] encoders_with_multi_decoder_l = [] encoder_with_single_decoder_l = [] @@ -527,7 +533,7 @@ def get_hyperparameter_search_space( # type: ignore[override] ) for encoder_name, encoder in available_encoders.items(): - encoder_is_casual = encoder.encoder_properties() + encoder_is_casual = encoder.encoder_properties().is_casual if not encoder_is_casual: # we do not allow non-casual encoder to appear in the lower layer of the network. e.g, if we have an # encoder with 3 blocks, then non_casual encoder is only allowed to appear in the third layer diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 214d47d85..8c59a4c32 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -183,36 +183,36 @@ def build_head(self, # type: ignore[override] """ if net_output_type == 'distribution': assert dist_cls is not None - proj_layer = ALL_DISTRIBUTIONS[dist_cls](num_in_features=head_n_in_features, - output_shape=output_shape[1:], - n_prediction_heads=n_prediction_heads, - decoder_has_local_layer=decoder_has_local_layer - ) - return proj_layer + proj_layer_d = ALL_DISTRIBUTIONS[dist_cls](num_in_features=head_n_in_features, + output_shape=output_shape[1:], + n_prediction_heads=n_prediction_heads, + decoder_has_local_layer=decoder_has_local_layer + ) + return proj_layer_d elif net_output_type == 'regression': if decoder_has_local_layer: - proj_layer = nn.Sequential(nn.Linear(head_n_in_features, np.product(output_shape[1:]))) + proj_layer_r = nn.Sequential(nn.Linear(head_n_in_features, np.product(output_shape[1:]))) else: - proj_layer = nn.Sequential( + proj_layer_r = nn.Sequential( nn.Linear(head_n_in_features, n_prediction_heads * np.product(output_shape[1:])), nn.Unflatten(-1, (n_prediction_heads, *output_shape[1:])), ) - return proj_layer + return proj_layer_r elif net_output_type == "quantile": if decoder_has_local_layer: - proj_layer = [ # type: ignore[assignment] + proj_layer_quantiles = [ nn.Sequential(nn.Linear(head_n_in_features, np.product(output_shape[1:]))) for _ in range(num_quantiles) ] else: - proj_layer = [ # type: ignore[assignment] + proj_layer_quantiles = [ nn.Sequential( nn.Linear(head_n_in_features, n_prediction_heads * np.product(output_shape[1:])), nn.Unflatten(-1, (n_prediction_heads, *output_shape[1:])), ) for _ in range(num_quantiles) ] - proj_layer = QuantileHead(proj_layer) - return proj_layer + proj_layer_q = QuantileHead(proj_layer_quantiles) + return proj_layer_q else: raise NotImplementedError(f"Unsupported network type " f"{net_output_type} (should be one of the following: " diff --git a/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py b/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py index a2d382999..bae589570 100644 --- a/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_initializer/__init__.py @@ -175,4 +175,4 @@ def 
get_hyperparameter_search_space( def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.choice is not None, "Cannot call transform before the object is initialized" - return self.choice.transform(X) + return self.choice.transform(X) # type: ignore[no-any-return] diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index cd90c7754..5b6db9d05 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -92,13 +92,13 @@ def __init__(self, self.num_batches_per_epoch = num_batches_per_epoch if num_batches_per_epoch is not None else np.inf self.padding_collector: Optional[Callable] = None - self.known_future_features_index: Union[Tuple[int], Tuple[()]] = tuple() + self.known_future_features_index: Union[Tuple[int], Tuple[()]] = () self._is_uni_variant = False self.transform_time_features = transform_time_features self.freq = "1Y" self.time_feature_transform: List[TimeFeature] = [] - self.dataset_columns: Union[Tuple[Union[int, str]], Tuple[()]] = tuple() + self.dataset_columns: Union[Tuple[Union[int, str]], Tuple[()]] = () self.sampler_train: Optional[Union[Iterator, torch.utils.data.sampler.Sampler]] = None # Applied for get loader @@ -204,7 +204,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: max_lagged_value += self.window_size + self.n_prediction_steps # we want the feature names from the raw dataset - self.dataset_columns = datamanager.feature_names + self.dataset_columns = datamanager.feature_names # type: ignore[assignment] known_future_features_index = extract_feature_index( feature_shapes=X['dataset_properties']['feature_shapes'], @@ -357,7 +357,7 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform # We transform to tensor under dataset return torchvision.transforms.Compose(candidate_transformations) - def get_loader(self, X: Union[np.ndarray, TimeSeriesSequence], y: Optional[np.ndarray] = None, + def get_loader(self, X: Union[TimeSeriesSequence, List[TimeSeriesSequence]], y: Optional[np.ndarray] = None, batch_size: int = np.iinfo(np.int32).max, ) -> torch.utils.data.DataLoader: """ From 374cc1df3fc3f723d21776e16cfa382cc0c9c395 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 31 May 2022 19:34:13 +0200 Subject: [PATCH 302/347] pre-commit --- autoPyTorch/datasets/time_series_dataset.py | 7 +++---- autoPyTorch/optimizer/smbo.py | 2 +- .../setup/network/forecasting_architecture.py | 2 +- .../forecasting_backbone/__init__.py | 6 +++--- .../forecasting_encoder/__init__.py | 13 +++++++++---- .../seq_encoder/TransformerEncoder.py | 2 +- .../forecasting_encoder/seq_encoder/__init__.py | 12 ++++++------ .../time_series_forecasting_data_loader.py | 2 +- 8 files changed, 25 insertions(+), 21 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index f34a13adf..0b356e228 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -3,7 +3,6 @@ import os import uuid import warnings -from numbers import Real from typing import Any, Dict, List, Optional, Tuple, Union, cast from gluonts.time_feature import Constant as ConstantTransform @@ -618,7 +617,7 @@ def __init__(self, 
resampling_strategy_args=resampling_strategy_args ) - self.resampling_strategy = resampling_strategy_opt + self.resampling_strategy = resampling_strategy_opt # type: ignore[assignment] self.resampling_strategy_args = resampling_strategy_args_opt if isinstance(self.resampling_strategy, CrossValTypes): @@ -898,7 +897,7 @@ def make_sequences_datasets(X: Optional[pd.DataFrame], def replace_data(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame], - known_future_features_index: Union[Tuple[int], Tuple[()]] = ()) -> 'BaseDataset': + known_future_features_index: Optional[Tuple[int]] = None) -> 'BaseDataset': super(TimeSeriesForecastingDataset, self).replace_data(X_train=X_train, X_test=X_test) if X_train is None: return self @@ -1032,7 +1031,7 @@ def get_split_strategy(sequence_lengths: List[int], CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes] = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, ) -> \ - Tuple[Optional[Union[CrossValTypes, HoldoutValTypes]], Optional[Dict[str, Any]]]: + Tuple[Optional[Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]], Optional[Dict[str, Any]]]: """ Determines the most possible sampling strategy for the datasets: the lengths of each sequence might not be long enough to support cross-validation split, thus we need to carefully compute the number of folds diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 41ec1ae25..e5a0c3f2a 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -297,7 +297,7 @@ def __init__(self, initial_configurations = read_return_initial_configurations(config_space=config_space, portfolio_selection=portfolio_selection) - self.initial_configurations = initial_configurations if len(initial_configurations) > 0 else Non + self.initial_configurations = initial_configurations if len(initial_configurations) > 0 else None def run_smbo(self, func: Optional[Callable] = None ) -> Tuple[RunHistory, List[TrajEntry], str]: diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index f514dc3d0..424fdca54 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -1029,7 +1029,7 @@ def forward(self, all_samples = [] batch_size: int = past_targets.shape[0] - encoder_additional: List[Optional[torch.Tensor]] = [static_context_initial_hidden] + encoder_additional: List[Optional[torch.Tensor]] = [static_context_initial_hidden] # type: ignore[no-redef] encoder_additional.extend([None] * (self.network_structure.num_blocks - 1)) encoder2decoder, encoder_output = self.encoder(encoder_input=encoder_input, diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index 39a1a5611..1da656f16 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -222,7 +222,7 @@ def get_hyperparameter_search_space( dataset_properties=dataset_properties, include=include_encoder, exclude=exclude_encoder, - **updates # type: ignore[call-arg] + **updates # type: ignore[call-arg, arg-type] ) parent_hyperparameter = {'parent': hp_encoder, 'value': name} 
cs.add_configuration_space( @@ -262,12 +262,12 @@ def set_hyperparameters(self, self.new_params = new_params sub_configuration_space = choice_component.get_hyperparameter_search_space( self.dataset_properties, - **updates # type: ignore[call-arg] + **updates # type: ignore[call-arg, arg-type] ) sub_configuration = Configuration(sub_configuration_space, values=new_params) - self.choice = choice_component.set_hyperparameters(sub_configuration) + self.choice = choice_component.set_hyperparameters(sub_configuration) # type: ignore[assignment] return self diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index e6b7514ed..002a87034 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -2,7 +2,7 @@ import warnings from abc import abstractmethod from collections import OrderedDict -from typing import Any, Callable, Dict, List, Optional, Type +from typing import Any, Callable, Dict, List, Optional, Type, Union import ConfigSpace.hyperparameters as CSH from ConfigSpace.conditions import EqualsCondition, OrConjunction @@ -216,7 +216,7 @@ def get_hyperparameter_search_space( updates = self._get_search_space_updates(prefix=encoder_name) config_space = available_encoders[encoder_name].get_hyperparameter_search_space( dataset_properties, - **updates # type: ignore[call-args] + **updates # type: ignore[call-arg] ) parent_hyperparameter = {'parent': hp_encoder, 'value': encoder_name} cs.add_configuration_space( @@ -245,7 +245,7 @@ def get_hyperparameter_search_space( updates = self._get_search_space_updates(prefix=decoder_name) config_space = available_decoders[decoder_name].get_hyperparameter_search_space( dataset_properties, - **updates # type: ignore[call-args] + **updates # type: ignore[call-arg] ) compatible_encoders = decoder2encoder[decoder_name] encoders_with_multi_decoder = [] @@ -382,4 +382,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> Pipeline: # type: ignore[ove def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: assert self.pipeline is not None, "Cannot call transform before the object is initialized" - return self.pipeline.transform(X) # type: ignore[no-any-return] \ No newline at end of file + return self.pipeline.transform(X) # type: ignore[no-any-return] + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + raise NotImplementedError diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index fb3c1f9c6..262288d7b 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -114,7 +114,7 @@ def allowed_decoders() -> List[str]: @staticmethod def encoder_properties() -> EncoderProperties: return EncoderProperties(lagged_input=True, - causality=False) + is_casual=False) def fit(self, X: 
Dict[str, Any], y: Any = None) -> BaseEstimator: if 'lagged_value' in X['dataset_properties']: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index ec5d94f7f..fbf41bfc8 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -1,7 +1,7 @@ import inspect import os from collections import OrderedDict -from typing import Any, Dict, List, Optional, Type, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Type, Union import ConfigSpace as CS from ConfigSpace.conditions import ( @@ -167,7 +167,7 @@ def get_hyperparameter_search_space( # type: ignore[override] dataset_properties = {} static_features_shape: int = dataset_properties.get("static_features_shape", 0) # type: ignore[assignment] - future_feature_shapes: Tuple[int] = dataset_properties.get("future_feature_shapes", # type: ignore[assignment] + future_feature_shapes: Tuple[int] = dataset_properties.get("future_feature_shapes", # type: ignore[assignment] (0,)) cs = ConfigurationSpace() @@ -342,7 +342,7 @@ def get_hyperparameter_search_space( # type: ignore[override] updates = self._get_search_space_updates(prefix=block_prefix + encoder_name) config_space = available_encoders[encoder_name].get_hyperparameter_search_space( dataset_properties, - **updates # type: ignore[call-args] + **updates # type: ignore[call-arg] ) allowed_decoders = available_encoders[encoder_name].allowed_decoders() if len(allowed_decoders) > 1: @@ -366,7 +366,7 @@ def get_hyperparameter_search_space( # type: ignore[override] ) config_space = available_encoders[encoder_name].get_hyperparameter_search_space( dataset_properties, - **updates # type: ignore[call-args] + **updates # type: ignore[call-arg] ) hp_decoder_choice = recurrent_decoders else: @@ -387,7 +387,7 @@ def get_hyperparameter_search_space( # type: ignore[override] default_value=valid_decoders[0]) config_space = available_encoders[encoder_name].get_hyperparameter_search_space( dataset_properties, - **updates # type: ignore[call-args] + **updates # type: ignore[call-arg] ) parent_hyperparameter = {'parent': hp_encoder, 'value': encoder_name} cs.add_configuration_space( @@ -406,7 +406,7 @@ def get_hyperparameter_search_space( # type: ignore[override] config_space = available_decoders[decoder_name].get_hyperparameter_search_space( dataset_properties, - **updates # type: ignore + **updates # type: ignore[call-arg] ) compatible_encoders = decoder2encoder[decoder_name] encoders_with_multi_decoder_l = [] diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 5b6db9d05..34750f08b 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -382,7 +382,7 @@ def get_loader(self, X: Union[TimeSeriesSequence, List[TimeSeriesSequence]], y: sequence_lengths_test[seq_idx] = len(x_seq.X_test) series_number_test = np.arange(len(sequence_lengths_test)).repeat(sequence_lengths_test) - if not X[0].is_pre_processed: + 
if not X[0].is_pre_processed: # type: ignore[union-attr] x_all = pd.DataFrame(np.concatenate([x_seq.X for x_seq in X]), columns=self.dataset_columns) From 4898ca5ef492b465ad1331df2b380b10be070087 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 31 May 2022 19:41:43 +0200 Subject: [PATCH 303/347] mypyyyyyyyyyyyyyyyyyyyyyyyy --- autoPyTorch/datasets/time_series_dataset.py | 2 -- .../forecasting_backbone/forecasting_encoder/__init__.py | 4 ++-- .../forecasting_encoder/seq_encoder/__init__.py | 9 +++++---- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 0b356e228..d894dbba3 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -1045,8 +1045,6 @@ def get_split_strategy(sequence_lengths: List[int], resampling_strategy(Optional[Union[CrossValTypes, HoldoutValTypes]]): resampling strategy resampling_strategy_args (Optional[Dict[str, Any]]): resampling strategy arguments """ - if resampling_strategy is None: - return None, None # check if dataset could be split with cross validation minimal_seq_length = np.min(sequence_lengths) - n_prediction_steps if isinstance(resampling_strategy, CrossValTypes): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index 002a87034..56cd21e68 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -214,7 +214,7 @@ def get_hyperparameter_search_space( encoder2decoder: Dict[str, List[str]] = {} for encoder_name in hp_encoder.choices: updates = self._get_search_space_updates(prefix=encoder_name) - config_space = available_encoders[encoder_name].get_hyperparameter_search_space( + config_space = available_encoders[encoder_name].get_hyperparameter_search_space( # type: ignore[call-arg] dataset_properties, **updates # type: ignore[call-arg] ) @@ -243,7 +243,7 @@ def get_hyperparameter_search_space( if not decoder2encoder[decoder_name]: continue updates = self._get_search_space_updates(prefix=decoder_name) - config_space = available_decoders[decoder_name].get_hyperparameter_search_space( + config_space = available_decoders[decoder_name].get_hyperparameter_search_space( # type: ignore[call-arg] dataset_properties, **updates # type: ignore[call-arg] ) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index fbf41bfc8..0d2b23aee 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -340,7 +340,7 @@ def get_hyperparameter_search_space( # type: ignore[override] encoder2decoder = {} for encoder_name in hp_encoder.choices: updates = self._get_search_space_updates(prefix=block_prefix + encoder_name) - config_space = available_encoders[encoder_name].get_hyperparameter_search_space( + config_space = 
available_encoders[encoder_name].get_hyperparameter_search_space( # type: ignore dataset_properties, **updates # type: ignore[call-arg] ) @@ -364,7 +364,8 @@ def get_hyperparameter_search_space( # type: ignore[override] tuple(recurrent_decoders), recurrent_decoders[0] ) - config_space = available_encoders[encoder_name].get_hyperparameter_search_space( + ecd = available_encoders[encoder_name] + config_space = ecd.get_hyperparameter_search_space( # type:ignore dataset_properties, **updates # type: ignore[call-arg] ) @@ -385,7 +386,7 @@ def get_hyperparameter_search_space( # type: ignore[override] updates['decoder_type'] = HyperparameterSearchSpace(hyperparameter='decoder_type', value_range=tuple(valid_decoders), default_value=valid_decoders[0]) - config_space = available_encoders[encoder_name].get_hyperparameter_search_space( + config_space = available_encoders[encoder_name].get_hyperparameter_search_space( # type:ignore dataset_properties, **updates # type: ignore[call-arg] ) @@ -404,7 +405,7 @@ def get_hyperparameter_search_space( # type: ignore[override] # TODO this is only a temporary solution, a fix on ConfigSpace needs to be implemented updates['can_be_auto_regressive'] = True # type: ignore[assignment] - config_space = available_decoders[decoder_name].get_hyperparameter_search_space( + config_space = available_decoders[decoder_name].get_hyperparameter_search_space( # type: ignore dataset_properties, **updates # type: ignore[call-arg] ) From 694eebb92cbb5a4a6cff94d272863989c2948af5 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 13 Jun 2022 20:01:21 +0200 Subject: [PATCH 304/347] maint --- autoPyTorch/pipeline/time_series_forecasting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index f935e0ed2..1be5a6cd8 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -142,7 +142,7 @@ def _get_hyperparameter_search_space(self, if not isinstance(dataset_properties, dict): warnings.warn('The given dataset_properties argument contains an illegal value.' 
'Proceeding with the default value') - dataset_properties = dict() + dataset_properties = dict() if 'target_type' not in dataset_properties: dataset_properties['target_type'] = 'time_series_forecasting' From abd390094a01d06cbfbf92bbf52aea7d7e4fd11f Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 13 Jun 2022 20:11:18 +0200 Subject: [PATCH 305/347] move forcasting requirements to extras_require --- .github/workflows/pytest.yml | 4 ++-- requirements.txt | 5 +---- setup.py | 5 +++++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 5a5cce20e..2a3e87ee6 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -89,7 +89,7 @@ jobs: run: | git submodule update --init --recursive python -m pip install --upgrade pip - pip install -e .[test] + pip install -e .[forecasting, test] - name: Dist install if: matrix.kind == 'dist' @@ -98,7 +98,7 @@ jobs: python setup.py sdist last_dist=$(ls -t dist/autoPyTorch-*.tar.gz | head -n 1) - pip install $last_dist[test] + pip install $last_dist[forecasting, test] - name: Store repository status id: status-before diff --git a/requirements.txt b/requirements.txt index 202d192aa..3f37e131c 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,16 +7,13 @@ numpy scipy>=1.7 lockfile imgaug>=0.4.0 -ConfigSpace>=0.4.14,<0.5 +ConfigSpace>=0.5.0 pynisher>=0.6.3 pyrfr>=0.7,<0.9 smac>=1.2 -gluonts dask distributed>=2.2.0 catboost lightgbm flaky tabulate -sktime>=0.11.0 - diff --git a/setup.py b/setup.py index b9df7f21a..fff10a9d5 100755 --- a/setup.py +++ b/setup.py @@ -48,6 +48,11 @@ install_requires=requirements, include_package_data=True, extras_require={ + "forecasting": [ + "gluonts", + "sktime", + "pytorch-forecasting", + ], "test": [ "matplotlib", "pytest", From 776aa8458a281f803b0d1eeec4c25e626c54047a Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 11:58:18 +0200 Subject: [PATCH 306/347] bring eval_test to tae --- autoPyTorch/evaluation/tae.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 7f7013eb2..6c233f857 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -29,6 +29,7 @@ HoldoutValTypes, NoResamplingStrategyTypes ) +from autoPyTorch.evaluation.test_evaluator import eval_test_function from autoPyTorch.evaluation.train_evaluator import eval_train_function from autoPyTorch.evaluation.utils import ( DisableFileOutputParameters, @@ -149,7 +150,7 @@ def __init__( eval_function = functools.partial(eval_train_function, **eval_func_kwargs) self.output_y_hat_optimization = output_y_hat_optimization elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): - eval_function = functools.partial(eval_train_function, **eval_func_kwargs) + eval_function = functools.partial(eval_test_function, **eval_func_kwargs) self.output_y_hat_optimization = False self.worst_possible_result = cost_for_crash From f70e2b3c41070ae502981ba6b18b7974d4cb9032 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 12:08:10 +0200 Subject: [PATCH 307/347] make rh2epm consistent with SMAC4HPO --- autoPyTorch/optimizer/smbo.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index e5a0c3f2a..5c46d9c8a 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -13,7 +13,7 @@ from smac.intensification.hyperband import Hyperband from 
smac.intensification.intensification import Intensifier from smac.runhistory.runhistory import RunHistory -from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost, RunHistory2EPM4LogScaledCost +from smac.runhistory.runhistory2epm import RunHistory2EPM4LogScaledCost from smac.scenario.scenario import Scenario from smac.tae.dask_runner import DaskParallelRunner from smac.tae.serial_runner import SerialRunner @@ -73,13 +73,12 @@ def get_smac_object( if initial_budget == max_budget: intensifier = Intensifier intensifier_kwargs: Dict[str, Any] = {'deterministic': True, } - rh2EPM = RunHistory2EPM4LogScaledCost else: intensifier = Hyperband intensifier_kwargs = {'initial_budget': initial_budget, 'max_budget': max_budget, 'eta': 3, 'min_chall': 1, 'instance_order': 'shuffle_once'} - rh2EPM = RunHistory2EPM4LogCost + rh2EPM = RunHistory2EPM4LogScaledCost return SMAC4HPO( scenario=Scenario(scenario_dict), From 50f6f184bc09f16f6a68d177c2c3dcf926476fb2 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 12:11:35 +0200 Subject: [PATCH 308/347] remove smac4ac from smbo --- autoPyTorch/optimizer/smbo.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 5c46d9c8a..3dad49492 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -8,7 +8,6 @@ import dask.distributed -from smac.facade.smac_ac_facade import SMAC4AC from smac.facade.smac_hpo_facade import SMAC4HPO from smac.intensification.hyperband import Hyperband from smac.intensification.intensification import Intensifier @@ -48,7 +47,7 @@ def get_smac_object( max_budget: Union[int, float], dask_client: Optional[dask.distributed.Client], initial_configurations: Optional[List[Configuration]] = None, -) -> SMAC4AC: +) -> SMAC4HPO: """ This function returns an SMAC object that is gonna be used as optimizer of pipelines @@ -67,7 +66,7 @@ def get_smac_object( configurations which smac will run before starting the search process Returns: - (SMAC4AC): sequential model algorithm configuration object + (SMAC4HPO): sequential model algorithm configuration object """ if initial_budget == max_budget: From 2663ad961af5070195f24267c58434fd1ccb9707 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 12:23:43 +0200 Subject: [PATCH 309/347] revert changes in network --- .../components/setup/network/base_network.py | 13 +++++------- .../setup/network/forecasting_network.py | 20 +++++++++++++++++++ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py index 28fe83902..d8d4c87d0 100644 --- a/autoPyTorch/pipeline/components/setup/network/base_network.py +++ b/autoPyTorch/pipeline/components/setup/network/base_network.py @@ -27,17 +27,13 @@ def __init__( super(NetworkComponent, self).__init__() self.random_state = random_state self.device = None - self.add_fit_requirements(self._required_fit_requirements) - self.network = network - self.final_activation: Optional[torch.nn.Module] = None - - @property - def _required_fit_requirements(self) -> List[FitRequirement]: - return [ + self.add_fit_requirements([ FitRequirement("network_head", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_backbone", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), - 
] + ]) + self.network = network + self.final_activation: Optional[torch.nn.Module] = None def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: """ @@ -50,6 +46,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: Returns: A instance of self """ + # Make sure that input dictionary X has the required # information to fit this stage self.check_requirements(X, y) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 244fbeea1..251553fce 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -30,6 +30,26 @@ def __init__( random_state: Optional[np.random.RandomState] = None, ) -> None: super(ForecastingNetworkComponent, self).__init__(network=network, random_state=random_state) + self._fit_requirements.clear() + self.add_fit_requirements([ + FitRequirement('dataset_properties', (Dict,), user_defined=False, dataset_property=True), + FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), + FitRequirement('network_structure', (Dict,), user_defined=False, dataset_property=False), + FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), + FitRequirement("network_encoder", (Dict,), user_defined=False, + dataset_property=False), + FitRequirement("network_decoder", (Dict,), user_defined=False, + dataset_property=False), + FitRequirement("network_head", (Optional[torch.nn.Module],), user_defined=False, dataset_property=False), + FitRequirement("auto_regressive", (bool,), user_defined=False, dataset_property=False), + FitRequirement("target_scaler", (BaseTargetScaler,), user_defined=False, dataset_property=False), + FitRequirement("net_output_type", (str,), user_defined=False, dataset_property=False), + FitRequirement("feature_names", (Iterable,), user_defined=False, dataset_property=True), + FitRequirement("feature_shapes", (Iterable,), user_defined=False, dataset_property=True), + FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), + FitRequirement('static_features', (tuple,), user_defined=True, dataset_property=True), + FitRequirement('time_feature_names', (Iterable,), user_defined=True, dataset_property=True), + ]) @property def _required_fit_requirements(self) -> List[FitRequirement]: From 58eeb0c39495b88f547040dcd2e136d7e977151c Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 12:34:31 +0200 Subject: [PATCH 310/347] revert changes in trainer --- .../components/training/trainer/__init__.py | 25 +++++++++---------- .../trainer/forecasting_trainer/__init__.py | 22 +++++++++------- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index 4bb8fda5c..89d9e733d 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -66,21 +66,20 @@ def __init__(self, random_state=random_state) self.run_summary: Optional[RunSummary] = None self.writer: Optional[SummaryWriter] = None + self.early_stopping_split_type: Optional[str] = None + self._fit_requirements: Optional[List[FitRequirement]] = [ + FitRequirement("lr_scheduler", (_LRScheduler,), user_defined=False, dataset_property=False), + 
FitRequirement("num_run", (int,), user_defined=False, dataset_property=False), + FitRequirement( + "optimizer", (Optimizer,), user_defined=False, dataset_property=False), + FitRequirement("train_data_loader", + (torch.utils.data.DataLoader,), + user_defined=False, dataset_property=False), + FitRequirement("val_data_loader", + (torch.utils.data.DataLoader,), + user_defined=False, dataset_property=False)] self.checkpoint_dir: Optional[str] = None - @property - def _fit_requirements(self) -> List[FitRequirement]: - return [FitRequirement("lr_scheduler", (_LRScheduler,), user_defined=False, dataset_property=False), - FitRequirement("num_run", (int,), user_defined=False, dataset_property=False), - FitRequirement( - "optimizer", (Optimizer,), user_defined=False, dataset_property=False), - FitRequirement("train_data_loader", - (torch.utils.data.DataLoader,), - user_defined=False, dataset_property=False), - FitRequirement("val_data_loader", - (torch.utils.data.DataLoader,), - user_defined=False, dataset_property=False)] - def get_fit_requirements(self) -> Optional[List[FitRequirement]]: return self._fit_requirements diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index 14c0e0a24..2122111e5 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -1,9 +1,12 @@ import collections import os -from typing import Dict, List +from typing import Dict, Optional + +import numpy as np from autoPyTorch.constants import STRING_TO_TASK_TYPES from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, autoPyTorchComponent, @@ -33,14 +36,15 @@ def add_trainer(trainer: ForecastingBaseTrainerComponent) -> None: class ForecastingTrainerChoice(TrainerChoice): - @property - def _fit_requirements(self) -> List[FitRequirement]: - fit_requirements = super()._fit_requirements - fit_requirements.extend([FitRequirement("target_scaler", (BaseTargetScaler,), - user_defined=False, dataset_property=False), - FitRequirement("window_size", (int,), user_defined=False, dataset_property=False)] - ) - return fit_requirements + def __init__(self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + random_state: Optional[np.random.RandomState] = None + ): + super().__init__(dataset_properties=dataset_properties, random_state=random_state) + self._fit_requirements.extend([FitRequirement("target_scaler", (BaseTargetScaler,), + user_defined=False, dataset_property=False), + FitRequirement("window_size", (int,), user_defined=False, + dataset_property=False)]) def get_budget_tracker(self, X: Dict) -> BudgetTracker: if 'epochs' in X: From b86908fa0756d45d1f847ecd7057c1988c73312f Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 12:40:01 +0200 Subject: [PATCH 311/347] revert format changes --- README.md | 1 - autoPyTorch/api/base_task.py | 243 +++++++++--------- autoPyTorch/constants.py | 1 - autoPyTorch/constants_forecasting.py | 2 +- autoPyTorch/data/tabular_target_validator.py | 32 +-- .../data/time_series_forecasting_validator.py | 24 +- autoPyTorch/datasets/resampling_strategy.py | 2 + autoPyTorch/evaluation/abstract_evaluator.py | 76 +++--- autoPyTorch/evaluation/tae.py | 70 ++--- 
autoPyTorch/evaluation/train_evaluator.py | 58 +++-- autoPyTorch/optimizer/smbo.py | 19 +- autoPyTorch/optimizer/utils.py | 4 +- .../setup/early_preprocessor/utils.py | 1 + .../network_backbone/base_network_backbone.py | 1 + .../LearnedEntityEmbedding.py | 21 +- .../base_network_embedding.py | 3 +- .../components/setup/network_head/__init__.py | 4 +- .../components/training/trainer/__init__.py | 10 +- .../training/trainer/base_trainer.py | 42 ++- 19 files changed, 310 insertions(+), 304 deletions(-) diff --git a/README.md b/README.md index f82910806..9f7ae78ae 100755 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ Copyright (C) 2021 [AutoML Groups Freiburg and Hannover](http://www.automl.org/ While early AutoML frameworks focused on optimizing traditional ML pipelines and their hyperparameters, another trend in AutoML is to focus on neural architecture search. To bring the best of these two worlds together, we developed **Auto-PyTorch**, which jointly and robustly optimizes the network architecture and the training hyperparameters to enable fully automated deep learning (AutoDL). - Auto-PyTorch is mainly developed to support tabular data (classification, regression). The newest features in Auto-PyTorch for tabular data are described in the paper ["Auto-PyTorch Tabular: Multi-Fidelity MetaLearning for Efficient and Robust AutoDL"](https://arxiv.org/abs/2006.13799) (see below for bibtex ref). diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index cba22da38..de0e5cbc6 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -168,25 +168,25 @@ class BaseTask(ABC): """ def __init__( - self, - seed: int = 1, - n_jobs: int = 1, - n_threads: int = 1, - logging_config: Optional[Dict] = None, - ensemble_size: int = 50, - ensemble_nbest: int = 50, - max_models_on_disc: int = 50, - temporary_directory: Optional[str] = None, - output_directory: Optional[str] = None, - delete_tmp_folder_after_terminate: bool = True, - delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict[str, Any]] = None, - exclude_components: Optional[Dict[str, Any]] = None, - backend: Optional[Backend] = None, - resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, - resampling_strategy_args: Optional[Dict[str, Any]] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - task_type: Optional[str] = None + self, + seed: int = 1, + n_jobs: int = 1, + n_threads: int = 1, + logging_config: Optional[Dict] = None, + ensemble_size: int = 50, + ensemble_nbest: int = 50, + max_models_on_disc: int = 50, + temporary_directory: Optional[str] = None, + output_directory: Optional[str] = None, + delete_tmp_folder_after_terminate: bool = True, + delete_output_folder_after_terminate: bool = True, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + backend: Optional[Backend] = None, + resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + task_type: Optional[str] = None ) -> None: if isinstance(resampling_strategy, NoResamplingStrategyTypes) and ensemble_size != 0: @@ -225,7 +225,6 @@ def __init__( self._metrics_kwargs: Dict = {} self._scoring_functions: Optional[List[autoPyTorchMetric]] = None - self._logger: Optional[PicklableClientLogger] = None self.dataset_name: Optional[str] = None 
self.cv_models_: Dict = {} @@ -264,11 +263,11 @@ def __init__( @abstractmethod def build_pipeline( - self, - dataset_properties: Dict[str, BaseDatasetPropertiesType], - include_components: Optional[Dict[str, Any]] = None, - exclude_components: Optional[Dict[str, Any]] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ) -> BasePipeline: """ Build pipeline according to current task @@ -301,15 +300,15 @@ def build_pipeline( @abstractmethod def _get_dataset_input_validator( - self, - X_train: Union[List, pd.DataFrame, np.ndarray], - y_train: Union[List, pd.DataFrame, np.ndarray], - X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - resampling_strategy: Optional[ResamplingStrategies] = None, - resampling_strategy_args: Optional[Dict[str, Any]] = None, - dataset_name: Optional[str] = None, - dataset_compression: Optional[DatasetCompressionSpec] = None, + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + dataset_compression: Optional[DatasetCompressionSpec] = None, ) -> Tuple[BaseDataset, BaseInputValidator]: """ Returns an object of a child class of `BaseDataset` and @@ -347,15 +346,15 @@ def _get_dataset_input_validator( raise NotImplementedError def get_dataset( - self, - X_train: Union[List, pd.DataFrame, np.ndarray], - y_train: Union[List, pd.DataFrame, np.ndarray], - X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - resampling_strategy: Optional[ResamplingStrategies] = None, - resampling_strategy_args: Optional[Dict[str, Any]] = None, - dataset_name: Optional[str] = None, - dataset_compression: Optional[DatasetCompressionSpec] = None, + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + dataset_compression: Optional[DatasetCompressionSpec] = None, ) -> BaseDataset: """ Returns an object of a child class of `BaseDataset` according to the current task. 
@@ -619,9 +618,9 @@ def _close_dask_client(self) -> None: None """ if ( - hasattr(self, '_is_dask_client_internally_created') - and self._is_dask_client_internally_created - and self._dask_client + hasattr(self, '_is_dask_client_internally_created') + and self._is_dask_client_internally_created + and self._dask_client ): self._dask_client.shutdown() self._dask_client.close() @@ -811,7 +810,6 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: # Only launch a task if there is time start_time = time.time() - if time_left >= func_eval_time_limit_secs: self._logger.info(f"{n_r}: Started fitting {classifier} with cutoff={func_eval_time_limit_secs}") scenario_mock = unittest.mock.Mock() @@ -914,26 +912,26 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: return def _search( - self, - optimize_metric: str, - dataset: BaseDataset, - budget_type: str = 'epochs', - min_budget: Union[int, float] = 5, - max_budget: Union[int, float] = 50, - total_walltime_limit: int = 100, - func_eval_time_limit_secs: Optional[int] = None, - enable_traditional_pipeline: bool = True, - memory_limit: Optional[int] = 4096, - smac_scenario_args: Optional[Dict[str, Any]] = None, - get_smac_object_callback: Optional[Callable] = None, - tae_func: Optional[Callable] = None, - all_supported_metrics: bool = True, - precision: int = 32, - disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, - load_models: bool = True, - portfolio_selection: Optional[str] = None, - dask_client: Optional[dask.distributed.Client] = None, - **kwargs: Any + self, + optimize_metric: str, + dataset: BaseDataset, + budget_type: str = 'epochs', + min_budget: Union[int, float] = 5, + max_budget: Union[int, float] = 50, + total_walltime_limit: int = 100, + func_eval_time_limit_secs: Optional[int] = None, + enable_traditional_pipeline: bool = True, + memory_limit: Optional[int] = 4096, + smac_scenario_args: Optional[Dict[str, Any]] = None, + get_smac_object_callback: Optional[Callable] = None, + tae_func: Optional[Callable] = None, + all_supported_metrics: bool = True, + precision: int = 32, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + load_models: bool = True, + portfolio_selection: Optional[str] = None, + dask_client: Optional[dask.distributed.Client] = None, + **kwargs: Any ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. 
@@ -1097,8 +1095,8 @@ def _search( self._all_supported_metrics = all_supported_metrics self._disable_file_output = disable_file_output if disable_file_output is not None else [] if ( - DisableFileOutputParameters.y_optimization in self._disable_file_output - and self.ensemble_size > 1 + DisableFileOutputParameters.y_optimization in self._disable_file_output + and self.ensemble_size > 1 ): self._logger.warning(f"No ensemble will be created when {DisableFileOutputParameters.y_optimization}" f" is in disable_file_output") @@ -1271,7 +1269,6 @@ def _search( max_budget=max_budget, ensemble_callback=proc_ensemble, logger_port=self._logger_port, - # We do not increase the num_run here, this is something # smac does internally start_num_run=self._backend.get_next_num_run(peek=True), @@ -1339,10 +1336,10 @@ def _search( return self def _get_fit_dictionary( - self, - dataset_properties: Dict[str, BaseDatasetPropertiesType], - dataset: BaseDataset, - split_id: int = 0 + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + dataset: BaseDataset, + split_id: int = 0 ) -> Dict[str, Any]: X_test = dataset.test_tensors[0].copy() if dataset.test_tensors is not None else None y_test = dataset.test_tensors[1].copy() if dataset.test_tensors is not None else None @@ -1367,9 +1364,9 @@ def _get_fit_dictionary( return X def refit( - self, - dataset: BaseDataset, - split_id: int = 0 + self, + dataset: BaseDataset, + split_id: int = 0 ) -> "BaseTask": """ Refit all models found with fit to new data. @@ -1435,28 +1432,28 @@ def refit( return self def fit_pipeline( - self, - configuration: Configuration, - *, - dataset: Optional[BaseDataset] = None, - X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - dataset_name: Optional[str] = None, - resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes]] = None, - resampling_strategy_args: Optional[Dict[str, Any]] = None, - run_time_limit_secs: int = 60, - memory_limit: Optional[int] = None, - eval_metric: Optional[str] = None, - all_supported_metrics: bool = False, - budget_type: Optional[str] = None, - include_components: Optional[Dict[str, Any]] = None, - exclude_components: Optional[Dict[str, Any]] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - budget: Optional[float] = None, - pipeline_options: Optional[Dict] = None, - disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + self, + configuration: Configuration, + *, + dataset: Optional[BaseDataset] = None, + X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + dataset_name: Optional[str] = None, + resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes]] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + run_time_limit_secs: int = 60, + memory_limit: Optional[int] = None, + eval_metric: Optional[str] = None, + all_supported_metrics: bool = False, + budget_type: Optional[str] = None, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: 
Optional[HyperparameterSearchSpaceUpdates] = None, + budget: Optional[float] = None, + pipeline_options: Optional[Dict] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, ) -> Tuple[Optional[BasePipeline], RunInfo, RunValue, BaseDataset]: """ Fit a pipeline on the given task for the budget. @@ -1571,8 +1568,8 @@ def fit_pipeline( if dataset is None: if ( - X_train is not None - and y_train is not None + X_train is not None + and y_train is not None ): raise ValueError("No dataset provided, must provide X_train, y_train tensors") dataset = self.get_dataset(X_train=X_train, @@ -1685,12 +1682,12 @@ def fit_pipeline( return fitted_pipeline, run_info, run_value, dataset def _get_fitted_pipeline( - self, - dataset_name: str, - pipeline_idx: int, - run_info: RunInfo, - run_value: RunValue, - disable_file_output: List[Union[str, DisableFileOutputParameters]] + self, + dataset_name: str, + pipeline_idx: int, + run_info: RunInfo, + run_value: RunValue, + disable_file_output: List[Union[str, DisableFileOutputParameters]] ) -> Optional[BasePipeline]: if self._logger is None: @@ -1716,10 +1713,10 @@ def _get_fitted_pipeline( ) def predict( - self, - X_test: np.ndarray, - batch_size: Optional[int] = None, - n_jobs: int = 1 + self, + X_test: np.ndarray, + batch_size: Optional[int] = None, + n_jobs: int = 1 ) -> np.ndarray: """Generate the estimator predictions. Generate the predictions based on the given examples from the test set. @@ -1771,9 +1768,9 @@ def predict( return predictions def score( - self, - y_pred: np.ndarray, - y_test: Union[np.ndarray, pd.DataFrame] + self, + y_pred: np.ndarray, + y_test: Union[np.ndarray, pd.DataFrame] ) -> Dict[str, float]: """Calculate the score on the test set. Calculate the evaluation measure on the test set. @@ -1818,8 +1815,8 @@ def __del__(self) -> None: self._backend.context.delete_directories(force=False) def get_incumbent_results( - self, - include_traditional: bool = False + self, + include_traditional: bool = False ) -> Tuple[Configuration, Dict[str, Union[int, str, float]]]: """ Get Incumbent config and the corresponding results @@ -1920,13 +1917,13 @@ def sprint_statistics(self) -> str: ) def plot_perf_over_time( - self, - metric_name: str, - ax: Optional[plt.Axes] = None, - plot_setting_params: PlotSettingParams = PlotSettingParams(), - color_label_settings: ColorLabelSettings = ColorLabelSettings(), - *args: Any, - **kwargs: Any + self, + metric_name: str, + ax: Optional[plt.Axes] = None, + plot_setting_params: PlotSettingParams = PlotSettingParams(), + color_label_settings: ColorLabelSettings = ColorLabelSettings(), + *args: Any, + **kwargs: Any ) -> None: """ Visualize the performance over time using matplotlib. 
diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py index 4196149b0..4a3df01f7 100644 --- a/autoPyTorch/constants.py +++ b/autoPyTorch/constants.py @@ -11,7 +11,6 @@ TABULAR_TASKS = [TABULAR_CLASSIFICATION, TABULAR_REGRESSION] IMAGE_TASKS = [IMAGE_CLASSIFICATION, IMAGE_REGRESSION] TIMESERIES_TASKS = [TIMESERIES_FORECASTING] - TASK_TYPES = REGRESSION_TASKS + CLASSIFICATION_TASKS + FORECASTING_TASKS TASK_TYPES_TO_STRING = \ diff --git a/autoPyTorch/constants_forecasting.py b/autoPyTorch/constants_forecasting.py index edfc40e11..0b7d03137 100644 --- a/autoPyTorch/constants_forecasting.py +++ b/autoPyTorch/constants_forecasting.py @@ -1,4 +1,4 @@ -# The cosntant values for time series forecasting comes from +# The constant values for time series forecasting comes from # https://github.com/rakshitha123/TSForecasting/blob/master/experiments/deep_learning_experiments.py # seasonality map, maps a frequency value to a number diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py index 5cd66eaa4..e34695e3c 100644 --- a/autoPyTorch/data/tabular_target_validator.py +++ b/autoPyTorch/data/tabular_target_validator.py @@ -17,6 +17,7 @@ from autoPyTorch.data.base_target_validator import BaseTargetValidator, SupportedTargetTypes from autoPyTorch.utils.common import ispandas + ArrayType = Union[np.ndarray, spmatrix] @@ -54,9 +55,9 @@ def _modify_regression_target(y: ArrayType, allow_nan: bool = False) -> ArrayTyp class TabularTargetValidator(BaseTargetValidator): def _fit( - self, - y_train: SupportedTargetTypes, - y_test: Optional[SupportedTargetTypes] = None, + self, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ If dealing with classification, this utility encodes the targets. @@ -93,10 +94,10 @@ def _fit( unknown_value=-1) else: # We should not reach this if statement as we check for type of targets before - raise ValueError("Multi-dimensional classification is not yet supported. " - "Encoding multidimensional data converts multiple columns " - "to a 1 dimensional encoding. Data involved = {}/{}".format(np.shape(y_train), - self.type_of_target) + raise ValueError(f"Multi-dimensional classification is not yet supported. " + f"Encoding multidimensional data converts multiple columns " + f"to a 1 dimensional encoding. 
Data involved = " + f"{np.shape(y_train)}/{self.type_of_target}" ) # Mypy redefinition @@ -120,8 +121,8 @@ def _fit( if is_numeric_dtype(y_train.dtype): self.dtype = y_train.dtype elif ( - hasattr(y_train, 'dtypes') - and is_numeric_dtype(cast(pd.DataFrame, y_train).dtypes[0]) + hasattr(y_train, 'dtypes') + and is_numeric_dtype(cast(pd.DataFrame, y_train).dtypes[0]) ): # This case is for pandas array with a single column y_train = cast(pd.DataFrame, y_train) @@ -224,12 +225,13 @@ def _check_data(self, y: SupportedTargetTypes) -> None: y (SupportedTargetTypes): A set of features whose dimensionality and data type is going to be checked """ + if not isinstance(y, (np.ndarray, pd.DataFrame, List, pd.Series)) \ and not issparse(y): # type: ignore[misc] - raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames," - " pd.Series, sparse data and Python Lists as targets, yet, " - "the provided input is of type {}".format(type(y)) + raise ValueError(f"AutoPyTorch only supports Numpy arrays, Pandas DataFrames," + f" pd.Series, sparse data and Python Lists as targets, yet, " + f"the provided input is of type {type(y)}" ) # Sparse data muss be numerical @@ -296,7 +298,7 @@ def _check_data(self, y: SupportedTargetTypes) -> None: # should filter out unsupported types. ) if self.type_of_target not in supported_output_types: - raise ValueError("Provided targets are not supported by AutoPyTorch. " - "Provided type is {} whereas supported types are {}.".format(self.type_of_target, - supported_output_types) + raise ValueError(f"Provided targets are not supported by AutoPyTorch. " + f"Provided type is {self.type_of_target} " + f"whereas supported types are {supported_output_types}." ) diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index ca6df2eef..1ffd2c610 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -57,15 +57,19 @@ def fit( # type: ignore[override] """ fit the validator with the training data, (optionally) start times and other information Args: - X_train (Optional[Union[List, pd.DataFrame]]): training features, could be None for "pure" forecasting tasks - y_train (Union[List, pd.DataFrame]), training targets - series_idx (Optional[Union[List[Union[str, int]], str, int]]): which columns of features are applied to - identify the series - X_test (Optional[Union[List, pd.DataFrame]]): test features. For forecasting tasks, test features indicates - known future features after the forecasting timestep\ - y_test (Optional[Union[List, pd.DataFrame]]): target in the future - start_times (Optional[List[pd.DatetimeIndex]]): start times on which the first element of each series is - sampled + X_train (Optional[Union[List, pd.DataFrame]]): + training features, could be None for uni-variant forecasting tasks + y_train (Union[List, pd.DataFrame]), + training targets + series_idx (Optional[Union[List[Union[str, int]], str, int]]) + which columns of features are applied to identify the series + X_test (Optional[Union[List, pd.DataFrame]]): + test features. 
For forecasting tasks, test features indicates known future features + after the forecasting timestep + y_test (Optional[Union[List, pd.DataFrame]]): + target in the future + start_times (Optional[List[pd.DatetimeIndex]]): + start times on which the first element of each series is sampled """ if series_idx is not None and not isinstance(series_idx, Iterable): @@ -329,7 +333,7 @@ def join_series( X: List[Union[pd.DataFrame, np.ndarray]], return_seq_lengths: bool = False ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, List[int]]]: """ - join the series into one single value + join the series into one single item """ num_sequences = len(X) sequence_lengths = [0] * num_sequences diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index ef073827f..4f373bf24 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -194,6 +194,7 @@ def time_series_hold_out_validation(random_state: np.random.RandomState, @classmethod def get_holdout_validators(cls, *holdout_val_types: HoldoutValTypes) -> Dict[str, HoldOutFunc]: + holdout_validators = { holdout_val_type.name: getattr(cls, holdout_val_type.name) for holdout_val_type in holdout_val_types @@ -228,6 +229,7 @@ def stratified_k_fold_cross_validation(random_state: np.random.RandomState, indices: np.ndarray, **kwargs: Any ) -> List[Tuple[np.ndarray, np.ndarray]]: + shuffle = kwargs.get('shuffle', True) cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state if not shuffle else None) diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index aeb48d5b3..cf90fa338 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -23,27 +23,36 @@ import autoPyTorch.pipeline.traditional_tabular_classification import autoPyTorch.pipeline.traditional_tabular_regression from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.constants import (CLASSIFICATION_TASKS, FORECASTING_TASKS, - IMAGE_TASKS, MULTICLASS, REGRESSION_TASKS, - STRING_TO_OUTPUT_TYPES, - STRING_TO_TASK_TYPES, TABULAR_TASKS) +from autoPyTorch.constants import ( + CLASSIFICATION_TASKS, + FORECASTING_TASKS, + IMAGE_TASKS, + MULTICLASS, + REGRESSION_TASKS, + STRING_TO_OUTPUT_TYPES, + STRING_TO_TASK_TYPES, + TABULAR_TASKS +) from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE -from autoPyTorch.datasets.base_dataset import (BaseDataset, - BaseDatasetPropertiesType) +from autoPyTorch.datasets.base_dataset import ( + BaseDataset, + BaseDatasetPropertiesType +) from autoPyTorch.datasets.time_series_dataset import TimeSeriesSequence from autoPyTorch.evaluation.utils import ( - DisableFileOutputParameters, VotingRegressorWrapper, - convert_multioutput_multiclass_to_multilabel) + DisableFileOutputParameters, + VotingRegressorWrapper, + convert_multioutput_multiclass_to_multilabel +) from autoPyTorch.pipeline.base_pipeline import BasePipeline -from autoPyTorch.pipeline.components.training.metrics.base import \ - autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.components.training.metrics.utils import ( - calculate_loss, get_metrics) + calculate_loss, + get_metrics +) from autoPyTorch.utils.common import dict_repr, subsampler -from autoPyTorch.utils.hyperparameter_search_space_update import \ - HyperparameterSearchSpaceUpdates -from autoPyTorch.utils.logging_ 
import (PicklableClientLogger, - get_named_client_logger) +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates +from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger from autoPyTorch.utils.pipeline import get_dataset_requirements __all__ = [ @@ -141,7 +150,6 @@ class MyTraditionalTabularRegressionPipeline(BaseEstimator): An optional dictionary that is passed to the pipeline's steps. It complies a similar function as the kwargs """ - def __init__(self, config: str, dataset_properties: Dict[str, Any], random_state: Optional[np.random.RandomState] = None, @@ -185,7 +193,7 @@ def get_pipeline_representation(self) -> Dict[str, str]: @staticmethod def get_default_pipeline_options() -> Dict[str, Any]: - return autoPyTorch.pipeline.traditional_tabular_regression. \ + return autoPyTorch.pipeline.traditional_tabular_regression.\ TraditionalTabularRegressionPipeline.get_default_pipeline_options() @@ -448,7 +456,6 @@ class AbstractEvaluator(object): search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): An object used to fine tune the hyperparameter search space of the pipeline """ - def __init__(self, backend: Backend, queue: Queue, metric: autoPyTorchMetric, @@ -465,7 +472,7 @@ def __init__(self, backend: Backend, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ) -> None: self.starttime = time.time() @@ -494,7 +501,6 @@ def __init__(self, backend: Backend, self.disable_file_output = disable_file_output self.pipeline_class: Optional[Union[BaseEstimator, BasePipeline]] = None - if self.task_type in REGRESSION_TASKS: if isinstance(self.configuration, int): self.pipeline_class = DummyRegressionPipeline @@ -572,7 +578,7 @@ def __init__(self, backend: Backend, self.logger.debug("Search space updates :{}".format(self.search_space_updates)) def _init_datamanager_info( - self, + self, ) -> None: """ Initialises instance attributes that come from the datamanager. 
@@ -619,10 +625,10 @@ def _init_datamanager_info( del datamanager def _init_fit_dictionary( - self, - logger_port: int, - pipeline_config: Dict[str, Any], - metrics_dict: Optional[Dict[str, List[str]]] = None, + self, + logger_port: int, + pipeline_config: Dict[str, Any], + metrics_dict: Optional[Dict[str, List[str]]] = None, ) -> None: """ Initialises the fit dictionary @@ -680,7 +686,7 @@ def _init_fit_dictionary( self.fit_dictionary.pop('runtime', None) else: raise ValueError(f"budget type must be `epochs` or `runtime` or {FORECASTING_BUDGET_TYPE} " - f"(Only used in forecasting taskss), but got {self.budget_type}") + f"(Only used by forecasting taskss), but got {self.budget_type}") def _get_pipeline(self) -> BaseEstimator: """ @@ -837,10 +843,10 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], return None def calculate_auxiliary_losses( - self, - Y_valid_pred: np.ndarray, - Y_test_pred: np.ndarray, - **metric_kwargs: Any + self, + Y_valid_pred: np.ndarray, + Y_test_pred: np.ndarray, + **metric_kwargs: Any ) -> Tuple[Optional[Dict[str, float]], Optional[Dict[str, float]]]: """ A helper function to calculate the performance estimate of the @@ -877,10 +883,10 @@ def calculate_auxiliary_losses( return validation_loss_dict, test_loss_dict def file_output( - self, - Y_optimization_pred: np.ndarray, - Y_valid_pred: np.ndarray, - Y_test_pred: np.ndarray + self, + Y_optimization_pred: np.ndarray, + Y_valid_pred: np.ndarray, + Y_test_pred: np.ndarray ) -> Tuple[Optional[float], Dict]: """ This method decides what file outputs are written to disk. @@ -1015,7 +1021,6 @@ def _predict_proba(self, X: np.ndarray, pipeline: BaseEstimator, (np.ndarray): The predictions of pipeline for the given features X """ - @no_type_check def send_warnings_to_log(message, category, filename, lineno, file=None, line=None): @@ -1050,7 +1055,6 @@ def _predict_regression(self, X: np.ndarray, pipeline: BaseEstimator, (np.ndarray): The predictions of pipeline for the given features X """ - @no_type_check def send_warnings_to_log(message, category, filename, lineno, file=None, line=None): diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 6c233f857..f9fcf3f42 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -105,31 +105,32 @@ class ExecuteTaFuncWithQueue(AbstractTAFunc): """ def __init__( - self, - backend: Backend, - seed: int, - metric: autoPyTorchMetric, - cost_for_crash: float, - abort_on_first_run_crash: bool, - pynisher_context: str, - multi_objectives: List[str], - pipeline_config: Optional[Dict[str, Any]] = None, - initial_num_run: int = 1, - stats: Optional[Stats] = None, - run_obj: str = 'quality', - par_factor: int = 1, - output_y_hat_optimization: bool = True, - include: Optional[Dict[str, Any]] = None, - exclude: Optional[Dict[str, Any]] = None, - memory_limit: Optional[int] = None, - disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, - init_params: Dict[str, Any] = None, - budget_type: str = None, - ta: Optional[Callable] = None, - logger_port: int = None, - all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - **eval_func_kwargs: Any): + self, + backend: Backend, + seed: int, + metric: autoPyTorchMetric, + cost_for_crash: float, + abort_on_first_run_crash: bool, + pynisher_context: str, + multi_objectives: List[str], + pipeline_config: Optional[Dict[str, Any]] = None, + initial_num_run: int = 1, + stats: Optional[Stats] = None, 
+ run_obj: str = 'quality', + par_factor: int = 1, + output_y_hat_optimization: bool = True, + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + memory_limit: Optional[int] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + init_params: Dict[str, Any] = None, + budget_type: str = None, + ta: Optional[Callable] = None, + logger_port: int = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + **eval_func_kwargs: Any + ): self.backend = backend @@ -224,8 +225,8 @@ def _check_and_get_default_budget(self) -> float: return budget_choices[budget_type] def run_wrapper( - self, - run_info: RunInfo, + self, + run_info: RunInfo, ) -> Tuple[RunInfo, RunValue]: """ wrapper function for ExecuteTARun.run_wrapper() to cap the target algorithm @@ -240,7 +241,6 @@ def run_wrapper( RunValue: Contains information about the status/performance of config """ - # SMAC returns non-zero budget for intensification # In other words, SMAC returns budget=0 for a simple intensifier (i.e. no intensification) is_intensified = (run_info.budget != 0) @@ -284,13 +284,13 @@ def run_wrapper( return run_info, run_value def run( - self, - config: Configuration, - instance: Optional[str] = None, - cutoff: Optional[float] = None, - seed: int = 12345, - budget: float = 0.0, - instance_specific: Optional[str] = None, + self, + config: Configuration, + instance: Optional[str] = None, + cutoff: Optional[float] = None, + seed: int = 12345, + budget: float = 0.0, + instance_specific: Optional[str] = None, ) -> Tuple[StatusType, float, float, Dict[str, Any]]: context = multiprocessing.get_context(self.pynisher_context) diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index a7bb9af86..e761cc77b 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -10,17 +10,19 @@ from smac.tae import StatusType from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.constants import CLASSIFICATION_TASKS, MULTICLASSMULTIOUTPUT -from autoPyTorch.datasets.resampling_strategy import (CrossValTypes, - HoldoutValTypes) +from autoPyTorch.constants import ( + CLASSIFICATION_TASKS, + MULTICLASSMULTIOUTPUT +) +from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes from autoPyTorch.evaluation.abstract_evaluator import ( - AbstractEvaluator, fit_and_suppress_warnings) + AbstractEvaluator, + fit_and_suppress_warnings +) from autoPyTorch.evaluation.utils import DisableFileOutputParameters -from autoPyTorch.pipeline.components.training.metrics.base import \ - autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.common import dict_repr, subsampler -from autoPyTorch.utils.hyperparameter_search_space_update import \ - HyperparameterSearchSpaceUpdates +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates __all__ = ['TrainEvaluator', 'eval_train_function'] @@ -407,26 +409,26 @@ def _predict(self, pipeline: BaseEstimator, # create closure for evaluating an algorithm def eval_train_function( - backend: Backend, - queue: Queue, - metric: autoPyTorchMetric, - budget: float, - config: Optional[Configuration], - seed: int, - output_y_hat_optimization: bool, - num_run: int, - include: Optional[Dict[str, Any]], - exclude: Optional[Dict[str, Any]], - 
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, - pipeline_config: Optional[Dict[str, Any]] = None, - budget_type: str = None, - init_params: Optional[Dict[str, Any]] = None, - logger_port: Optional[int] = None, - all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - instance: str = None, - evaluator_class: Type[TrainEvaluator] = TrainEvaluator, - **evaluator_kwargs: Any, + backend: Backend, + queue: Queue, + metric: autoPyTorchMetric, + budget: float, + config: Optional[Configuration], + seed: int, + output_y_hat_optimization: bool, + num_run: int, + include: Optional[Dict[str, Any]], + exclude: Optional[Dict[str, Any]], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + pipeline_config: Optional[Dict[str, Any]] = None, + budget_type: str = None, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + instance: str = None, + evaluator_class: Type[TrainEvaluator] = TrainEvaluator, + **evaluator_kwargs: Any, ) -> None: """ This closure allows the communication between the ExecuteTaFuncWithQueue and the diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 3dad49492..3a982132b 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -38,15 +38,15 @@ def get_smac_object( - scenario_dict: Dict[str, Any], - seed: int, - ta: Callable, - ta_kwargs: Dict[str, Any], - n_jobs: int, - initial_budget: Union[int, float], - max_budget: Union[int, float], - dask_client: Optional[dask.distributed.Client], - initial_configurations: Optional[List[Configuration]] = None, + scenario_dict: Dict[str, Any], + seed: int, + ta: Callable, + ta_kwargs: Dict[str, Any], + n_jobs: int, + initial_budget: Union[int, float], + max_budget: Union[int, float], + dask_client: Optional[dask.distributed.Client], + initial_configurations: Optional[List[Configuration]] = None, ) -> SMAC4HPO: """ This function returns an SMAC object that is gonna be used as @@ -341,7 +341,6 @@ def run_smbo(self, func: Optional[Callable] = None pynisher_context=self.pynisher_context, evaluator_class=TimeSeriesForecastingTrainEvaluator if self.time_series_forecasting else None, ) - ta = ExecuteTaFuncWithQueue self.logger.info("Finish creating Target Algorithm (TA) function") diff --git a/autoPyTorch/optimizer/utils.py b/autoPyTorch/optimizer/utils.py index aa1290234..5f12b0dd1 100644 --- a/autoPyTorch/optimizer/utils.py +++ b/autoPyTorch/optimizer/utils.py @@ -7,8 +7,8 @@ def read_return_initial_configurations( - config_space: ConfigurationSpace, - portfolio_selection: str + config_space: ConfigurationSpace, + portfolio_selection: str ) -> List[Configuration]: # read and validate initial configurations portfolio_path = portfolio_selection if portfolio_selection != "greedy" else \ diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py index a47270c7d..5cd4941a7 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py @@ -28,6 +28,7 @@ def get_preprocess_transforms(X: Dict[str, Any], def preprocess(dataset: np.ndarray, transforms: torchvision.transforms.Compose, indices: List[int] = None) -> np.ndarray: + composite_transforms = 
torchvision.transforms.Compose(transforms) if indices is None: dataset = composite_transforms(dataset) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index 6145ca91a..7ff914a98 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -42,6 +42,7 @@ def __init__(self, def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: """ Builds the backbone component and assigns it to self.backbone + Args: X (X: Dict[str, Any]): Dependencies needed by current component to perform fit y (Any): not used. To comply with sklearn API diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index 4a915d4f7..be41c2463 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -48,7 +48,6 @@ def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_n self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in zip(self.num_output_dimensions, self.embed_features, self.num_input_features)] - self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) self.ee_layers = self._create_ee_layers() @@ -159,16 +158,16 @@ def build_embedding(self, @staticmethod def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - min_unique_values_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter="min_unique_values_for_embedding", - value_range=(3, 7), - default_value=5, - log=True), - dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter="dimension_reduction", - value_range=(0, 1), - default_value=0.5), + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + min_unique_values_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="min_unique_values_for_embedding", + value_range=(3, 7), + default_value=5, + log=True), + dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="dimension_reduction", + value_range=(0, 1), + default_value=0.5), ) -> ConfigurationSpace: cs = ConfigurationSpace() add_hyperparameter(cs, min_unique_values_for_embedding, UniformIntegerHyperparameter) diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 36b8204b7..1ff5df13e 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -23,7 +23,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.embedding, num_output_features = self.build_embedding( num_input_features=num_input_features, - num_numerical_features=num_numerical_columns, + num_numerical_features=num_numerical_columns ) if "feature_shapes" in X['dataset_properties']: if num_output_features is not None: @@ -55,6 +55,7 @@ def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: num_numerical_columns = 0 else: X_train = 
copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) + if 'tabular_transformer' in X: numerical_column_transformer = X['tabular_transformer'].preprocessor. \ named_transformers_['numerical_pipeline'] diff --git a/autoPyTorch/pipeline/components/setup/network_head/__init__.py b/autoPyTorch/pipeline/components/setup/network_head/__init__.py index f2c15aa2d..ac52cf1c9 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_head/__init__.py @@ -16,7 +16,6 @@ NetworkHeadComponent, ) - directory = os.path.split(__file__)[0] _heads = find_components(__package__, directory, @@ -43,7 +42,6 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: components = OrderedDict() components.update(_heads) components.update(_addons.components) - return components def get_available_components( @@ -112,6 +110,7 @@ def get_available_components( # is not recommended for a certain dataset components_dict[name] = entry + return components_dict def get_hyperparameter_search_space( @@ -156,7 +155,6 @@ def get_hyperparameter_search_space( if default_ in available_heads: default = default_ break - updates = self._get_search_space_updates() if '__choice__' in updates.keys(): choice_hyperparameter = updates['__choice__'] diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index 89d9e733d..3134db201 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -99,11 +99,11 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: return components def get_hyperparameter_search_space( - self, - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, ) -> ConfigurationSpace: """Returns the configuration space of the current chosen components diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py index 04adce688..0dba1e869 100644 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py @@ -63,10 +63,10 @@ def is_max_time_reached(self) -> bool: class RunSummary(object): def __init__( - self, - total_parameter_count: float, - trainable_parameter_count: float, - optimize_metric: Optional[str] = None, + self, + total_parameter_count: float, + trainable_parameter_count: float, + optimize_metric: Optional[str] = None, ): """ A useful object to track performance per epoch. 
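Based only on the constructor and `get_best_epoch` signatures visible in these hunks, RunSummary is driven roughly as sketched below (a hedged sketch: the argument values are illustrative and the per-epoch recording step is omitted):

# Hedged usage sketch of RunSummary; values are illustrative.
run_summary = RunSummary(
    total_parameter_count=1_200_000,
    trainable_parameter_count=1_000_000,
    optimize_metric='accuracy',
)
# ... the trainer records per-epoch losses and metrics into the summary here ...
best_epoch = run_summary.get_best_epoch(split_type='val')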
@@ -126,7 +126,6 @@ def get_best_epoch(self, split_type: str = 'val') -> int: # If we compute for optimization, prefer the performance # metric to the loss if self.optimize_metric is not None: - metrics_type = f"{split_type}_metrics" if self.optimize_metric in CLASSIFICATION_METRICS: scorer = CLASSIFICATION_METRICS[self.optimize_metric] @@ -212,19 +211,19 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None self.weighted_loss: bool = False def prepare( - self, - metrics: List[Any], - model: torch.nn.Module, - criterion: Type[torch.nn.Module], - budget_tracker: BudgetTracker, - optimizer: Optimizer, - device: torch.device, - metrics_during_training: bool, - scheduler: _LRScheduler, - task_type: int, - labels: Union[np.ndarray, torch.Tensor, pd.DataFrame], - step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch, - **kwargs: Dict + self, + metrics: List[Any], + model: torch.nn.Module, + criterion: Type[torch.nn.Module], + budget_tracker: BudgetTracker, + optimizer: Optimizer, + device: torch.device, + metrics_during_training: bool, + scheduler: _LRScheduler, + task_type: int, + labels: Union[np.ndarray, torch.Tensor, pd.DataFrame], + step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch, + **kwargs: Dict ) -> None: # Save the device to be used @@ -277,9 +276,9 @@ def on_epoch_end(self, X: Dict[str, Any], epoch: int) -> bool: return False def _scheduler_step( - self, - step_interval: StepIntervalUnit, - loss: Optional[float] = None + self, + step_interval: StepIntervalUnit, + loss: Optional[float] = None ) -> None: if self.step_interval != step_interval: @@ -371,7 +370,6 @@ def train_step(self, data: torch.Tensor, targets: torch.Tensor) -> Tuple[float, """ # prepare data = data.float().to(self.device) - targets = self.cast_targets(targets) data, criterion_kwargs = self.data_preparation(data, targets) From 68d8a25bfb438d365b3e8375b751cc0968d16b10 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 12:58:09 +0200 Subject: [PATCH 312/347] move constant_forecasting to constatn --- autoPyTorch/api/base_task.py | 2 +- autoPyTorch/api/time_series_forecasting.py | 3 +-- autoPyTorch/constants.py | 22 +++++++++++++++++++ autoPyTorch/constants_forecasting.py | 19 ---------------- autoPyTorch/datasets/time_series_dataset.py | 20 ++++++++--------- autoPyTorch/evaluation/abstract_evaluator.py | 2 +- autoPyTorch/evaluation/tae.py | 2 +- ...time_series_forecasting_train_evaluator.py | 14 +++++------- autoPyTorch/optimizer/smbo.py | 2 +- .../trainer/forecasting_trainer/__init__.py | 3 +-- .../training/test_forecasting_training.py | 2 +- 11 files changed, 44 insertions(+), 47 deletions(-) delete mode 100644 autoPyTorch/constants_forecasting.py diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index de0e5cbc6..a5500c3ae 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -34,12 +34,12 @@ from autoPyTorch import metrics from autoPyTorch.automl_common.common.utils.backend import Backend, create from autoPyTorch.constants import ( + FORECASTING_BUDGET_TYPE, FORECASTING_TASKS, REGRESSION_TASKS, STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, ) -from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE from autoPyTorch.data.base_validator import BaseInputValidator from autoPyTorch.data.utils import DatasetCompressionSpec from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType diff --git a/autoPyTorch/api/time_series_forecasting.py 
b/autoPyTorch/api/time_series_forecasting.py index 4fab5bb2a..8b41748a6 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -6,8 +6,7 @@ from autoPyTorch.api.base_task import BaseTask from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.constants import TASK_TYPES_TO_STRING, TIMESERIES_FORECASTING -from autoPyTorch.constants_forecasting import MAX_WINDOW_SIZE_BASE +from autoPyTorch.constants import MAX_WINDOW_SIZE_BASE, TASK_TYPES_TO_STRING, TIMESERIES_FORECASTING from autoPyTorch.data.time_series_forecasting_validator import \ TimeSeriesForecastingInputValidator from autoPyTorch.data.utils import (DatasetCompressionSpec, diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py index 4a3df01f7..d7bb38b45 100644 --- a/autoPyTorch/constants.py +++ b/autoPyTorch/constants.py @@ -53,3 +53,25 @@ CLASSIFICATION_OUTPUTS = [BINARY, MULTICLASS, MULTICLASSMULTIOUTPUT] REGRESSION_OUTPUTS = [CONTINUOUS, CONTINUOUSMULTIOUTPUT] + +# Constants for Forecasting Tasks + +# The constant values for time series forecasting comes from +# https://github.com/rakshitha123/TSForecasting/blob/master/experiments/deep_learning_experiments.py +# seasonality map, maps a frequency value to a number +FORECASTING_BUDGET_TYPE = ('resolution', 'num_seq', 'num_sample_per_seq') + +SEASONALITY_MAP = { + "1min": [1440, 10080, 525960], + "10min": [144, 1008, 52596], + "30min": [48, 336, 17532], + "1H": [24, 168, 8766], + "1D": 7, + "1W": 365.25 / 7, + "1M": 12, + "1Q": 4, + "1Y": 1 +} + +# To avoid that we get a sequence that is too long to be fed to a network +MAX_WINDOW_SIZE_BASE = 500 diff --git a/autoPyTorch/constants_forecasting.py b/autoPyTorch/constants_forecasting.py deleted file mode 100644 index 0b7d03137..000000000 --- a/autoPyTorch/constants_forecasting.py +++ /dev/null @@ -1,19 +0,0 @@ -# The constant values for time series forecasting comes from -# https://github.com/rakshitha123/TSForecasting/blob/master/experiments/deep_learning_experiments.py -# seasonality map, maps a frequency value to a number - -FORECASTING_BUDGET_TYPE = ('resolution', 'num_seq', 'num_sample_per_seq') - -SEASONALITY_MAP = { - "1min": [1440, 10080, 525960], - "10min": [144, 1008, 52596], - "30min": [48, 336, 17532], - "1H": [24, 168, 8766], - "1D": 7, - "1W": 365.25 / 7, - "1M": 12, - "1Q": 4, - "1Y": 1 -} - -MAX_WINDOW_SIZE_BASE = 500 diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index d894dbba3..ce0b4ae95 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -21,14 +21,15 @@ import torchvision.transforms -from autoPyTorch.constants import (CLASSIFICATION_OUTPUTS, - STRING_TO_OUTPUT_TYPES, - TASK_TYPES_TO_STRING, - TIMESERIES_FORECASTING) -from autoPyTorch.constants_forecasting import (MAX_WINDOW_SIZE_BASE, - SEASONALITY_MAP) -from autoPyTorch.data.time_series_forecasting_validator import \ - TimeSeriesForecastingInputValidator +from autoPyTorch.constants import ( + CLASSIFICATION_OUTPUTS, + MAX_WINDOW_SIZE_BASE, + SEASONALITY_MAP, + STRING_TO_OUTPUT_TYPES, + TASK_TYPES_TO_STRING, + TIMESERIES_FORECASTING +) +from autoPyTorch.data.time_series_forecasting_validator import TimeSeriesForecastingInputValidator from autoPyTorch.datasets.base_dataset import BaseDataset, type_of_target from autoPyTorch.datasets.resampling_strategy import ( CrossValFuncs, @@ -38,8 +39,7 @@ HoldoutValTypes, NoResamplingStrategyTypes, ) -from 
autoPyTorch.pipeline.components.training.metrics.metrics import \ - compute_mase_coefficient +from autoPyTorch.pipeline.components.training.metrics.metrics import compute_mase_coefficient from autoPyTorch.utils.common import FitRequirement TIME_SERIES_FORECASTING_INPUT = Tuple[np.ndarray, np.ndarray] # currently only numpy arrays are supported diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index cf90fa338..8e59bcb9a 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -25,6 +25,7 @@ from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import ( CLASSIFICATION_TASKS, + FORECASTING_BUDGET_TYPE, FORECASTING_TASKS, IMAGE_TASKS, MULTICLASS, @@ -33,7 +34,6 @@ STRING_TO_TASK_TYPES, TABULAR_TASKS ) -from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE from autoPyTorch.datasets.base_dataset import ( BaseDataset, BaseDatasetPropertiesType diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index f9fcf3f42..e65d2e80a 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -23,7 +23,7 @@ from smac.tae.execute_func import AbstractTAFunc from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE +from autoPyTorch.constants import FORECASTING_BUDGET_TYPE from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index e41e69fc6..a319afb60 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -12,17 +12,13 @@ from smac.tae import StatusType from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.constants_forecasting import SEASONALITY_MAP -from autoPyTorch.evaluation.abstract_evaluator import \ - DummyTimeSeriesForecastingPipeline +from autoPyTorch.constants import SEASONALITY_MAP +from autoPyTorch.evaluation.abstract_evaluator import DummyTimeSeriesForecastingPipeline from autoPyTorch.evaluation.train_evaluator import TrainEvaluator from autoPyTorch.evaluation.utils import DisableFileOutputParameters -from autoPyTorch.pipeline.components.training.metrics.base import \ - autoPyTorchMetric -from autoPyTorch.pipeline.components.training.metrics.metrics import \ - MASE_LOSSES -from autoPyTorch.utils.hyperparameter_search_space_update import \ - HyperparameterSearchSpaceUpdates +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.pipeline.components.training.metrics.metrics import MASE_LOSSES +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates class TimeSeriesForecastingTrainEvaluator(TrainEvaluator): diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 3a982132b..51f417d01 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -19,7 +19,7 @@ from smac.utils.io.traj_logging import TrajEntry from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE +from autoPyTorch.constants import FORECASTING_BUDGET_TYPE from autoPyTorch.datasets.base_dataset import BaseDataset from 
autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index 2122111e5..b4e9cb34e 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -4,8 +4,7 @@ import numpy as np -from autoPyTorch.constants import STRING_TO_TASK_TYPES -from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE +from autoPyTorch.constants import FORECASTING_BUDGET_TYPE, STRING_TO_TASK_TYPES from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import ( ThirdPartyComponents, diff --git a/test/test_pipeline/components/training/test_forecasting_training.py b/test/test_pipeline/components/training/test_forecasting_training.py index 4c5f21517..3780ea206 100644 --- a/test/test_pipeline/components/training/test_forecasting_training.py +++ b/test/test_pipeline/components/training/test_forecasting_training.py @@ -1,6 +1,6 @@ import unittest -from autoPyTorch.constants_forecasting import FORECASTING_BUDGET_TYPE +from autoPyTorch.constants import FORECASTING_BUDGET_TYPE from autoPyTorch.pipeline.components.training.trainer.forecasting_trainer import ForecastingTrainerChoice From dac5cdd49658c3ced7a2a1611a46f04ba4c8ddcb Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 13:56:25 +0200 Subject: [PATCH 313/347] additional annotate for base pipeline --- autoPyTorch/pipeline/base_pipeline.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index b8bcfade1..5c580dbd6 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -413,6 +413,8 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], if include is not None and update.node_name in include.keys(): if split_hyperparameter[0] not in include[update.node_name]: hp_in_component = False + # If the node contains subcomponent that is also an instance of autoPyTorchChoice, + # We need to ensure that include is properly passed to it subcomponent for include_component in include[update.node_name]: if include_component.startswith(split_hyperparameter[0]): hp_in_component = True @@ -454,6 +456,12 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], elif split_hyperparameter[0] not in components.keys(): hp_in_component = False if hasattr(node, 'additional_components') and node.additional_components: + # This is designed for forecasting network encoder: + # forecasting network backbone is composed of two parts: encoder and decoder whereas the type + # of the decoder is determined by the encoder. However, the type of decoder cannot be any part + # of encoder's choice. 
To allow the user to update the hyperparameter search space for decoder + # network, we consider decoder as "additional_components" and check if the update can be applied + # to node.additional_components for component_func in node.additional_components: if split_hyperparameter[0] in component_func().keys(): hp_in_component = True From 7f2d394ad50b93407fde8f10590618e35e387964 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 15:46:03 +0200 Subject: [PATCH 314/347] move forecasting check to tae --- autoPyTorch/api/base_task.py | 24 ++++++++---------- autoPyTorch/api/time_series_forecasting.py | 1 - autoPyTorch/evaluation/tae.py | 29 ++++++++++++++++------ autoPyTorch/optimizer/smbo.py | 25 +++++++++---------- test/test_evaluation/test_evaluation.py | 20 ++++++++++++--- 5 files changed, 59 insertions(+), 40 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index a5500c3ae..d732ffbe7 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -39,6 +39,7 @@ REGRESSION_TASKS, STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES, + TIMESERIES_FORECASTING, ) from autoPyTorch.data.base_validator import BaseInputValidator from autoPyTorch.data.utils import DatasetCompressionSpec @@ -53,7 +54,6 @@ from autoPyTorch.ensemble.singlebest_ensemble import SingleBest from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash -from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.optimizer.smbo import AutoMLSMBO from autoPyTorch.pipeline.base_pipeline import BasePipeline @@ -81,7 +81,7 @@ def _pipeline_predict(pipeline: BasePipeline, batch_size: int, logger: PicklableClientLogger, task: int, - forecasting_task: bool = False) -> np.ndarray: + task_type: str = "") -> np.ndarray: @typing.no_type_check def send_warnings_to_log( message, category, filename, lineno, file=None, line=None): @@ -105,7 +105,7 @@ def send_warnings_to_log( prediction, np.sum(prediction, axis=1) )) - if not forecasting_task: + if STRING_TO_TASK_TYPES.get(task_type, -1) != TIMESERIES_FORECASTING: if len(prediction.shape) < 1 or len(X_.shape) < 1 or \ X_.shape[0] < 1 or prediction.shape[0] != X_.shape[0]: logger.warning( @@ -259,8 +259,6 @@ def __init__( raise ValueError("Expected search space updates to be of instance" " HyperparameterSearchSpaceUpdates got {}".format(type(self.search_space_updates))) - self.time_series_forecasting = task_type == 'time_series_forecasting' - @abstractmethod def build_pipeline( self, @@ -746,7 +744,6 @@ def _do_dummy_prediction(self) -> None: memory_limit=memory_limit, disable_file_output=self._disable_file_output, all_supported_metrics=self._all_supported_metrics, - evaluator_class=TimeSeriesForecastingTrainEvaluator if self.time_series_forecasting else None, ) status, _, _, additional_info = ta.run(num_run, cutoff=self._time_for_task) @@ -832,7 +829,6 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: memory_limit=memory_limit, disable_file_output=self._disable_file_output, all_supported_metrics=self._all_supported_metrics, - evaluator_class=TimeSeriesForecastingTrainEvaluator if self.time_series_forecasting else None, ) dask_futures.append([ classifier, @@ -1125,8 +1121,10 @@ def _search( self.search_space = self.get_search_space(dataset) # Incorporate budget to 
pipeline config - if budget_type not in ('epochs', 'runtime') and (budget_type in FORECASTING_BUDGET_TYPE - and not self.time_series_forecasting): + if budget_type not in ('epochs', 'runtime') and ( + budget_type in FORECASTING_BUDGET_TYPE and + STRING_TO_TASK_TYPES[self.task_type] != TIMESERIES_FORECASTING + ): raise ValueError("Budget type must be one ('epochs', 'runtime')" f" yet {budget_type} was provided") self.pipeline_options['budget_type'] = budget_type @@ -1275,7 +1273,7 @@ def _search( search_space_updates=self.search_space_updates, portfolio_selection=portfolio_selection, pynisher_context=self._multiprocessing_context, - time_series_forecasting=self.time_series_forecasting, + task_type=self.task_type, **kwargs, ) try: @@ -1356,7 +1354,7 @@ def _get_fit_dictionary( 'split_id': split_id, 'num_run': self._backend.get_next_num_run(), }) - if self.time_series_forecasting: + if STRING_TO_TASK_TYPES[self.task_type] == TIMESERIES_FORECASTING: warnings.warn("Currently Time Series Forecasting tasks do not allow computing metrics " "during training. It will be automatically set as False") self.pipeline_options["metrics_during_training"] = False @@ -1655,8 +1653,6 @@ def fit_pipeline( search_space_updates=search_space_updates, pipeline_config=pipeline_options, pynisher_context=self._multiprocessing_context, - evaluator_class=TimeSeriesForecastingTrainEvaluator if self.time_series_forecasting else None, - ) run_info, run_value = tae.run_wrapper( @@ -1749,7 +1745,7 @@ def predict( all_predictions = joblib.Parallel(n_jobs=n_jobs)( joblib.delayed(_pipeline_predict)( models[identifier], X_test, batch_size, self._logger, STRING_TO_TASK_TYPES[self.task_type], - self.time_series_forecasting + self.task_type ) for identifier in self.ensemble_.get_selected_model_identifiers() ) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 8b41748a6..eba3bf9c8 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -104,7 +104,6 @@ def __init__( and update.hyperparameter == "window_size" ): self.customized_window_size = True - self.time_series_forecasting = True def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]: if not isinstance(dataset, TimeSeriesForecastingDataset): diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index e65d2e80a..fe9254182 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -23,7 +23,11 @@ from smac.tae.execute_func import AbstractTAFunc from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.constants import FORECASTING_BUDGET_TYPE +from autoPyTorch.constants import ( + FORECASTING_BUDGET_TYPE, + STRING_TO_TASK_TYPES, + TIMESERIES_FORECASTING, +) from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, @@ -31,6 +35,7 @@ ) from autoPyTorch.evaluation.test_evaluator import eval_test_function from autoPyTorch.evaluation.train_evaluator import eval_train_function +from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator from autoPyTorch.evaluation.utils import ( DisableFileOutputParameters, empty_queue, @@ -129,7 +134,6 @@ def __init__( logger_port: int = None, all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - **eval_func_kwargs: Any ): self.backend = backend @@ -147,12 +151,21 @@ def __init__( self.resampling_strategy = 
dm.resampling_strategy self.resampling_strategy_args = dm.resampling_strategy_args - if isinstance(self.resampling_strategy, (HoldoutValTypes, CrossValTypes)): - eval_function = functools.partial(eval_train_function, **eval_func_kwargs) - self.output_y_hat_optimization = output_y_hat_optimization - elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): - eval_function = functools.partial(eval_test_function, **eval_func_kwargs) - self.output_y_hat_optimization = False + if STRING_TO_TASK_TYPES.get(dm.task_type, -1) == TIMESERIES_FORECASTING: + eval_function = functools.partial(eval_train_function, + evaluator_class=TimeSeriesForecastingTrainEvaluator) + if isinstance(self.resampling_strategy, (HoldoutValTypes, CrossValTypes)): + self.output_y_hat_optimization = output_y_hat_optimization + elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): + self.output_y_hat_optimization = None + else: + if isinstance(self.resampling_strategy, (HoldoutValTypes, CrossValTypes)): + eval_function = eval_train_function + self.output_y_hat_optimization = output_y_hat_optimization + elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): + eval_function = eval_test_function + self.output_y_hat_optimization = False + self.worst_possible_result = cost_for_crash eval_function = functools.partial( diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 51f417d01..08fd1e362 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -19,7 +19,11 @@ from smac.utils.io.traj_logging import TrajEntry from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.constants import FORECASTING_BUDGET_TYPE +from autoPyTorch.constants import ( + STRING_TO_TASK_TYPES, + FORECASTING_BUDGET_TYPE, + TIMESERIES_FORECASTING +) from autoPyTorch.datasets.base_dataset import BaseDataset from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, @@ -29,7 +33,6 @@ ) from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash -from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator from autoPyTorch.optimizer.utils import read_forecasting_init_configurations, read_return_initial_configurations from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -127,7 +130,7 @@ def __init__(self, pynisher_context: str = 'spawn', min_budget: Union[int, float] = 5, max_budget: Union[int, float] = 50, - time_series_forecasting: bool = False, + task_type: str = "", **kwargs: Dict[str, Any] ): """ @@ -203,9 +206,8 @@ def __init__(self, max_budget states the maximum resource allocation a pipeline is going to be ran. For example, if the budget_type is epochs, and max_budget=50, then the pipeline training will be terminated after 50 epochs. - time_series_forecasting (bool): - If we want to apply this optimizer to optimize time series prediction tasks (which has a different - tae) + task_type (str): + task type. Forecasting tasks require special process kwargs (Any): Additional Arguments for forecasting tasks. 
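The forecasting-specific branches added in this patch all key off the task-type string; the underlying check reduces to the following sketch (the helper name is illustrative, the constants are the ones imported above):

from autoPyTorch.constants import STRING_TO_TASK_TYPES, TIMESERIES_FORECASTING

def is_forecasting_task(task_type: str) -> bool:
    # Unknown or empty task strings map to -1 and therefore never match the forecasting constant.
    return STRING_TO_TASK_TYPES.get(task_type, -1) == TIMESERIES_FORECASTING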
It includes: min_num_test_instances (int): minimal number of instances used to initialize a proxy validation set @@ -258,7 +260,7 @@ def __init__(self, self.search_space_updates = search_space_updates - self.time_series_forecasting = time_series_forecasting + self.task_type = task_type if logger_port is None: self.logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT @@ -271,7 +273,7 @@ def __init__(self, initial_configurations = [] - if self.time_series_forecasting: + if STRING_TO_TASK_TYPES.get(self.task_type, -1) == TIMESERIES_FORECASTING: suggested_init_models: Optional[List[str]] = kwargs.get('suggested_init_models', # type:ignore[assignment] None) custom_init_setting_path: Optional[str] = kwargs.get('custom_init_setting_path', # type:ignore[assignment] @@ -339,7 +341,6 @@ def run_smbo(self, func: Optional[Callable] = None pipeline_config=self.pipeline_config, search_space_updates=self.search_space_updates, pynisher_context=self.pynisher_context, - evaluator_class=TimeSeriesForecastingTrainEvaluator if self.time_series_forecasting else None, ) ta = ExecuteTaFuncWithQueue self.logger.info("Finish creating Target Algorithm (TA) function") @@ -389,13 +390,11 @@ def run_smbo(self, func: Optional[Callable] = None budget_type = self.pipeline_config['budget_type'] if budget_type in FORECASTING_BUDGET_TYPE: + if STRING_TO_TASK_TYPES.get(self.task_type, -1) != TIMESERIES_FORECASTING: + raise ValueError('Forecasting Budget type is only available for forecasting task!') if self.min_budget > 1. or self.max_budget > 1.: self.min_budget = float(self.min_budget) / float(self.max_budget) self.max_budget = 1.0 - - if self.time_series_forecasting: - ta_kwargs["evaluator_class"] = TimeSeriesForecastingTrainEvaluator - ta_kwargs['max_budget'] = self.max_budget ta_kwargs['min_num_test_instances'] = self.min_num_test_instances if self.get_smac_object_callback is not None: diff --git a/test/test_evaluation/test_evaluation.py b/test/test_evaluation/test_evaluation.py index 8222efe70..194d62aa8 100644 --- a/test/test_evaluation/test_evaluation.py +++ b/test/test_evaluation/test_evaluation.py @@ -23,7 +23,7 @@ this_directory = os.path.dirname(__file__) sys.path.append(this_directory) -from evaluation_util import get_multiclass_classification_datamanager # noqa E402 +from evaluation_util import get_forecasting_dataset, get_multiclass_classification_datamanager # noqa E402 def safe_eval_success_mock(*args, **kwargs): @@ -45,6 +45,19 @@ def load_datamanager(self): return get_multiclass_classification_datamanager() +class BackendMockForecasting(object): + def __init__(self): + self.temporary_directory = './.tmp_evaluation' + try: + os.mkdir(self.temporary_directory) + except: # noqa 3722 + pass + + def load_datamanager(self): + return get_forecasting_dataset() + + + class EvaluationTest(unittest.TestCase): def setUp(self): self.datamanager = get_multiclass_classification_datamanager() @@ -433,11 +446,11 @@ def test_eval_with_simple_intensification(self): run_info_out, _ = ta.run_wrapper(run_info) self.assertEqual(run_info_out.budget, budget) - def test_eval_with_addition_eval_func_kwargs(self): + def test_eval_forecsating(self): config = unittest.mock.Mock(spec=int) config.config_id = 198 - ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, + ta = ExecuteTaFuncWithQueue(backend=BackendMockForecasting(), seed=1, stats=self.stats, memory_limit=3072, multi_objectives=["cost"], @@ -447,7 +460,6 @@ def test_eval_with_addition_eval_func_kwargs(self): logger_port=self.logger_port, pynisher_context='fork', 
budget_type='runtime', - evaluator_class=TimeSeriesForecastingTrainEvaluator ) ta.pynisher_logger = unittest.mock.Mock() From e43d70a9557791c50b83529d040129c1662897bb Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 15:46:28 +0200 Subject: [PATCH 315/347] maint time series refit dataset --- autoPyTorch/datasets/time_series_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index ce0b4ae95..d2f75932c 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -1256,7 +1256,7 @@ def create_refit_split( def create_refit_set(self) -> "TimeSeriesForecastingDataset": refit_set: TimeSeriesForecastingDataset = copy.deepcopy(self) - refit_set.resampling_strategy = None # type: ignore[assignment] + refit_set.resampling_strategy = NoResamplingStrategyTypes.no_resampling refit_set.splits = refit_set.get_splits_from_resampling_strategy() return refit_set From dc48b9d7c553e10474c418c9ca6e4681f3f2ae3e Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 15:51:45 +0200 Subject: [PATCH 316/347] fix test --- .../preprocessing/forecasting/test_imputer.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py b/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py index e219f2dca..406ede9e4 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py +++ b/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py @@ -9,6 +9,7 @@ from sklearn.base import BaseEstimator, clone from sklearn.compose import make_column_transformer +from sktime.transformations.series.impute import Imputer as SKTImpute from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.TimeSeriesImputer import ( TimeSeriesFeatureImputer, @@ -93,9 +94,10 @@ def test_drift_imputation(self): column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data.iloc[self.test_indices]) - self.assertTrue(np.allclose(transformed, np.array([[7.5, 2., 3.], - [7., 2., 9.], - [4, 2., 10.]]))) + skt_imputer = SKTImpute(method='drift', random_state=imputer_component.random_state) + skt_imputer.fit(X['X_train']) + + self.assertTrue(np.allclose(transformed, skt_imputer.transform(data.iloc[self.test_indices]).values)) def test_linear_imputation(self): imputer_component = TimeSeriesFeatureImputer(imputation_strategy='linear') @@ -111,9 +113,10 @@ def test_linear_imputation(self): column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(self.data[self.test_indices]) - self.assertTrue(np.allclose(transformed, np.array([[7., 2., 3.], - [7., 2., 9.], - [4., 2., 9.]]))) + skt_imputer = SKTImpute(method='linear', random_state=imputer_component.random_state) + skt_imputer.fit(X['X_train']) + + assert_array_equal(transformed, skt_imputer.transform(self.data[self.test_indices])) def test_nearest_imputation(self): data = np.array([[1.0, np.nan, 7], @@ -149,9 +152,10 @@ def test_nearest_imputation(self): column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data[test_indices]) - assert_array_equal(transformed, np.array([[12., 5., 6.], - [12., 5, 8.], - [9., 7., 8]])) + skt_imputer = SKTImpute(method='nearest', random_state=imputer_component.random_state) + 
skt_imputer.fit(X['X_train']) + + assert_array_equal(transformed, skt_imputer.transform(data[test_indices])) def test_constant_imputation(self): imputer_component = TimeSeriesFeatureImputer(imputation_strategy='constant_zero') @@ -183,6 +187,7 @@ def test_bfill_imputation(self): remainder='passthrough') column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(self.data[self.test_indices]) + assert_array_equal(transformed, np.array([[7., 2, 3], [7, 2., 9], [4, 2., 9.]])) From 1e7253a5cab798f6e534dad8eac0585dd5ccc45d Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 16:06:45 +0200 Subject: [PATCH 317/347] workflow for extra requirements --- .github/workflows/docs.yml | 2 +- .github/workflows/long_regression_test.yml | 2 +- .github/workflows/pytest.yml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index cd665ecf9..480883eaa 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -33,7 +33,7 @@ jobs: - name: Install dependencies run: | - pip install -e .[docs,examples] + pip install -e .[docs,examples,forecasting] - name: Make docs run: | diff --git a/.github/workflows/long_regression_test.yml b/.github/workflows/long_regression_test.yml index 3007b22de..c36bb5e6d 100644 --- a/.github/workflows/long_regression_test.yml +++ b/.github/workflows/long_regression_test.yml @@ -30,7 +30,7 @@ jobs: - name: Install test dependencies run: | python -m pip install --upgrade pip - pip install -e .[test] + pip install -e .[forecasting,test] - name: Run tests run: | diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 2a3e87ee6..0ac1d04e1 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -89,7 +89,7 @@ jobs: run: | git submodule update --init --recursive python -m pip install --upgrade pip - pip install -e .[forecasting, test] + pip install -e .[forecasting,test] - name: Dist install if: matrix.kind == 'dist' @@ -98,7 +98,7 @@ jobs: python setup.py sdist last_dist=$(ls -t dist/autoPyTorch-*.tar.gz | head -n 1) - pip install $last_dist[forecasting, test] + pip install $last_dist[forecasting,test] - name: Store repository status id: status-before From 83e2469129f37cafdee0a97244a183ca839f6fcd Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 16:58:08 +0200 Subject: [PATCH 318/347] docs for time series dataset --- autoPyTorch/datasets/time_series_dataset.py | 194 +++++++++++++------- 1 file changed, 128 insertions(+), 66 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index d2f75932c..0d40e55e0 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -42,10 +42,6 @@ from autoPyTorch.pipeline.components.training.metrics.metrics import compute_mase_coefficient from autoPyTorch.utils.common import FitRequirement -TIME_SERIES_FORECASTING_INPUT = Tuple[np.ndarray, np.ndarray] # currently only numpy arrays are supported -TIME_SERIES_REGRESSION_INPUT = Tuple[np.ndarray, np.ndarray] -TIME_SERIES_CLASSIFICATION_INPUT = Tuple[np.ndarray, np.ndarray] - def extract_feature_index(feature_shapes: Dict[str, int], feature_names: Tuple[str], @@ -53,12 +49,16 @@ def extract_feature_index(feature_shapes: Dict[str, int], """ extract the index of a set of queried_features from the extracted feature_shapes Args: - feature_shapes (dict): feature_shapes recoding the shape of each features - 
feature_names (List[str]): names of the features - queried_features (Tuple[str]): names of the features that we expect their index + feature_shapes (dict): + feature_shapes recoding the shape of each features + feature_names (List[str]): + names of the features + queried_features (Tuple[str]): + names of the features that we expect their index Returns: feature_index (Tuple[int]): + indices of the corresponding features """ df_range = pd.DataFrame(feature_shapes, columns=feature_names, index=[0]) df_range_end = df_range.cumsum(axis=1) @@ -90,6 +90,39 @@ def compute_time_features(start_time: pd.DatetimeIndex, class TimeSeriesSequence(Dataset): + """ + A dataset representing a time series sequence. It returns all the previous observations once it is asked for an item + Args: + X (Optional[np.ndarray]): + past features + Y (np.ndarray): + past targets + start_time (Optional[pd.DatetimeIndex]): + times of the first timestep of the series + freq (str): + frequency that the data is sampled + time_feature_transform (List[TimeFeature]): + available time features applied to the series + X_test (Optional[np.ndarray]): + known future features + Y_test (Optional[np.ndarray]): + future targets + train_transforms (Optional[torchvision.transforms.Compose]): + training transforms, used to transform training features + val_transforms (Optional[torchvision.transforms.Compose]): + validation transforms, used to transform training features + n_prediction_steps (int): + how many steps need to be predicted in advance + known_future_features_index (int): + indices of the known future index + compute_mase_coefficient_value (bool): + if the mase coefficient for this series is pre-computed + time_features (Optional[np.ndarray]): + pre-computed time features + is_test_set (bool): + if this dataset is test sets. Test sequence will simply make X_test and Y_test as future features and + future targets + """ _is_test_set = False is_pre_processed = False @@ -110,27 +143,6 @@ def __init__(self, time_features: Optional[np.ndarray] = None, is_test_set: bool = False, ) -> None: - """ - A dataset representing a time series sequence. - Args: - X (Optional[np.ndarray]): past features - Y (np.ndarray): past targets - start_time (Optional[pd.DatetimeIndex]): times of the first timestep of the series - freq (str): frequency that the data is sampled - time_feature_transform (List[TimeFeature]) available time features applied to the series - X_test (Optional[np.ndarray]): known future features - Y_test (Optional[np.ndarray]): future targets - train_transforms (Optional[torchvision.transforms.Compose]): training transforms, used to transform - training features - val_transforms (Optional[torchvision.transforms.Compose]): validation transforms, used to transform - training features - n_prediction_steps (int): how many steps need to be predicted in advance - known_future_features_index (int), indices of the known future index - compute_mase_coefficient_value (bool): if the mase coefficient for this series is pre-computed - time_features (Optional[np.ndarray]): pre-computed time features - is_test_set (bool): if this dataset is test sets. 
Test sequence will simply make X_test and Y_test as future - features and future targets - """ self.n_prediction_steps = n_prediction_steps if X is not None and X.ndim == 1: @@ -202,8 +214,10 @@ def __getitem__(self, index: int, train: bool = True) \ [past_targets, time_features, X_features]) Args: - index (int): what element to yield from all the train/test tensors - train (bool): Whether to apply a train or test transformation, if any + index (int): + what element to yield from all the train/test tensors + train (bool): + Whether to apply a train or test transformation, if any Returns: features from past, targets from past and future @@ -303,10 +317,12 @@ def get_target_values(self, index: int) -> np.ndarray: """ Get the visible targets in the datasets without generating a tensor. This can be used to create a dummy pipeline Args: - index: target index + index (int): + target index Returns: - y: the last visible target value + y (np.ndarray): + the last visible target value """ if index < 0: index = self.__len__() + index @@ -343,9 +359,10 @@ def update_transform(self, transform: Optional[torchvision.transforms.Compose], a dataloader can yield this dataset with the desired transformations Args: - transform (torchvision.transforms.Compose): The transformations proposed - by the current pipeline - train (bool): Whether to update the train or validation transform + transform (torchvision.transforms.Compose): + The transformations proposed by the current pipeline + train (bool): + Whether to update the train or validation transform Returns: self: A copy of the update pipeline @@ -422,6 +439,56 @@ def update_attribute(self, **kwargs: Any) -> None: class TimeSeriesForecastingDataset(BaseDataset, ConcatDataset): + """ + Dataset class for time series forecasting used in AutoPyTorch. It consists of multiple TimeSeriesSequence. + Train and test tensors are stored as pd.DataFrame whereas their index indicates which series the data belongs to + Args: + X (Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]]): + time series features. can be None if we work with a uni-variant forecasting task + Y (Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]): + forecasting targets. Must be given + X_test (Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]]): + known future features. It is a collection of series that has the same amount of data as X. It + is designed to be at the tail of X. If no feature is known in the future, this value can be omitted. + Y_test (Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None): + future targets. It is a collection of series that has the same data of series as Y. It is designed to be at + the tail of Y after the timestamps that need to be predicted. + start_times (Optional[List[pd.DatetimeIndex]]): + starting time of each series when they are sampled. If it is not given, we simply start with a fixed timestamp. + series_idx (Optional[Union[List[Union[str, int]], str, int]]): + (only works if X is stored as pd.DataFrame). This value is applied to identify which series the data belongs to + if the data is presented as a "chunk" dataframe + known_future_features (Optional[Union[Tuple[Union[str, int]], Tuple[()]]]): + future features that are known in advance. For instance, holidays. + time_feature_transform (Optional[List[TimeFeature]]): + A list of time feature transformation methods implemented in gluonts. 
For more information, please check + gluonts.time_feature + freq (Optional[Union[str, int, List[int]]]): + the frequency that the data is sampled. It needs to keep consistent within one dataset + resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]]) + resampling strategy. We designed several special resampling resampling_strategy for forecasting tasks. Please + refer to autoPyTorch.datasets.resampling_strategy + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments passed to resampling_strategy + seed (int): + random seeds + train_transforms (Optional[torchvision.transforms.Compose]): + Transformation applied to training data before it is fed to the dataloader + val_transforms (Optional[torchvision.transforms.Compose]): + Transformation applied to validation data before it is fed to the dataloader + validator (Optional[TimeSeriesForecastingInputValidator]): + Input Validator + lagged_value (Optional[List[int]]) + We could consider past targets as additional features for the current timestep. This item indicates the number + timesteps in advanced that we want to apply the targets as our current features + n_prediction_steps (int): + The number of steps you want to forecast into the future (forecast horizon) + dataset_name (Optional[str]): + dataset name + normalize_y(bool): + if targets are normalized within each series + """ + datasets: List[TimeSeriesSequence] cumulative_sizes: List[int] @@ -439,7 +506,6 @@ def __init__(self, Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes] ] = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, - shuffle: Optional[bool] = True, seed: Optional[int] = 42, train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, @@ -449,19 +515,6 @@ def __init__(self, dataset_name: Optional[str] = None, normalize_y: bool = False, ): - """ - tasks, the target_variables indicates which values in X corresponds to Y. - :param freq: Optional[Union[str, int]] frequency of the series sequences, used to determine the (possible) - period - :param lagged_value: lagged values applied to RNN and Transformer that allows them to use previous data - :param n_prediction_steps: The number of steps you want to forecast into the future - if the input X and targets needs to be shifted to be aligned: - such that the data until X[t] is applied to predict the value y[t+n_prediction_steps] - :param normalize_y: bool - if y values needs to be normalized with mean 0 and variance 1 - if the dataset is trained with log_prob losses, this needs to be specified in the very beginning such that the - header's configspace can be built beforehand. - """ # Preprocess time series data information assert X is not Y, "Training and Test data needs to belong two different object!!!" 
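A minimal sketch of the multi-series input layout that the constructor arguments documented above describe; the toy data and shapes here are assumptions for illustration, not code from this patch:

import numpy as np

# two toy target series of unequal length, one target dimension each
Y = [np.sin(np.linspace(0, 6, 50)).reshape(-1, 1),
     np.cos(np.linspace(0, 6, 80)).reshape(-1, 1)]
X = None  # univariate task: no exogenous features
# together with, for example, freq='1D' and n_prediction_steps=5 this layout matches
# the X / Y / freq / n_prediction_steps arguments documented in the docstring above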
@@ -606,7 +659,6 @@ def __init__(self, self.numerical_features: List[int] = self.numerical_columns self.categorical_features: List[int] = self.categorical_columns - self.shuffle = shuffle self.random_state = np.random.RandomState(seed=seed) resampling_strategy_opt, resampling_strategy_args_opt = self.get_split_strategy( @@ -845,8 +897,6 @@ def make_sequences_datasets(X: Optional[pd.DataFrame], flattened test target array with size N_all (the sum of all the series sequences) and number of targets is_test_set (bool): if the generated sequence used for test - dataset_with_future_features (bool): - if we want to create a dataset with future features (that contained in X) sequences_kwargs: Dict additional arguments for test sets Returns: @@ -926,9 +976,10 @@ def update_transform(self, transform: Optional[torchvision.transforms.Compose], a dataloader can yield this dataset with the desired transformations Args: - transform (torchvision.transforms.Compose): The transformations proposed - by the current pipeline - train (bool): Whether to update the train or validation transform + transform (torchvision.transforms.Compose): + The transformations proposed by the current pipeline + train (bool): + Whether to update the train or validation transform Returns: self: A copy of the update pipeline @@ -1036,14 +1087,21 @@ def get_split_strategy(sequence_lengths: List[int], Determines the most possible sampling strategy for the datasets: the lengths of each sequence might not be long enough to support cross-validation split, thus we need to carefully compute the number of folds Args: - sequence_lengths (List[int]): lengths of each sequence - n_prediction_steps (int): forecasting horizon - freq_value (Union[float, int]): period of the dataset, determined by its sampling frequency - resampling_strategy(Optional[Union[CrossValTypes, HoldoutValTypes]]): resampling strategy to be checked - resampling_strategy_args (Optional[Dict[str, Any]]): resampling strategy arguments to be checked + sequence_lengths (List[int]): + lengths of each sequence + n_prediction_steps (int): + forecasting horizon + freq_value (Union[float, int]): + period of the dataset, determined by its sampling frequency + resampling_strategy(Optional[Union[CrossValTypes, HoldoutValTypes]]): + resampling strategy to be checked + resampling_strategy_args (Optional[Dict[str, Any]]): + resampling strategy arguments to be checked Returns: - resampling_strategy(Optional[Union[CrossValTypes, HoldoutValTypes]]): resampling strategy - resampling_strategy_args (Optional[Dict[str, Any]]): resampling strategy arguments + resampling_strategy(Optional[Union[CrossValTypes, HoldoutValTypes]]): + resampling strategy + resampling_strategy_args (Optional[Dict[str, Any]]): + resampling strategy arguments """ # check if dataset could be split with cross validation minimal_seq_length = np.min(sequence_lengths) - n_prediction_steps @@ -1136,8 +1194,10 @@ def create_cross_val_splits( It is done once per dataset to have comparable results among pipelines Args: cross_val_type (CrossValTypes): - num_splits (int): number of splits to be created - n_repeats (int): how many n_prediction_steps to repeat in the validation set + num_splits (int): + number of splits to be created + n_repeats (int): + how many n_prediction_steps to repeat in the validation set Returns: (List[Tuple[Union[List[int], np.ndarray], Union[List[int], np.ndarray]]]): @@ -1193,8 +1253,10 @@ def create_holdout_val_split( It is done once per dataset to have comparable results among pipelines Args: 
holdout_val_type (HoldoutValTypes): - val_share (float): share of the validation data - n_repeats (int): how many n_prediction_steps to repeat in the validation set + val_share (float): + share of the validation data + n_repeats (int): + how many n_prediction_steps to repeat in the validation set Returns: (Tuple[np.ndarray, np.ndarray]): Tuple containing (train_indices, val_indices) From 16719920552d7e073751a732db9c041909e84916 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 17:06:21 +0200 Subject: [PATCH 319/347] fix pre-commit --- autoPyTorch/api/base_task.py | 4 +- autoPyTorch/evaluation/tae.py | 74 +++++++++---------- autoPyTorch/optimizer/smbo.py | 2 +- .../components/setup/network/base_network.py | 2 +- .../trainer/forecasting_trainer/__init__.py | 1 + test/test_evaluation/test_evaluation.py | 2 - .../preprocessing/forecasting/test_imputer.py | 1 + 7 files changed, 43 insertions(+), 43 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index d732ffbe7..6a5ce1e14 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1122,8 +1122,8 @@ def _search( # Incorporate budget to pipeline config if budget_type not in ('epochs', 'runtime') and ( - budget_type in FORECASTING_BUDGET_TYPE and - STRING_TO_TASK_TYPES[self.task_type] != TIMESERIES_FORECASTING + budget_type in FORECASTING_BUDGET_TYPE + and STRING_TO_TASK_TYPES[self.task_type] != TIMESERIES_FORECASTING ): raise ValueError("Budget type must be one ('epochs', 'runtime')" f" yet {budget_type} was provided") diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index fe9254182..9c5f349a7 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -34,8 +34,8 @@ NoResamplingStrategyTypes ) from autoPyTorch.evaluation.test_evaluator import eval_test_function -from autoPyTorch.evaluation.train_evaluator import eval_train_function from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator +from autoPyTorch.evaluation.train_evaluator import eval_train_function from autoPyTorch.evaluation.utils import ( DisableFileOutputParameters, empty_queue, @@ -110,30 +110,30 @@ class ExecuteTaFuncWithQueue(AbstractTAFunc): """ def __init__( - self, - backend: Backend, - seed: int, - metric: autoPyTorchMetric, - cost_for_crash: float, - abort_on_first_run_crash: bool, - pynisher_context: str, - multi_objectives: List[str], - pipeline_config: Optional[Dict[str, Any]] = None, - initial_num_run: int = 1, - stats: Optional[Stats] = None, - run_obj: str = 'quality', - par_factor: int = 1, - output_y_hat_optimization: bool = True, - include: Optional[Dict[str, Any]] = None, - exclude: Optional[Dict[str, Any]] = None, - memory_limit: Optional[int] = None, - disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, - init_params: Dict[str, Any] = None, - budget_type: str = None, - ta: Optional[Callable] = None, - logger_port: int = None, - all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + self, + backend: Backend, + seed: int, + metric: autoPyTorchMetric, + cost_for_crash: float, + abort_on_first_run_crash: bool, + pynisher_context: str, + multi_objectives: List[str], + pipeline_config: Optional[Dict[str, Any]] = None, + initial_num_run: int = 1, + stats: Optional[Stats] = None, + run_obj: str = 'quality', + par_factor: int = 1, + output_y_hat_optimization: bool = True, + include: Optional[Dict[str, Any]] = 
None, + exclude: Optional[Dict[str, Any]] = None, + memory_limit: Optional[int] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + init_params: Dict[str, Any] = None, + budget_type: str = None, + ta: Optional[Callable] = None, + logger_port: int = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, ): self.backend = backend @@ -152,12 +152,12 @@ def __init__( self.resampling_strategy_args = dm.resampling_strategy_args if STRING_TO_TASK_TYPES.get(dm.task_type, -1) == TIMESERIES_FORECASTING: - eval_function = functools.partial(eval_train_function, - evaluator_class=TimeSeriesForecastingTrainEvaluator) + eval_function: Callable = functools.partial(eval_train_function, + evaluator_class=TimeSeriesForecastingTrainEvaluator) if isinstance(self.resampling_strategy, (HoldoutValTypes, CrossValTypes)): self.output_y_hat_optimization = output_y_hat_optimization elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): - self.output_y_hat_optimization = None + self.output_y_hat_optimization = False else: if isinstance(self.resampling_strategy, (HoldoutValTypes, CrossValTypes)): eval_function = eval_train_function @@ -238,8 +238,8 @@ def _check_and_get_default_budget(self) -> float: return budget_choices[budget_type] def run_wrapper( - self, - run_info: RunInfo, + self, + run_info: RunInfo, ) -> Tuple[RunInfo, RunValue]: """ wrapper function for ExecuteTARun.run_wrapper() to cap the target algorithm @@ -297,13 +297,13 @@ def run_wrapper( return run_info, run_value def run( - self, - config: Configuration, - instance: Optional[str] = None, - cutoff: Optional[float] = None, - seed: int = 12345, - budget: float = 0.0, - instance_specific: Optional[str] = None, + self, + config: Configuration, + instance: Optional[str] = None, + cutoff: Optional[float] = None, + seed: int = 12345, + budget: float = 0.0, + instance_specific: Optional[str] = None, ) -> Tuple[StatusType, float, float, Dict[str, Any]]: context = multiprocessing.get_context(self.pynisher_context) diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 08fd1e362..b5a986bfe 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -20,8 +20,8 @@ from autoPyTorch.automl_common.common.utils.backend import Backend from autoPyTorch.constants import ( - STRING_TO_TASK_TYPES, FORECASTING_BUDGET_TYPE, + STRING_TO_TASK_TYPES, TIMESERIES_FORECASTING ) from autoPyTorch.datasets.base_dataset import BaseDataset diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py index d8d4c87d0..bdae83823 100644 --- a/autoPyTorch/pipeline/components/setup/network/base_network.py +++ b/autoPyTorch/pipeline/components/setup/network/base_network.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, Optional, Union from ConfigSpace.configuration_space import ConfigurationSpace diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index b4e9cb34e..320cf6bd1 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -40,6 +40,7 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None ): 
super().__init__(dataset_properties=dataset_properties, random_state=random_state) + assert self._fit_requirements is not None self._fit_requirements.extend([FitRequirement("target_scaler", (BaseTargetScaler,), user_defined=False, dataset_property=False), FitRequirement("window_size", (int,), user_defined=False, diff --git a/test/test_evaluation/test_evaluation.py b/test/test_evaluation/test_evaluation.py index 194d62aa8..8e12d2f71 100644 --- a/test/test_evaluation/test_evaluation.py +++ b/test/test_evaluation/test_evaluation.py @@ -18,7 +18,6 @@ from smac.utils.constants import MAXINT from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash -from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy, log_loss this_directory = os.path.dirname(__file__) @@ -57,7 +56,6 @@ def load_datamanager(self): return get_forecasting_dataset() - class EvaluationTest(unittest.TestCase): def setUp(self): self.datamanager = get_multiclass_classification_datamanager() diff --git a/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py b/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py index 406ede9e4..6c0143609 100644 --- a/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py +++ b/test/test_pipeline/components/preprocessing/forecasting/test_imputer.py @@ -9,6 +9,7 @@ from sklearn.base import BaseEstimator, clone from sklearn.compose import make_column_transformer + from sktime.transformations.series.impute import Imputer as SKTImpute from autoPyTorch.pipeline.components.preprocessing.time_series_preprocessing.imputation.TimeSeriesImputer import ( From 97d3835cbdcafdfa217cd74fef783fe27f2a513a Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 17:34:24 +0200 Subject: [PATCH 320/347] docs for dataset --- autoPyTorch/datasets/time_series_dataset.py | 54 ++++++++++++++------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 0d40e55e0..f00b6401f 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -208,19 +208,26 @@ def is_test_set(self, value: bool) -> None: def __getitem__(self, index: int, train: bool = True) \ -> Tuple[Dict[str, torch.Tensor], Optional[Dict[str, torch.Tensor]]]: """ - get a subsequent of time series data, unlike vanilla tabular dataset, we obtain all the previous sequences - until the given index, this allows us to do further transformation. 
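A minimal sketch of the windowing behaviour described in the docstring above, assuming a single series of T observed targets: querying index t yields everything up to and including t as the past and the next n_prediction_steps values as the future (toy arrays only, not the dataset's own code):

import numpy as np

Y = np.arange(20, dtype=float).reshape(-1, 1)        # T = 20, one target dimension
t, n_prediction_steps = 12, 3
past_targets = Y[:t + 1]                              # all observations up to and including t
future_targets = Y[t + 1: t + 1 + n_prediction_steps]
assert past_targets.shape == (13, 1) and future_targets.shape == (3, 1)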
- (When fed to the neural network, the data is arranged as follows: - [past_targets, time_features, X_features]) + get a subsequent of time series data, unlike vanilla tabular dataset, we obtain all the previous observations + until the given index Args: index (int): - what element to yield from all the train/test tensors + what element to yield from the series train (bool): - Whether to apply a train or test transformation, if any + Whether a train or test transformation is applied Returns: - features from past, targets from past and future + past_information (Dict[str, torch.Tensor]): + a dict contains all the required information required for future forecasting + past_targets (torch.Tensor), past_features(Optional[torch.Tensor]), + future_features(Optional[torch.Tensor]), + mase_coefficient (np.array, cached value to compute MASE scores), + past_observed_targets(torch.BoolTensor), if the past targets are observed. + decoder_lengths(int), length of decoder output + future_information (Optional[Dict[str, torch.Tensor]]): + a dict contains all the future information that are required to predict, including + future_targets: (torch.Tensor) and future_observed_targets (torch.BoolTensor) """ if index < 0: index = self.__len__() + index @@ -328,7 +335,7 @@ def get_target_values(self, index: int) -> np.ndarray: index = self.__len__() + index return self.Y[index] - def cache_time_features(self, ) -> None: + def cache_time_features(self) -> None: """ compute time features if it is not cached. For test sets, we also need to compute the time features for future """ @@ -747,6 +754,7 @@ def compute_time_features(start_times: List[pd.DatetimeIndex], return series_time_features def _get_dataset_indices(self, idx: int, only_dataset_idx: bool = False) -> Union[int, Tuple[int, int]]: + """get which series the data point belongs to""" if idx < 0: if -idx > len(self): raise ValueError("absolute value of index should not exceed dataset length") @@ -769,14 +777,18 @@ def __getitem__(self, idx: int, # type: ignore[override] return self.datasets[dataset_idx].__getitem__(sample_idx, train) def get_validation_set(self, idx: int) -> TimeSeriesSequence: + """generate validation series given the index. It ends at the position of the index""" dataset_idx, sample_idx = self._get_dataset_indices(idx) # type: ignore[misc] return self.datasets[dataset_idx].get_val_seq_set(sample_idx) def get_time_series_seq(self, idx: int) -> TimeSeriesSequence: + """get the series that the data point belongs to""" dataset_idx = self._get_dataset_indices(idx, True) return self.datasets[dataset_idx] # type: ignore[index] def get_test_target(self, test_indices: np.ndarray) -> np.ndarray: + """get the target data only. This function simply returns a np.array instead of a dictionary""" + test_indices = np.where(test_indices < 0, test_indices + len(self), test_indices) y_test = np.ones([len(test_indices), self.n_prediction_steps, self.num_targets]) y_test_argsort = np.argsort(test_indices) @@ -1004,12 +1016,12 @@ def transform_time_features(self, value: bool) -> None: def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], Optional[List[int]]]]: """ - Creates a set of splits based on a resampling strategy provided, apart from the - 'get_splits_from_resampling_strategy' implemented in base_dataset, here we will get self.upper_sequence_length - with the given value + Creates a set of splits based on a resampling strategy provided, here each item in test_split represent + n_prediction_steps element in the dataset. 
(The start of timestep that we want to predict) Returns - (List[Tuple[List[int], List[int]]]): splits in the [train_indices, val_indices] format + ( List[Tuple[List[int], Optional[List[int]]]]): + splits in the [train_indices, val_indices] format """ splits = [] if isinstance(self.resampling_strategy, HoldoutValTypes): @@ -1194,6 +1206,7 @@ def create_cross_val_splits( It is done once per dataset to have comparable results among pipelines Args: cross_val_type (CrossValTypes): + cross validation type num_splits (int): number of splits to be created n_repeats (int): @@ -1253,6 +1266,7 @@ def create_holdout_val_split( It is done once per dataset to have comparable results among pipelines Args: holdout_val_type (HoldoutValTypes): + holdout type val_share (float): share of the validation data n_repeats (int): @@ -1289,15 +1303,10 @@ def create_holdout_val_split( return train_indices, test_indices - def create_refit_split( - self, - ) -> Tuple[np.ndarray, np.ndarray]: + def create_refit_split(self) -> Tuple[np.ndarray, np.ndarray]: """ This function creates the refit split for the given task. All the data in the dataset will be considered as training sets - Args: - holdout_val_type (HoldoutValTypes): - val_share (float): share of the validation data Returns: (Tuple[np.ndarray, np.ndarray]): Tuple containing (train_indices, val_indices) @@ -1317,12 +1326,21 @@ def create_refit_split( return train_indices, test_indices def create_refit_set(self) -> "TimeSeriesForecastingDataset": + """create a refit set that allows the network to be trained with the entire training-validation sets""" refit_set: TimeSeriesForecastingDataset = copy.deepcopy(self) refit_set.resampling_strategy = NoResamplingStrategyTypes.no_resampling refit_set.splits = refit_set.get_splits_from_resampling_strategy() return refit_set def generate_test_seqs(self) -> List[TimeSeriesSequence]: + """ + A function that generate a set of test series from the information available at this dataset. 
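A minimal sketch of the kind of holdout split sketched above, under the assumption that each validation item marks the start of the last n_prediction_steps block of its series and that indices refer to the flattened, concatenated dataset; the exact semantics in the code may differ:

import numpy as np

sequence_lengths = [50, 80, 65]          # toy lengths of three series
n_prediction_steps = 3
offsets = np.cumsum([0] + sequence_lengths[:-1])
train_indices, val_indices = [], []
for offset, seq_len in zip(offsets, sequence_lengths):
    # everything strictly before the last forecast horizon is available for training
    train_indices.extend(range(offset, offset + seq_len - n_prediction_steps))
    # one validation index per series: the start of the block to be predicted
    val_indices.append(offset + seq_len - n_prediction_steps)
train_indices, val_indices = np.array(train_indices), np.array(val_indices)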
By calling this + function, we could make use of the cached information such as time features to accelerate the computation time + + Returns: + test_sets(List[TimeSeriesSequence]) + generated test sets + """ test_sets = copy.deepcopy(self.datasets) for test_seq in test_sets: test_seq.is_test_set = True From 889c5e9dc9f42c149786dce3b60da2d70f54f149 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 19:03:02 +0200 Subject: [PATCH 321/347] maint docstring --- autoPyTorch/api/time_series_forecasting.py | 44 +++++---- .../data/time_series_forecasting_validator.py | 6 +- autoPyTorch/evaluation/abstract_evaluator.py | 14 +++ ...time_series_forecasting_train_evaluator.py | 4 +- .../base_target_scaler.py | 12 ++- .../forecasting_backbone/cells.py | 95 +++++++++++++++---- .../forecasting_backbone/components_util.py | 24 ++--- .../forecasting_decoder/TransformerDecoder.py | 3 +- .../base_forecasting_decoder.py | 50 ++++++---- .../forecasting_decoder/components.py | 32 ++++++- .../base_forecasting_encoder.py | 11 +-- .../forecasting_encoder/components.py | 41 +++++++- .../other_components/TemporalFusion.py | 4 +- .../LearnedEntityEmbedding.py | 12 ++- .../forecasting_network_head/distribution.py | 14 +-- .../forecasting_head.py | 36 ++++--- .../time_series_forecasting_data_loader.py | 54 +++++++---- .../components/training/metrics/metrics.py | 9 +- .../forecasting_base_trainer.py | 36 ++++--- .../pipeline/time_series_forecasting.py | 58 ++++++----- .../utils/forecasting_time_features.py | 25 ----- 21 files changed, 384 insertions(+), 200 deletions(-) delete mode 100644 autoPyTorch/utils/forecasting_time_features.py diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index eba3bf9c8..8ab1ebef0 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -28,27 +28,31 @@ class TimeSeriesForecastingTask(BaseTask): """ Time Series Forecasting API to the pipelines. Args: - seed (int): seed to be used for reproducibility. - n_jobs (int), (default=1): number of consecutive processes to spawn. - logging_config (Optional[Dict]): specifies configuration - for logging, if None, it is loaded from the logging.yaml - ensemble_size (int), (default=50): Number of models added to the ensemble built by - Ensemble selection from libraries of models. + seed (int): + seed to be used for reproducibility. + n_jobs (int), (default=1): + number of consecutive processes to spawn. + logging_config (Optional[Dict]): + specifies configuration for logging, if None, it is loaded from the logging.yaml + ensemble_size (int), (default=50): + Number of models added to the ensemble built by Ensemble selection from libraries of models. Models are drawn with replacement. - ensemble_nbest (int), (default=50): only consider the ensemble_nbest - models to build the ensemble - max_models_on_disc (int), (default=50): maximum number of models saved to disc. - Also, controls the size of the ensemble as any additional models will be deleted. - Must be greater than or equal to 1. - temporary_directory (str): folder to store configuration output and log file - output_directory (str): folder to store predictions for optional test set - delete_tmp_folder_after_terminate (bool): determines whether to delete the temporary directory, - when finished - include_components (Optional[Dict]): If None, all possible components are used. - Otherwise specifies set of components to use. 
- exclude_components (Optional[Dict]): If None, all possible components are used. - Otherwise specifies set of components not to use. Incompatible with include - components + ensemble_nbest (int), (default=50): + only consider the ensemble_nbest models to build the ensemble + max_models_on_disc (int), (default=50): + maximum number of models saved to disc. Also, controls the size of the ensemble as any additional models + will be deleted. Must be greater than or equal to 1. + temporary_directory (str): + folder to store configuration output and log file + output_directory (str): + folder to store predictions for optional test set + delete_tmp_folder_after_terminate (bool): + determines whether to delete the temporary directory, when finished + include_components (Optional[Dict]): + If None, all possible components are used. Otherwise specifies set of components to use. + exclude_components (Optional[Dict]): + If None, all possible components are used. Otherwise specifies set of components not to use. + Incompatible with include components """ def __init__( diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index 1ffd2c610..d6bcfb12e 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -182,7 +182,7 @@ def transform( # type: ignore[override] forecasting targets validate_for_future_features: bool if the validator is applied to transform future features (for test sets), in this case we only validate - X + features """ if not self._is_fitted: raise NotFittedError( @@ -332,9 +332,7 @@ def _transform_X( def join_series( X: List[Union[pd.DataFrame, np.ndarray]], return_seq_lengths: bool = False ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, List[int]]]: - """ - join the series into one single item - """ + """join the series into one single item""" num_sequences = len(X) sequence_lengths = [0] * num_sequences for seq_idx in range(num_sequences): diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 8e59bcb9a..e482973d8 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -315,6 +315,20 @@ def get_default_pipeline_options() -> Dict[str, Any]: class DummyTimeSeriesForecastingPipeline(DummyClassificationPipeline): + """ + A wrapper class that holds a pipeline for dummy forecasting. For each series, it simply repeats the last element + in the training series + + + Attributes: + random_state (Optional[Union[int, np.random.RandomState]]): + Object that contains a seed and allows for reproducible results + init_params (Optional[Dict]): + An optional dictionary that is passed to the pipeline's steps. 
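A minimal sketch of the naive baseline that the dummy forecasting pipeline above describes, namely repeating the last observed target of each series for every future step; the arrays are toy data and the real pipeline wraps this behaviour behind the usual fit/predict interface:

import numpy as np

y_past = np.array([[1.0], [2.0], [3.0]])              # toy past targets of one series
n_prediction_steps = 4
y_forecast = np.tile(y_past[-1], (n_prediction_steps, 1))
assert y_forecast.shape == (4, 1) and np.all(y_forecast == 3.0)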
It complies + a similar function as the kwargs + n_prediction_steps (int): + forecasting horizon + """ def __init__(self, config: Configuration, random_state: Optional[Union[int, np.random.RandomState]] = None, init_params: Optional[Dict] = None, diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index a319afb60..9f02f9305 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -277,11 +277,11 @@ def generate_mase_coefficient_for_validation(self, test_split: Sequence[int]) -> Parameters: ---------- - test_split: Sequence + test_split (Sequence): test splits, consistent of int Return: ---------- - mase_coefficient: np.ndarray(self.num_sequence * self.n_prediction_steps) + mase_coefficient (np.ndarray(self.num_sequence * self.n_prediction_steps)): inverse of the mase_denominator """ mase_coefficient = np.ones([len(test_split), self.num_targets]) diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py index fb9ca3dfc..3f22bb692 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py @@ -26,10 +26,12 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: Creates a column transformer for the chosen tabular preprocessors Args: - X (Dict[str, Any]): fit dictionary + X (Dict[str, Any]): + fit dictionary Returns: - "TabularColumnTransformer": an instance of self + "BaseEstimator": + an instance of self """ self.check_requirements(X, y) self.scaler = TargetScaler(mode=self.scaler_mode) @@ -43,10 +45,12 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ Adds the time series transformer to fit dictionary Args: - X (Dict[str, Any]): fit dictionary + X (Dict[str, Any]): + fit dictionary Returns: - X (Dict[str, Any]): updated fit dictionary + X (Dict[str, Any]): + updated fit dictionary """ X.update({'target_scaler': self}) return X diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py index 26b8eca30..6da9d42bb 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/cells.py @@ -98,11 +98,16 @@ def forward(self, static_embedding: Optional[torch.Tensor] = None) -> torch.Tensor: """ Args: - encoder_output: the output of the last layer of encoder network - decoder_output: the output of the last layer of decoder network - past_observed_targets: observed values in the past - decoder_length: length of decoder network - static_embedding: output of static variable selection network (if available) + encoder_output (torch.Tensor): + the output of the last layer of encoder network + decoder_output (torch.Tensor): + the output of the last layer of decoder network + past_observed_targets (torch.BoolTensor): + observed values in the past + decoder_length (int): + length of decoder network + static_embedding Optional[torch.Tensor]: + embeddings of static features (if available) """ if self.decoder_proj_layer is not None: @@ -208,14 +213,22 @@ def __init__(self, The order of the input 
variables is as follows: [features (from the dataset), time_features (from time feature transformers), targets] Args: - network_structure (NetworkStructure): contains the information of the overall architecture information - dataset_properties (Dict): dataset properties - network_encoder(Dict[str, EncoderBlockInfo]): Network encoders - auto_regressive bool: if it belongs to an auto-regressive model - feature_names Tuple[str]: feature names, used to construct the selection network - known_future_features Tuple[str]: known future features - feature_shapes Dict[str, int]: shapes of each features - time_feature_names Tuple[str]: time feature names, used to complement feature_shapes + network_structure (NetworkStructure): + contains the information of the overall architecture information + dataset_properties (Dict): + dataset properties + network_encoder(Dict[str, EncoderBlockInfo]): + Network encoders + auto_regressive (bool): + if it belongs to an auto-regressive model + feature_names (Tuple[str]): + feature names, used to construct the selection network + known_future_features (Tuple[str]): + known future features + feature_shapes (Dict[str, int]): + shapes of each features + time_feature_names (Tuple[str]): + time feature names, used to complement feature_shapes """ super().__init__() first_encoder_output_shape = network_encoder['block_1'].encoder_output_shape[-1] @@ -450,6 +463,12 @@ def forward(self, class StackedEncoder(nn.Module): + """ + Encoder network that is stacked by several encoders. Skip-connections can be applied to each stack. Each stack + needs to generate a sequence of encoded features passed to the next stack and the + corresponding decoder (encoder2decoder) that is located at the same layer.Additionally, if temporal fusion + transformer is applied, the last encoder also needs to output the full encoded feature sequence + """ def __init__(self, network_structure: NetworkStructure, has_temporal_fusion: bool, @@ -507,14 +526,25 @@ def forward(self, incremental_update: bool = False) -> Tuple[List[torch.Tensor], Optional[torch.Tensor]]: """ A forward pass through the encoder + Args: - encoder_input (torch.Tensor): encoder input - additional_input (List[Optional[torch.Tensor]]) additional input to the encoder, e.g., inital hidden states - output_seq (bool) if a sequence output is generated - cache_intermediate_state (bool): if store the intermediate values - incremental_update (bool): if an incremental update is applied, this is normally applied for - auto-regressive model, however, ony deepAR requires encoder to do incremental update, - whose decoder only need to receive the last output of the encoder + encoder_input (torch.Tensor): + encoder input + additional_input (List[Optional[torch.Tensor]]) + additional input to the encoder, e.g., initial hidden states + output_seq (bool) + if the encoder want to generate a sequence of multiple time steps or a single time step + cache_intermediate_state (bool): + if the intermediate values are cached + incremental_update (bool): + if an incremental update is applied, this is normally applied for + auto-regressive model, however, ony deepAR requires incremental update in encoder + + Returns: + encoder2decoder ([List[torch.Tensor]]): + encoder output that will be passed to decoders + encoder_output (torch.Tensor): + full sequential encoded features from the last encoder layer. 
Applied to temporal transformer """ encoder2decoder = [] x = encoder_input @@ -583,6 +613,11 @@ def forward(self, class StackedDecoder(nn.Module): + """ + Decoder network that is stacked by several decoders. Skip-connections can be applied to each stack. It decodes the + encoded features (encoder2decoder) from each corresponding stacks and known_future_features to generate the decoded + output features that will be further fed to the network decoder. + """ def __init__(self, network_structure: NetworkStructure, encoder: nn.ModuleDict, @@ -633,6 +668,26 @@ def forward(self, cache_intermediate_state: bool = False, incremental_update: bool = False ) -> torch.Tensor: + """ + A forward pass through the decoder + + Args: + x_future (Optional[torch.Tensor]): + known future features + encoder_output (List[torch.Tensor]) + encoded features, stored as List, whereas each element in the list indicates encoded features from an + encoder stack + pos_idx (int) + position index of the current x_future. This is applied to transformer decoder + cache_intermediate_state (bool): + if the intermediate values are cached + incremental_update (bool): + if an incremental update is applied, this is normally applied for auto-regressive model + + Returns: + decoder_output (torch.Tensor): + decoder output that will be passed to the network head + """ x = x_future for i, block_id in enumerate(range(self.first_block, self.num_blocks + 1)): decoder_i = self.decoder[f'block_{block_id}'] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py index e6b75277d..9fcbc14e0 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/components_util.py @@ -92,9 +92,9 @@ def build_transformer_layers(d_model: int, config: Dict[str, Any], layer_type: s raise ValueError('layer_type must be encoder or decoder!') -# https://github.com/pytorch/examples/blob/master/word_language_model/model.py class PositionalEncoding(nn.Module): - r""" + r"""https://github.com/pytorch/examples/blob/master/word_language_model/model.py + NOTE: different from the raw implementation, this model is designed for the batch_first inputs! Inject some information about the relative or absolute position of the tokens in the sequence. The positional encodings have the same dimension as @@ -105,9 +105,12 @@ class PositionalEncoding(nn.Module): \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) \text{where pos is the word position and i is the embed idx) Args: - d_model: the embed dim (required). - dropout: the dropout value (default=0.1). - max_len: the max. length of the incoming sequence (default=5000). + d_model (int): + the embed dim (required). + dropout(float): + the dropout value (default=0.1). + max_len(int): + the max. length of the incoming sequence (default=5000). Examples: >>> pos_encoder = PositionalEncoding(d_model) """ @@ -127,12 +130,11 @@ def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000): def forward(self, x: torch.Tensor, pos_idx: Optional[Tuple[int]] = None) -> torch.Tensor: r"""Inputs of forward function Args: - x: the sequence fed to the positional encoder model (required). 
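A self-contained sketch of the batch-first sinusoidal positional encoding described above, assuming an even d_model and omitting dropout; the class name is illustrative and not the module's own:

import math
import torch
import torch.nn as nn

class SinusoidalPositionalEncoding(nn.Module):
    """Adds sin/cos position information to a [batch, seq_len, d_model] tensor."""
    def __init__(self, d_model: int, max_len: int = 5000):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)                                  # [max_len, 1]
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe.unsqueeze(0))                                    # [1, max_len, d_model]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # broadcast the cached encodings over the batch dimension
        return x + self.pe[:, :x.size(1)]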
- pos_idx (Tuple[int]), position idx indicating the start (first) and end (last) time index of x in a sequence - Shape: - x: [batch size, sequence length embed dim] - pos_idx: positional index, indicating the index of the current - output: [batch size, sequence length, embed dim] + x (torch.Tensor(B, L, N)): + the sequence fed to the positional encoder model (required). + pos_idx (Tuple[int]): + position idx indicating the start (first) and end (last) time index of x in a sequence + Examples: >>> output = pos_encoder(x) """ diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index f85e76bb0..ef20c9e06 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -118,8 +118,7 @@ def _required_fit_requirements(self) -> List[FitRequirement]: @staticmethod def decoder_properties() -> DecoderProperties: return DecoderProperties(recurrent=True, - lagged_input=True, - mask_on_future_target=True) + lagged_input=True) def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.transformer_encoder_kwargs = X['transformer_encoder_kwargs'] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 9c327a5e3..6b8f3fd93 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -60,8 +60,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: Builds the head component and assigns it to self.decoder Args: - X (X: Dict[str, Any]): Dependencies needed by current component to perform fit - y (Any): not used. To comply with sklearn API + X (X: Dict[str, Any]): + Dependencies needed by current component to perform fit + y (Any): + not used. To comply with sklearn API Returns: Self """ @@ -125,9 +127,11 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Adds the network head into the fit dictionary 'X' and returns it. 
Args: - X (Dict[str, Any]): 'X' dictionary + X (Dict[str, Any]): + 'X' dictionary Returns: - (Dict[str, Any]): the updated 'X' dictionary + (Dict[str, Any]): + the updated 'X' dictionary """ # 'auto_regressive' needs to be the same across all the decoders, # 'n_prediction_heads' and 'n_decoder_output_features' are only applied to the head such that they could be @@ -162,12 +166,17 @@ def build_decoder(self, Builds the head module and returns it Args: - encoder_output_shape (Tuple[int, ...]): shape of the input to the decoder, this value is the encoder output - future_variable_input (Tuple[int, ...]): shape of the known future input values - n_prediction_heads (int): how many prediction heads the network has, used for final forecasting heads - dataset_properties (Dict): dataset properties + encoder_output_shape (Tuple[int, ...]): + shape of the input to the decoder, this value is the encoder output + future_variable_input (Tuple[int, ...]): + shape of the known future input values + n_prediction_heads (int): + how many prediction heads the network has, used for final forecasting heads + dataset_properties (Dict): + dataset properties Returns: - nn.Module: head module + nn.Module: + head module """ decoder, n_decoder_features = self._build_decoder(encoder_output_shape, future_variable_input, n_prediction_heads, dataset_properties) @@ -183,26 +192,33 @@ def _build_decoder(self, Builds the head module and returns it Args: - encoder_output_shape (Tuple[int, ...]): shape of the input to the decoder, this value is the encoder output - future_variable_input (Tuple[int, ...]): shape of the known future input values - n_prediction_heads (int): how many prediction heads the network has, used for final forecasting heads - dataset_properties (Dict): dataset properties + encoder_output_shape (Tuple[int, ...]): + shape of the input to the decoder, this value is the encoder output + future_variable_input (Tuple[int, ...]): + shape of the known future input values + n_prediction_heads (int): + how many prediction heads the network has, used for final forecasting heads + dataset_properties (Dict): + dataset properties Returns: - decoder (nn.Module): decoder module - n_decoder_features (int): output of decoder features, used for initialize network head. + decoder (nn.Module): + decoder module + n_decoder_features (int): + output of decoder features, used for initialize network head. """ raise NotImplementedError() @classmethod def get_name(cls) -> str: """ - Get the name of the head + Get the name of the decoder Args: None Returns: - str: Name of the head + str: + Name of the decoder """ return str(cls.get_properties()["shortname"]) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py index 72350ce7d..5cb9d8ff2 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/components.py @@ -5,15 +5,45 @@ class DecoderProperties(NamedTuple): + """ + Decoder properties + + Args: + has_hidden_states (bool): + if the decoder has hidden states. 
A decoder with hidden states might have additional output and requires + additional inputs + has_local_layer (bool): + if the decoder has local layer, in which case the output is also a 3D sequential feature + recurrent (bool): + if the decoder is recurrent. This determines if decoders can be auto-regressive + lagged_input (bool): + if the decoder accepts past targets as additional features + multi_blocks (bool): + If the decoder is stacked by multiple blocks (only for N-BEATS) + """ has_hidden_states: bool = False has_local_layer: bool = True recurrent: bool = False lagged_input: bool = False multi_blocks: bool = False - mask_on_future_target: bool = False class DecoderBlockInfo(NamedTuple): + """ + Decoder block infos + + Args: + decoder (nn.Module): + decoder network + decoder_properties (EncoderProperties): + decoder properties + decoder_output_shape (Tuple[int, ...]): + output shape that the decoder ought to output + + decoder_input_shape (Tuple[int, ...]): + requried input shape of the decoder + + """ decoder: nn.Module decoder_properties: DecoderProperties decoder_output_shape: Tuple[int, ...] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index 596231cec..b2977a760 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -143,7 +143,8 @@ def build_encoder(self, Builds the backbone module and returns it Args: - input_shape (Tuple[int, ...]): input feature shape + input_shape (Tuple[int, ...]): + input feature shape Returns: nn.Module: backbone module @@ -155,14 +156,6 @@ def encoder_properties() -> EncoderProperties: """ Encoder properties, this determines how the data flows over the forecasting networks - has_hidden_states, it determines if the network contains hidden states and thus return or accept the hidden - states - bijective_seq_output, determines if the network returns a sequence with the same sequence length as the input - sequence when output_seq is set True - fix_input_shape if the input shape is fixed, this is useful for building network head - lagged_input, if lagged input values are applied, this technique is implemented in DeepAR and Transformer - implemented in gluonTS: - https://github.com/awslabs/gluon-ts/blob/master/src/gluonts/torch/model/deepar/module.py """ encoder_properties = EncoderProperties() return encoder_properties diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py index 65fd5032c..f3286f827 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/components.py @@ -6,14 +6,44 @@ class EncoderProperties(NamedTuple): + """ + Encoder properties + + Args: + has_hidden_states (bool): + if the encoder has hidden states. 
An encoder with hidden states might have additional output + bijective_seq_output (bool): + if the encoder's output sequence has the same length as its input sequence's length + fixed_input_seq_length (bool): + if the encoder requries a fixed length of input (for instance, MLP) + lagged_input (bool): + if the encoder accepts past targets as additional features + is_casual (bool): + If the output of the encoder only depends on the past targets + """ has_hidden_states: bool = False bijective_seq_output: bool = True fixed_input_seq_length: bool = False lagged_input: bool = False - is_casual: bool = True # this value indicates if the output of the model only depends on the past targets + is_casual: bool = True class EncoderBlockInfo(NamedTuple): + """ + Encoder block infos + + Args: + encoder (nn.Module): + encoder network + encoder_properties (EncoderProperties): + encoder properties + encoder_input_shape (Tuple[int, ...]): + requried input shape of the encoder + encoder_output_shape (Tuple[int, ...]): + output shape that the encoder ought to output + n_hidden_states (int): + number of hidden states + """ encoder: nn.Module encoder_properties: EncoderProperties encoder_input_shape: Tuple[int, ...] @@ -34,6 +64,7 @@ def forward(self, Args: x: torch.Tensor(B, L_in, N) + input data output_seq (bool): if the network outputs a sequence tensor. If it is set True, output will be a 3-d Tensor (B, L_out, N). L_out = L_in if encoder_properties['recurrent'] is True. If this value is set as False, the network only returns the last item of the sequence. @@ -50,10 +81,12 @@ def get_last_seq_value(self, x: torch.Tensor) -> torch.Tensor: """ get the last value of the sequential output Args: - x: torch.Tensor(B, L, N): a sequential value output by the network, usually this value needs to be fed - to the decoder (or a 2D tensor for a flat encoder) + x (torch.Tensor(B, L, N)): + a sequential value output by the network, usually this value needs to be fed to the decoder + (or a 2D tensor for a flat encoder) Returns: - output: torch.Tensor(B, 1, M): last element of the sequential value (or a 2D tensor for flat encoder) + output (torch.Tensor(B, 1, M)): + last element of the sequential value (or a 2D tensor for flat encoder) """ raise NotImplementedError diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py index 8b3040c75..b428e1a16 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/other_components/TemporalFusion.py @@ -25,7 +25,9 @@ class TemporalFusion(autoPyTorchComponent): """ - Base class for network backbones. Holds the backbone module and the config which was used to create it. + Temporal Fusion layer. For details we refer to + Lim et al. 
Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting + https://arxiv.org/abs/1912.09363 """ _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"] diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index be41c2463..fdcf051bd 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -56,12 +56,14 @@ def get_partial_models(self, subset_features: List[int]) -> "_LearnedEntityEmbed """ extract a partial models that only works on a subset of the data that ought to be passed to the embedding network, this function is implemented for time series forecasting tasks where the known future features is only - a subset of the known past features + a subset of the past features Args: - subset_features: a set of index identifying which features will pass through the partial model + subset_features (List[int]): + a set of index identifying which features will pass through the partial model Returns: - partial_model (_LearnedEntityEmbedding) a new partial model + partial_model (_LearnedEntityEmbedding) + a new partial model """ num_input_features = self.num_input_features[subset_features] num_numerical_features = sum([sf < self.num_numerical for sf in subset_features]) @@ -114,6 +116,10 @@ def _create_ee_layers(self) -> nn.ModuleList: class PartialLearnedEntityEmbedding(_LearnedEntityEmbedding): + """ + Construct a partial Embedding network that is derived from a learned embedding network and only applied to a subset + of the input features. This is applied to forecasting tasks where not all the features might be known beforehand + """ def __init__(self, num_input_features: np.ndarray, num_numerical_features: int, diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py index e9734ab8c..2cc3178a9 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/distribution.py @@ -55,10 +55,12 @@ def build_single_proj_layer(arg_dim: int) -> nn.Module: and n_prediction_steps. 
we note that output_shape's first dimensions is always n_prediction_steps Args: - arg_dim (int): dimension of the target distribution + arg_dim (int): + dimension of the target distribution Returns: - proj_layer (nn.Module): projection layer that maps the decoder output to parameterize distributions + proj_layer (nn.Module): + projection layer that maps the decoder output to parameterize distributions """ if decoder_has_local_layer: return nn.Sequential( @@ -82,12 +84,12 @@ def forward(self, x: torch.Tensor) -> torch.distributions: """ get a target distribution Args: - x: input tensor ([batch_size, in_features]): input tensor, acquired by the base header, have the shape - [batch_size, in_features] + x: input tensor ([batch_size, in_features]): + input tensor, acquired by the base header, have the shape [batch_size, in_features] Returns: - dist: torch.distributions ([batch_size, n_prediction_steps, output_shape]): an output torch distribution - with shape (batch_size, n_prediction_steps, output_shape) + dist: torch.distributions ([batch_size, n_prediction_steps, output_shape]): + an output torch distribution with shape (batch_size, n_prediction_steps, output_shape) """ params_unbounded = [proj(x) for proj in self.proj] return self.dist_cls(*self.domain_map(*params_unbounded)) diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py index 8c59a4c32..7cfc0bbf9 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/forecasting_head.py @@ -64,8 +64,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: Builds the head component and assigns it to self.head Args: - X (X: Dict[str, Any]): Dependencies needed by current component to perform fit - y (Any): not used. To comply with sklearn API + X (X: Dict[str, Any]): + Dependencies needed by current component to perform fit + y (Any): + not used. To comply with sklearn API Returns: Self """ @@ -116,9 +118,11 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Adds the network head into the fit dictionary 'X' and returns it. Args: - X (Dict[str, Any]): 'X' dictionary + X (Dict[str, Any]): + 'X' dictionary Returns: - (Dict[str, Any]): the updated 'X' dictionary + (Dict[str, Any]): + the updated 'X' dictionary """ if self.head is not None: X.update({'network_head': self.head}) @@ -170,16 +174,24 @@ def build_head(self, # type: ignore[override] Builds the head module and returns it Args: - head_n_in_features (int): shape of the input to the head (usually the shape of the backbone output) - output_shape (Tuple[int, ...]): shape of the output of the head - decoder_has_local_layer (bool): if the decoder has local layer - net_output_type (str): network output type - dist_cls (Optional[str]): output distribution, only works if required_net_out_put_type is 'distribution' - n_prediction_heads (Dict): additional paramter for initializing architectures. 
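A minimal sketch, not the module's own code, of the kind of projection head described above: decoder features are mapped to the parameters of a torch distribution, one distribution per prediction step and target; a Normal with a softplus-constrained scale is assumed here:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal

class NormalProjectionHead(nn.Module):
    def __init__(self, in_features: int, n_prediction_steps: int, n_targets: int):
        super().__init__()
        self.out_shape = (n_prediction_steps, n_targets)
        self.proj_loc = nn.Linear(in_features, n_prediction_steps * n_targets)
        self.proj_scale = nn.Linear(in_features, n_prediction_steps * n_targets)

    def forward(self, x: torch.Tensor) -> Normal:
        # x: [batch_size, in_features] -> distribution over [batch_size, n_prediction_steps, n_targets]
        loc = self.proj_loc(x).view(-1, *self.out_shape)
        scale = F.softplus(self.proj_scale(x)).view(-1, *self.out_shape) + 1e-6
        return Normal(loc, scale)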
How many heads to predict - num_quantiles (int): number of quantile losses + head_n_in_features (int): + shape of the input to the head (usually the shape of the backbone output) + output_shape (Tuple[int, ...]): + shape of the output of the head + decoder_has_local_layer (bool): + if the decoder has local layer + net_output_type (str): + network output type + dist_cls (Optional[str]): + output distribution, only works if required_net_out_put_type is 'distribution' + n_prediction_heads (Dict): + additional paramter for initializing architectures. How many heads to predict + num_quantiles (int): + number of quantile losses Returns: - nn.Module: head module + nn.Module: + head module """ if net_output_type == 'distribution': assert dist_cls is not None diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 34750f08b..826396352 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -63,16 +63,25 @@ def __init__(self, """ initialize a dataloader Args: - batch_size: batch size - backcast (bool): if backcast is applied, where window_size is determined on the forecasting horizon - backcast_period (int): backcast period, window_size is computed by horizon * backcast_period - window_size(int): windows size, activate when backcast is false - num_batches_per_epoch (int): number of batches per epoch - n_prediction_steps (int): forecasting horizon - sample_strategy (str): sample strategy, if all the sequences are expected to be sampled with the same size + batch_size (int): + batch size + backcast (bool): + if backcast is applied, where window_size is determined on the forecasting horizon + backcast_period (int): + backcast period, window_size is computed by horizon * backcast_period + window_size(int): + windows size, activate when backcast is false + num_batches_per_epoch (int): + number of batches per epoch + n_prediction_steps (int): + forecasting horizon + sample_strategy (str): + sample strategy, if all the sequences are expected to be sampled with the same size or all the time steps are expected to be sampled with the same size - transform_time_features (bool): if time features are transformed - random_state (Optional[np.random.RandomState]): random states + transform_time_features (bool): + if time features are transformed + random_state (Optional[np.random.RandomState]): + random states """ super().__init__(batch_size=batch_size, random_state=random_state) @@ -141,11 +150,16 @@ def compute_expected_num_instances_per_seq(self, """ Compute the number of expected sample instances within each sequence. Args: - num_instances_dataset (int): number of all possible instances inside a dataset - seq_train_length (np.ndarray): length of each sequence - min_start (int): minimal number of start - fraction_seq (float): fraction of the sequence that will be sampled during training. - fraction_samples_per_seq (float): fraction of number of samples inside each series + num_instances_dataset (int): + number of all possible instances inside a dataset + seq_train_length (np.ndarray): + length of each sequence + min_start (int): + minimal number of start + fraction_seq (float): + fraction of the sequence that will be sampled during training. 
+ fraction_samples_per_seq (float): + fraction of number of samples inside each series Returns: num_instances_per_seqs (np.ndarray): expected number of instances to be sampled inside each sequence @@ -180,8 +194,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: Fits a component by using an input dictionary with pre-requisites Args: - X (X: Dict[str, Any]): Dependencies needed by current component to perform fit - y (Any): not used. To comply with sklearn API + X (X: Dict[str, Any]): + Dependencies needed by current component to perform fit + y (Any): + not used. To comply with sklearn API Returns: A instance of self @@ -334,8 +350,10 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform Method to build a transformation that can pre-process input data Args: - X (X: Dict[str, Any]): Dependencies needed by current component to perform fit - mode (str): train/val/test + X (X: Dict[str, Any]): + Dependencies needed by current component to perform fit + mode (str): + train/val/test Returns: A composition of transformations diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index 2313fc82a..d2c8d98eb 100644 --- a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -62,11 +62,14 @@ def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> n compute mase coefficient, then mase value is computed as mase_coefficient * mse_error, this function aims at reducing the memroy requirement Args: - past_target: Optional[List, np.ndarray] past target observations - sp: seasonality parameter to compute sp + past_target (Optional[List, np.ndarray]): + past target observations + sp (int): + seasonality parameter to compute sp Returns: - mase_coefficient: inverse of mase_denominator + mase_coefficient (np.ndarray): + inverse of mase_denominator """ past_target = np.nan_to_num(past_target) max_past_target_abs = np.max(np.abs(past_target)) diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 17aca5ca2..9f408b084 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -83,12 +83,16 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, Train the model for a single epoch. 
Args: - train_loader (torch.utils.data.DataLoader): generator of features/label - epoch (int): The current epoch used solely for tracking purposes + train_loader (torch.utils.data.DataLoader): + generator of features/label + epoch (int): + The current epoch used solely for tracking purposes Returns: - float: training loss - Dict[str, float]: scores for each desired metric + float: + training loss + Dict[str, float]: + scores for each desired metric """ loss_sum = 0.0 N = 0 @@ -131,12 +135,16 @@ def train_step(self, data: Dict[str, torch.Tensor], future_targets: Dict[str, to Allows to train 1 step of gradient descent, given a batch of train/labels Args: - data (torch.Tensor): input features to the network - targets (torch.Tensor): ground truth to calculate loss + data ( Dict[str, torch.Tensor]): + input features to the network + future_targets (Dict[str, torch.Tensor]): + ground truth to calculate loss Returns: - torch.Tensor: The predictions of the network - float: the loss incurred in the prediction + torch.Tensor: + The predictions of the network + float: + the loss incurred in the prediction """ past_observed_targets = data['past_observed_targets'] @@ -219,12 +227,16 @@ def evaluate(self, test_loader: torch.utils.data.DataLoader, epoch: int, Evaluate the model in both metrics and criterion Args: - test_loader (torch.utils.data.DataLoader): generator of features/label - epoch (int): the current epoch for tracking purposes + test_loader (torch.utils.data.DataLoader): + generator of features/label + epoch (int): + the current epoch for tracking purposes Returns: - float: test loss - Dict[str, float]: scores for each desired metric + float: + test loss + Dict[str, float]: + scores for each desired metric """ if not isinstance(self.model, (ForecastingDeepARNet, ForecastingSeq2SeqNet)): # To save time, we simply make one-step prediction for DeepAR and Seq2Seq diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 1be5a6cd8..e1de7a0c4 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -19,6 +19,7 @@ from autoPyTorch.constants import STRING_TO_TASK_TYPES from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.datasets.time_series_dataset import TimeSeriesSequence from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent @@ -69,9 +70,10 @@ class TimeSeriesForecastingPipeline(RegressorMixin, BasePipeline): Args: - config (Configuration) + config (Configuration): The configuration to evaluate. 
- random_state (Optional[RandomState): random_state is the random number generator + random_state (Optional[RandomState): + random_state is the random number generator Attributes: Examples @@ -96,21 +98,24 @@ def __init__(self, # model, so we comply with https://pytorch.org/docs/stable/notes/randomness.html torch.manual_seed(self.random_state.get_state()[1][0]) - def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None, **score_kwargs: Any) -> float: + def score(self, X: List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]], + y: np.ndarray, batch_size: Optional[int] = None, **score_kwargs: Any) -> float: """Scores the fitted estimator on (X, y) Args: - X (np.ndarray): input to the pipeline, from which to guess targets - batch_size (Optional[int]): batch_size controls whether the pipeline - will be called on small chunks of the data. Useful when calling the - predict method on the whole array X results in a MemoryError. + X (List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]]): + input to the pipeline, from which to guess targets + batch_size (Optional[int]): + batch_size controls whether the pipeline will be called on small chunks of the data. + Useful when calling the predict method on the whole array X results in a MemoryError. Returns: - np.ndarray: coefficient of determination R^2 of the prediction + np.ndarray: + coefficient of determination R^2 of the prediction """ from autoPyTorch.pipeline.components.training.metrics.utils import ( calculate_score, get_metrics) metrics = get_metrics(self.dataset_properties, ['mean_MAPE_forecasting']) - y_pred = self.predict(X, batch_size=batch_size) + y_pred = self.predict(X, batch_size=batch_size) # type: ignore[arg-types] r2 = calculate_score(y, y_pred, task_type=STRING_TO_TASK_TYPES[str(self.dataset_properties['task_type'])], metrics=metrics, **score_kwargs)['mean_MAPE_forecasting'] return r2 @@ -127,15 +132,16 @@ def _get_hyperparameter_search_space(self, explore. Args: - include (Optional[Dict[str, Any]]): what hyper-parameter configurations - to honor when creating the configuration space - exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations - to remove from the configuration space - dataset_properties (Optional[Dict[str, Union[str, int]]]): Characteristics - of the dataset to guide the pipeline choices of components + include (Optional[Dict[str, Any]]): + what hyper-parameter configurations to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): + what hyper-parameter configurations to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): + Characteristics of the dataset to guide the pipeline choices of components Returns: - cs (Configuration): The configuration space describing the TimeSeriesRegressionPipeline. + cs (Configuration): + The configuration space describing the TimeSeriesRegressionPipeline. """ cs = ConfigurationSpace() @@ -330,8 +336,8 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L before "network_backbone" such that Returns: - List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised - by the pipeline. + List[Tuple[str, autoPyTorchChoice]]: + list of steps sequentially exercised by the pipeline. 
""" steps = [] # type: List[Tuple[str, autoPyTorchChoice]] @@ -415,19 +421,19 @@ def _get_estimator_hyperparameter_name(self) -> str: return "time_series_forecasting" def predict(self, - X: Union[Dict[str, np.ndarray], pd.DataFrame], + X: List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]], # type: ignore[override] batch_size: Optional[int] = None) -> np.ndarray: """Predict the output using the selected model. Args: - X (np.ndarray): input data to the array - batch_size (Optional[int]): batch_size controls whether the pipeline will be - called on small chunks of the data. Useful when calling the - predict method on the whole array X results in a MemoryError. - transform_X (bool): if we want to transform - + X (List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]]): + input data to predict + batch_size (Optional[int]): + batch_size controls whether the pipeline will be called on small chunks of the data. + Useful when calling the predict method on the whole array X results in a MemoryError. Returns: - np.ndarray: the predicted values given input X + np.ndarray: + the predicted values given input X """ # Pre-process X diff --git a/autoPyTorch/utils/forecasting_time_features.py b/autoPyTorch/utils/forecasting_time_features.py deleted file mode 100644 index 40ccf8cf1..000000000 --- a/autoPyTorch/utils/forecasting_time_features.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. -# A copy of the License is located at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# or in the "license" file accompanying this file. This file is distributed -# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either -# express or implied. See the License for the specific language governing -# permissions and limitations under the License. 
- -# Frequencies used by GluonTS framework -FREQUENCY_MAP = { - "minutely": "1min", - "10_minutes": "10min", - "half_hourly": "30min", - "hourly": "1H", - "daily": "1D", - "weekly": "1W", - "monthly": "1M", - "quarterly": "1Q", - "yearly": "1Y" -} From f68dc18f9461e082331abf8c1852722b11ecf0d8 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 20:07:56 +0200 Subject: [PATCH 322/347] merge target scaler to one file --- .../TargetMaxAbsScaler.py | 17 ---- .../TargetMeanAbsScaler.py | 17 ---- .../TargetMinMaxScaler.py | 17 ---- .../TargetNoScaler.py | 17 ---- .../TargetStandardScaler.py | 17 ---- .../forecasting_target_scaling/__init__.py | 90 ------------------- .../base_target_scaler.py | 25 +++--- .../setup/network/forecasting_architecture.py | 3 +- .../setup/network/forecasting_network.py | 2 +- .../trainer/forecasting_trainer/__init__.py | 2 +- .../forecasting_base_trainer.py | 5 +- .../pipeline/time_series_forecasting.py | 5 +- .../test_forecasting_architecture.py | 4 +- .../test_forecasting_target_scaling.py | 56 ++---------- 14 files changed, 29 insertions(+), 248 deletions(-) delete mode 100644 autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMaxAbsScaler.py delete mode 100644 autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMeanAbsScaler.py delete mode 100644 autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMinMaxScaler.py delete mode 100644 autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetNoScaler.py delete mode 100644 autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMaxAbsScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMaxAbsScaler.py deleted file mode 100644 index 7e913621e..000000000 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMaxAbsScaler.py +++ /dev/null @@ -1,17 +0,0 @@ -from typing import Any, Dict, Optional, Union - -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import \ - BaseTargetScaler - - -class TargetMaxAbsScaler(BaseTargetScaler): - @property - def scaler_mode(self) -> str: - return 'max_abs' - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'TargetMaxAbsScaler', - 'name': 'TargetMaxAbsScaler' - } diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMeanAbsScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMeanAbsScaler.py deleted file mode 100644 index 38ec595ba..000000000 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMeanAbsScaler.py +++ /dev/null @@ -1,17 +0,0 @@ -from typing import Any, Dict, Optional, Union - -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import \ - BaseTargetScaler - - -class TargetMeanAbsScaler(BaseTargetScaler): - @property - def scaler_mode(self) -> str: - return 'mean_abs' - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'TargetMeanAbsScaler', - 'name': 'TargetMeanAbsScaler' - } diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMinMaxScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMinMaxScaler.py deleted file mode 100644 index 6e0319fb8..000000000 
--- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetMinMaxScaler.py +++ /dev/null @@ -1,17 +0,0 @@ -from typing import Any, Dict, Optional, Union - -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import \ - BaseTargetScaler - - -class TargetMinMaxScaler(BaseTargetScaler): - @property - def scaler_mode(self) -> str: - return 'min_max' - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'TargetMinMaxScaler', - 'name': 'TargetMinMaxScaler' - } diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetNoScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetNoScaler.py deleted file mode 100644 index b8eef62ce..000000000 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetNoScaler.py +++ /dev/null @@ -1,17 +0,0 @@ -from typing import Any, Dict, Optional, Union - -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import \ - BaseTargetScaler - - -class TargetNoScaler(BaseTargetScaler): - @property - def scaler_mode(self) -> str: - return 'none' - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'TargetNoScaler', - 'name': 'TargetNoScaler' - } diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py deleted file mode 100644 index 56ed89f48..000000000 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/TargetStandardScaler.py +++ /dev/null @@ -1,17 +0,0 @@ -from typing import Any, Dict, Optional, Union - -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import \ - BaseTargetScaler - - -class TargetStandardScaler(BaseTargetScaler): - @property - def scaler_mode(self) -> str: - return 'standard' - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'TargetStandardScaler', - 'name': 'TargetStandardScaler' - } diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py index 4bf851717..e69de29bb 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/__init__.py @@ -1,90 +0,0 @@ -import os -from collections import OrderedDict -from typing import Dict, List, Optional - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import ConfigurationSpace - -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.base_component import ( - ThirdPartyComponents, autoPyTorchComponent, find_components) -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import \ - BaseTargetScaler - -scaling_directory = os.path.split(__file__)[0] -_scalers = find_components(__package__, - scaling_directory, - BaseTargetScaler) - -_addons = ThirdPartyComponents(BaseTargetScaler) - - -def add_scaler(scaler: BaseTargetScaler) -> None: - _addons.add_component(scaler) - - -class TargetScalerChoice(autoPyTorchChoice): - """ - Allows 
for dynamically choosing scale component at runtime, Hence we consider it as part of "setup", not - "preprocessing" - """ - - def get_components(self) -> Dict[str, autoPyTorchComponent]: - """Returns the available scaler components - - Args: - None - - Returns: - Dict[str, autoPyTorchComponent]: all BaseScalers components available - as choices for scaling - """ - components = OrderedDict() - components.update(_scalers) - components.update(_addons.components) - return components - - def get_hyperparameter_search_space(self, - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - default: Optional[str] = None, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None) -> ConfigurationSpace: - cs = ConfigurationSpace() - - if dataset_properties is None: - dataset_properties: Dict[str, BaseDatasetPropertiesType] = self.dataset_properties # type: ignore - - available_scalers = self.get_available_components(dataset_properties=dataset_properties, - include=include, - exclude=exclude) - - if len(available_scalers) == 0: - raise ValueError("no scalers found, please add a scaler") - - if default is None: - defaults = ['TargetStandardScaler', 'TargetMinMaxScaler', 'TargetMaxAbsScaler', 'TargetNoScaler'] - for default_ in defaults: - if default_ in available_scalers: - default = default_ - break - - # add only no scaler to choice hyperparameters in case the dataset is only categorical - - preprocessor = CSH.CategoricalHyperparameter('__choice__', - list(available_scalers.keys()), - default_value=default) - cs.add_hyperparameter(preprocessor) - - # add only child hyperparameters of early_preprocessor choices - for name in preprocessor.choices: - updates = self._get_search_space_updates(prefix=name) - config_space = available_scalers[name].get_hyperparameter_search_space(dataset_properties, # type:ignore - **updates) - parent_hyperparameter = {'parent': preprocessor, 'value': name} - cs.add_configuration_space(name, config_space, - parent_hyperparameter=parent_hyperparameter) - - self.configuration_space = cs - self.dataset_properties = dataset_properties # type: ignore[assignment] - return cs diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py index 3f22bb692..89557b402 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py @@ -1,6 +1,6 @@ from typing import Any, Dict, Optional, Union -from ConfigSpace import ConfigurationSpace +from ConfigSpace import ConfigurationSpace, CategoricalHyperparameter import numpy as np @@ -11,14 +11,17 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.utils import \ - TargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.utils import TargetScaler +from autoPyTorch.utils.common import add_hyperparameter, HyperparameterSearchSpace class BaseTargetScaler(autoPyTorchComponent): - def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): + def __init__(self, + random_state: Optional[Union[np.random.RandomState, int]] = None, + scaling_mode: str = 'none'): super().__init__() self.random_state = random_state + self.scaling_mode = 
scaling_mode self.preprocessor: Optional[Pipeline] = None def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: @@ -34,13 +37,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: an instance of self """ self.check_requirements(X, y) - self.scaler = TargetScaler(mode=self.scaler_mode) + self.scaler = TargetScaler(mode=self.scaling_mode) return self - @property - def scaler_mode(self) -> str: - raise NotImplementedError - def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ Adds the time series transformer to fit dictionary @@ -75,7 +74,13 @@ def __call__(self, @staticmethod def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + scaling_mode: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='scaling_mode', + value_range=("standard", "min_max", "max_abs", "mean_abs", "none"), + default_value="standard", + ), ) -> ConfigurationSpace: cs = ConfigurationSpace() + add_hyperparameter(cs, scaling_mode, CategoricalHyperparameter) return cs diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 424fdca54..9d655d76d 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -6,8 +6,7 @@ from torch import nn from torch.distributions import AffineTransform, TransformedDistribution -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import \ - BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.cells import ( StackedDecoder, StackedEncoder, diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 251553fce..2e17d2f73 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -6,7 +6,7 @@ from torch import nn from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent from autoPyTorch.pipeline.components.setup.network.forecasting_architecture import ( ForecastingDeepARNet, diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py index 320cf6bd1..30b504ec3 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/__init__.py @@ -11,7 +11,7 @@ autoPyTorchComponent, find_components ) -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics from 
autoPyTorch.pipeline.components.training.trainer import TrainerChoice from autoPyTorch.pipeline.components.training.trainer.base_trainer import BudgetTracker diff --git a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py index 9f408b084..858cf775b 100644 --- a/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/forecasting_trainer/forecasting_base_trainer.py @@ -11,8 +11,7 @@ from torch.utils.tensorboard.writer import SummaryWriter from autoPyTorch.constants import FORECASTING_TASKS, REGRESSION_TASKS -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import BaseTargetScaler -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetNoScaler import TargetNoScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit from autoPyTorch.pipeline.components.setup.network.forecasting_network import ( ForecastingDeepARNet, @@ -44,7 +43,7 @@ def prepare( # type: ignore[override] step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch, window_size: int = 20, dataset_properties: Dict = {}, - target_scaler: BaseTargetScaler = TargetNoScaler(), + target_scaler: BaseTargetScaler = BaseTargetScaler(), backcast_loss_ratio: Optional[float] = None, ) -> None: # metrics_during_training is not appliable when computing scaled values diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index e1de7a0c4..1edcc1d03 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -38,7 +38,7 @@ TimeSeriesEarlyPreprocessing, TimeSeriesTargetEarlyPreprocessing ) -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import TargetScalerChoice +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler from autoPyTorch.pipeline.components.setup.forecasting_training_loss import ForecastingLossChoices from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.forecasting_network import ForecastingNetworkComponent @@ -359,8 +359,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L ("target_imputer", TimeSeriesTargetImputer(random_state=self.random_state)), ("target_preprocessing", TimeSeriesTargetEarlyPreprocessing(random_state=self.random_state)), ('loss', ForecastingLossChoices(default_dataset_properties, random_state=self.random_state)), - ("target_scaler", TargetScalerChoice(default_dataset_properties, - random_state=self.random_state)), + ("target_scaler", BaseTargetScaler(random_state=self.random_state)), ("data_loader", TimeSeriesForecastingDataLoader(random_state=self.random_state)), ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties, random_state=self.random_state)), diff --git a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py index 812a40e89..252fe7d1d 100644 --- 
a/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py +++ b/test/test_pipeline/components/setup/forecasting/forecasting_networks/test_forecasting_architecture.py @@ -7,7 +7,7 @@ import torch -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetStandardScaler import TargetStandardScaler +from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler from autoPyTorch.pipeline.components.setup.network.forecasting_architecture import ( AbstractForecastingNet, get_lagged_subsequences, @@ -116,7 +116,7 @@ def test_network_forward(self, fit_dictionary = copy.copy(self.fit_dictionary) fit_dictionary['dataset_properties'] = dataset_properties - fit_dictionary['target_scaler'] = TargetStandardScaler().fit(fit_dictionary) + fit_dictionary['target_scaler'] = BaseTargetScaler(scaling_mode='standard').fit(fit_dictionary) if net_output_type.startswith("distribution"): fit_dictionary['dist_forecasting_strategy'] = DisForecastingStrategy( diff --git a/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py b/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py index d094e1933..33fb37960 100644 --- a/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py +++ b/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py @@ -3,59 +3,13 @@ import torch -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling import TargetScalerChoice -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetMaxAbsScaler import TargetMaxAbsScaler -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetMeanAbsScaler import TargetMeanAbsScaler -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetMinMaxScaler import TargetMinMaxScaler -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetNoScaler import TargetNoScaler -from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.TargetStandardScaler import TargetStandardScaler from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.base_target_scaler import BaseTargetScaler class TestTargetScalar(unittest.TestCase): - def test_get_set_config_space(self): - """Make sure that we can setup a valid choice in the encoder - choice""" - rescaler_choice = TargetScalerChoice({}) - cs = rescaler_choice.get_hyperparameter_search_space() - - # Make sure that all hyperparameters are part of the search space - self.assertListEqual( - sorted(cs.get_hyperparameter('__choice__').choices), - sorted(list(rescaler_choice.get_components().keys())) - ) - - # Make sure we can properly set some random configs - # Whereas just one iteration will make sure the algorithm works, - # doing five iterations increase the confidence. 
We will be able to - # catch component specific crashes - for i in range(5): - config = cs.sample_configuration() - config_dict = copy.deepcopy(config.get_dictionary()) - rescaler_choice.set_hyperparameters(config) - - self.assertEqual(rescaler_choice.choice.__class__, - rescaler_choice.get_components()[config_dict['__choice__']]) - - # Then check the choice configuration - selected_choice = config_dict.pop('__choice__', None) - for key, value in config_dict.items(): - # Remove the selected_choice string from the parameter - # so we can query in the object for it - key = key.replace(selected_choice + ':', '') - self.assertIn(key, vars(rescaler_choice.choice)) - self.assertEqual(value, rescaler_choice.choice.__dict__[key]) - - include = ['TargetMeanAbsScaler', 'TargetMaxAbsScaler'] - cs = rescaler_choice.get_hyperparameter_search_space(include=include) - self.assertTrue( - sorted(cs.get_hyperparameter('__choice__').choices), - sorted(include), - ) - def test_target_no_scalar(self): X = {'dataset_properties': {}} - scalar = TargetNoScaler() + scalar = BaseTargetScaler(scaling_mode='none') scalar = scalar.fit(X) X = scalar.transform(X) self.assertIsInstance(X['target_scaler'], BaseTargetScaler) @@ -77,7 +31,7 @@ def test_target_no_scalar(self): def test_target_mean_abs_scalar(self): X = {'dataset_properties': {}} - scalar = TargetMeanAbsScaler() + scalar = BaseTargetScaler(scaling_mode='mean_abs') scalar = scalar.fit(X) X = scalar.transform(X) self.assertIsInstance(X['target_scaler'], BaseTargetScaler) @@ -144,7 +98,7 @@ def test_target_mean_abs_scalar(self): def test_target_standard_scalar(self): X = {'dataset_properties': {}} - scalar = TargetStandardScaler() + scalar = BaseTargetScaler(scaling_mode='standard') scalar = scalar.fit(X) X = scalar.transform(X) self.assertIsInstance(X['target_scaler'], BaseTargetScaler) @@ -227,7 +181,7 @@ def test_target_standard_scalar(self): def test_target_min_max_scalar(self): X = {'dataset_properties': {}} - scalar = TargetMinMaxScaler() + scalar = BaseTargetScaler(scaling_mode='min_max') scalar = scalar.fit(X) X = scalar.transform(X) self.assertIsInstance(X['target_scaler'], BaseTargetScaler) @@ -294,7 +248,7 @@ def test_target_min_max_scalar(self): def test_target_max_abs_scalar(self): X = {'dataset_properties': {}} - scalar = TargetMaxAbsScaler() + scalar = BaseTargetScaler(scaling_mode='max_abs') scalar = scalar.fit(X) X = scalar.transform(X) self.assertIsInstance(X['target_scaler'], BaseTargetScaler) From dc4f5102a933ceffc9c8569ff3898900283a37c6 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 20:31:29 +0200 Subject: [PATCH 323/347] fix forecasting init cfgs --- autoPyTorch/configs/forecasting_init_cfgs.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/configs/forecasting_init_cfgs.json b/autoPyTorch/configs/forecasting_init_cfgs.json index 685ce35c2..526fb3cfe 100644 --- a/autoPyTorch/configs/forecasting_init_cfgs.json +++ b/autoPyTorch/configs/forecasting_init_cfgs.json @@ -16,7 +16,7 @@ "optimizer:AdamOptimizer:beta2": 0.999, "network_init:__choice__": "XavierInit", "network_init:XavierInit:bias_strategy": "Normal", - "target_scaler:__choice__": "TargetMeanAbsScaler", + "target_scaler:scaling_mode": "mean_abs", "trainer:__choice__": "ForecastingStandardTrainer", "network_embedding:__choice__": "NoEmbedding" }, @@ -182,7 +182,7 @@ "network_backbone:seq_encoder:block_1:TransformerDecoder:layer_norm_eps_output": 1e-05 }, "NBEATS-I": { - "target_scaler:__choice__": "TargetNoScaler", + 
"target_scaler:scaling_mode": "none", "data_loader:backcast": true, "data_loader:backcast_period": 2, "loss:__choice__": "RegressionLoss", @@ -230,7 +230,7 @@ }, "TemoporalFusionTransformer": { "loss:__choice__": "QuantileLoss", - "target_scaler:__choice__": "TargetStandardScaler", + "target_scaler:scaling_mode": "standard", "data_loader:transform_time_features": true, "loss:QuantileLoss:lower_quantile": 0.1, "loss:QuantileLoss:upper_quantile": 0.9, From 951ef4e9ec14aa4c7c3da277feee7078548b84c3 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 20:32:03 +0200 Subject: [PATCH 324/347] remove redudant pipeline configs --- autoPyTorch/api/time_series_forecasting.py | 36 ++++++++++------------ 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 8ab1ebef0..0c840fb00 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -9,19 +9,22 @@ from autoPyTorch.constants import MAX_WINDOW_SIZE_BASE, TASK_TYPES_TO_STRING, TIMESERIES_FORECASTING from autoPyTorch.data.time_series_forecasting_validator import \ TimeSeriesForecastingInputValidator -from autoPyTorch.data.utils import (DatasetCompressionSpec, - get_dataset_compression_mapping) -from autoPyTorch.datasets.base_dataset import (BaseDataset, - BaseDatasetPropertiesType) -from autoPyTorch.datasets.resampling_strategy import (CrossValTypes, - HoldoutValTypes, - ResamplingStrategies) -from autoPyTorch.datasets.time_series_dataset import ( - TimeSeriesForecastingDataset, TimeSeriesSequence) -from autoPyTorch.pipeline.time_series_forecasting import \ - TimeSeriesForecastingPipeline -from autoPyTorch.utils.hyperparameter_search_space_update import \ - HyperparameterSearchSpaceUpdates +from autoPyTorch.data.utils import ( + DatasetCompressionSpec, + get_dataset_compression_mapping +) +from autoPyTorch.datasets.base_dataset import ( + BaseDataset, + BaseDatasetPropertiesType +) +from autoPyTorch.datasets.resampling_strategy import ( + CrossValTypes, + HoldoutValTypes, + ResamplingStrategies +) +from autoPyTorch.datasets.time_series_dataset import TimeSeriesForecastingDataset, TimeSeriesSequence +from autoPyTorch.pipeline.time_series_forecasting import TimeSeriesForecastingPipeline +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates class TimeSeriesForecastingTask(BaseTask): @@ -69,9 +72,7 @@ def __init__( delete_output_folder_after_terminate: bool = True, include_components: Optional[Dict] = None, exclude_components: Optional[Dict] = None, - resampling_strategy: Union[ - CrossValTypes, HoldoutValTypes - ] = HoldoutValTypes.time_series_hold_out_validation, + resampling_strategy: ResamplingStrategies = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, backend: Optional[Backend] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, @@ -95,9 +96,6 @@ def __init__( search_space_updates=search_space_updates, task_type=TASK_TYPES_TO_STRING[TIMESERIES_FORECASTING], ) - # here fraction of subset could be number of images, tabular data or resolution of time-series datasets. 
- # TODO if budget type resolution is applied to all datasets, we will put it to configs - self.pipeline_options.update({"min_resolution": 0.1, "full_resolution": 1.0}) self.customized_window_size = False if self.search_space_updates is not None: From 10f0c833be9e4fd8cb912e3a6ce1b57ff8b2e120 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Tue, 14 Jun 2022 20:55:26 +0200 Subject: [PATCH 325/347] maint --- autoPyTorch/api/time_series_forecasting.py | 110 ++++++++++++------ autoPyTorch/data/tabular_target_validator.py | 3 + autoPyTorch/datasets/time_series_dataset.py | 2 +- autoPyTorch/ensemble/ensemble_builder.py | 2 +- autoPyTorch/evaluation/abstract_evaluator.py | 6 +- autoPyTorch/evaluation/tae.py | 66 +++++------ autoPyTorch/optimizer/smbo.py | 36 +++--- .../TimeSeriesTransformer.py | 7 +- .../base_time_series_preprocessing.py | 4 - .../encoding/NoEncoder.py | 1 + .../encoding/time_series_base_encoder.py | 1 + .../imputation/TimeSeriesImputer.py | 3 + .../scaling/base_scaler.py | 1 + .../time_series_preprocessing/utils.py | 4 + .../TimeSeriesEarlyPreProcessing.py | 1 + .../setup/early_preprocessor/utils.py | 1 + .../forecasting_training_loss/__init__.py | 29 ++--- .../components/setup/network/base_network.py | 2 +- .../setup/network/forecasting_architecture.py | 91 +++++++++------ .../setup/network/forecasting_network.py | 22 ---- 20 files changed, 229 insertions(+), 163 deletions(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 0c840fb00..688b67240 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -172,8 +172,8 @@ def _get_dataset_input_validator( **forecasting_dataset_kwargs: Any, ) -> Tuple[TimeSeriesForecastingDataset, TimeSeriesForecastingInputValidator]: """ - Returns an object of `TabularDataset` and an object of - `TabularInputValidator` according to the current task. + Returns an object of `TimeSeriesForecastingDataset` and an object of + `TimeSeriesForecastingInputValidator` according to the current task. Args: X_train (Union[List, pd.DataFrame, np.ndarray]): @@ -196,16 +196,26 @@ def _get_dataset_input_validator( dataset_compression (Optional[DatasetCompressionSpec]): specifications for dataset compression. For more info check documentation for `BaseTask.get_dataset`. - freq: Optional[Union[str, int, List[int]]] + freq (Optional[Union[str, int, List[int]]]): frequency information, it determines the configuration space of the window size, if it is not given, we will use the default configuration + start_times (Optional[List[pd.DatetimeIndex]]): + starting time of each series when they are sampled. If it is not given, we simply start with a fixed + timestamp. + series_idx (Optional[Union[List[Union[str, int]], str, int]]): + (only works if X is stored as pd.DataFrame). This value is applied to identify to which series the data + belongs if the data is presented as a "chunk" dataframe + n_prediction_steps (int): + The number of steps you want to forecast into the future (forecast horizon) + known_future_features (Optional[Union[Tuple[Union[str, int]], Tuple[()]]]): + future features that are known in advance. For instance, holidays. forecasting_kwargs (Any) kwargs for forecasting dataset, for more details, please check ```datasets/time_series_dataset.py``` Returns: - TabularDataset: + TimeSeriesForecastingDataset: the dataset object. - TabularInputValidator: + TimeSeriesForecastingInputValidator: the input validator fitted on the data. 
""" @@ -296,6 +306,7 @@ def search( Fit both optimizes the machine learning models and builds an ensemble out of them. To disable ensembling, set ensemble_size==0. using the optimizer. + Args: optimize_metric (str): name of the metric that is used to evaluate a pipeline. @@ -429,30 +440,7 @@ def search( ) if not self.customized_window_size: - base_window_size = int(np.ceil(self.dataset.base_window_size)) - # we don't want base window size to large, which might cause a too long computation time, in which case - # we will use n_prediction_step instead (which is normally smaller than base_window_size) - if base_window_size > MAX_WINDOW_SIZE_BASE: - # TODO considering padding to allow larger upper_window_size !!! - if n_prediction_steps > MAX_WINDOW_SIZE_BASE: - base_window_size = 50 - else: - base_window_size = n_prediction_steps - - if self.search_space_updates is None: - self.search_space_updates = HyperparameterSearchSpaceUpdates() - - window_size_scales = [1, 3] - - self.search_space_updates.append( - node_name="data_loader", - hyperparameter="window_size", - value_range=[ - int(window_size_scales[0] * base_window_size), - int(window_size_scales[1] * base_window_size), - ], - default_value=int(np.ceil(1.25 * base_window_size)), - ) + self.update_sliding_window_size(n_prediction_steps=n_prediction_steps) self._metrics_kwargs = { "sp": self.dataset.seasonality, @@ -491,14 +479,32 @@ def predict( batch_size: Optional[int] = None, n_jobs: int = 1, past_targets: Optional[List[np.ndarray]] = None, - future_targets: Optional[ - List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]] - ] = None, + future_targets: Optional[List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]]] = None, start_times: List[pd.DatetimeIndex] = [], ) -> np.ndarray: """ - target_variables: Optional[Union[Tuple[int], Tuple[str], np.ndarray]] = None, - (used for multi-variable prediction), indicates which value needs to be predicted + Predict the future varaibles + + Args: + X_test (List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]]) + if it is a list of TimeSeriesSequence, then it is the series to be forecasted. Otherwise, it is the + known future features + batch_size: Optional[int] + batch size + n_jobs (int): + number of jobs + past_targets (Optional[List[np.ndarray]]) + past observed targets, required when X_test is not a list of TimeSeriesSequence + future_targets (Optional[List[Union[np.ndarray, pd.DataFrame, TimeSeriesSequence]]]): + future targets (test sets) + start_times (List[pd.DatetimeIndex]): + starting time of each series when they are sampled. If it is not given, we simply start with a fixed + timestamp. 
+ + Return: + np.ndarray + predicted value, it needs to be with shape (B, H, N), + B is the number of series, H is forecasting horizon (n_prediction_steps), N is the number of targets """ if X_test is None or not isinstance(X_test[0], TimeSeriesSequence): assert past_targets is not None @@ -513,6 +519,7 @@ def predict( flattened_res = super(TimeSeriesForecastingTask, self).predict( X_test, batch_size, n_jobs ) + # forecasting result from each series is stored as an array if self.dataset.num_targets == 1: forecasting = flattened_res.reshape([-1, self.dataset.n_prediction_steps]) else: @@ -528,3 +535,38 @@ def predict( ) return forecasting * std + mean return forecasting + + def update_sliding_window_size(self, n_prediction_steps: int): + """ + the size of the sliding window is heavily dependent on the dataset, + so we only update them when we get the information from the + + Args: + n_prediction_steps (int): + forecast horizon. Sometimes we could also make our base sliding window size based on the + forecast horizon + """ + base_window_size = int(np.ceil(self.dataset.base_window_size)) + # we don't want base window size to large, which might cause a too long computation time, in which case + # we will use n_prediction_step instead (which is normally smaller than base_window_size) + if base_window_size > MAX_WINDOW_SIZE_BASE: + # TODO considering padding to allow larger upper_window_size !!! + if n_prediction_steps > MAX_WINDOW_SIZE_BASE: + base_window_size = 50 + else: + base_window_size = n_prediction_steps + + if self.search_space_updates is None: + self.search_space_updates = HyperparameterSearchSpaceUpdates() + + window_size_scales = [1, 3] + + self.search_space_updates.append( + node_name="data_loader", + hyperparameter="window_size", + value_range=[ + int(window_size_scales[0] * base_window_size), + int(window_size_scales[1] * base_window_size), + ], + default_value=int(np.ceil(1.25 * base_window_size)), + ) diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py index e34695e3c..3f1aa2f96 100644 --- a/autoPyTorch/data/tabular_target_validator.py +++ b/autoPyTorch/data/tabular_target_validator.py @@ -32,6 +32,9 @@ def _check_and_to_array(y: SupportedTargetTypes, allow_nan: bool = False) -> Arr def _modify_regression_target(y: ArrayType, allow_nan: bool = False) -> ArrayType: # Regression targets must have numbers after a decimal point. # Ref: https://github.com/scikit-learn/scikit-learn/issues/8952 + + # For forecasting tasks, missing targets are allowed. Our TimeSeriesTargetValidator is inherent from + # TabularTargetValidator, if this function is called by TimeSeriesTargetValidator, we will allow nan here if allow_nan: y = ma.masked_where(np.isnan(y), y, 1e12) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index f00b6401f..022e6360e 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -463,7 +463,7 @@ class TimeSeriesForecastingDataset(BaseDataset, ConcatDataset): start_times (Optional[List[pd.DatetimeIndex]]): starting time of each series when they are sampled. If it is not given, we simply start with a fixed timestamp. series_idx (Optional[Union[List[Union[str, int]], str, int]]): - (only works if X is stored as pd.DataFrame). This value is applied to identify which series the data belongs to + (only works if X is stored as pd.DataFrame). 
This value is applied to identify towhich series the data belongs if the data is presented as a "chunk" dataframe known_future_features (Optional[Union[Tuple[Union[str, int]], Tuple[()]]]): future features that are known in advance. For instance, holidays. diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py index 83a488e0d..662718873 100644 --- a/autoPyTorch/ensemble/ensemble_builder.py +++ b/autoPyTorch/ensemble/ensemble_builder.py @@ -460,7 +460,7 @@ def __init__( unittest.mock work through the pynisher with all spawn contexts. If you know a better solution, please let us know by opening an issue. metric_kwargs: Dict - additional arguments for computing metrics, this is used for time series forecasting computation + additional arguments for computing metrics, this is used for time series forecasting computation """ super(EnsembleBuilder, self).__init__() diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index e482973d8..44a0b4e99 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -343,7 +343,7 @@ def fit(self, X: Dict[str, Any], y: Any, y_train = subsampler(X['y_train'], X['train_indices']) return DummyClassifier.fit(self, np.ones((y_train.shape[0], 1)), y_train, sample_weight) - def _genreate_dummy_forecasting(self, X: List[Union[TimeSeriesSequence, np.ndarray]]) -> List: + def _generate_dummy_forecasting(self, X: List[Union[TimeSeriesSequence, np.ndarray]]) -> List: if isinstance(X[0], TimeSeriesSequence): X_tail = [x.get_target_values(-1) for x in X] else: @@ -352,12 +352,12 @@ def _genreate_dummy_forecasting(self, X: List[Union[TimeSeriesSequence, np.ndarr def predict_proba(self, X: Union[np.ndarray, pd.DataFrame], batch_size: int = 1000) -> np.ndarray: - X_tail = self._genreate_dummy_forecasting(X) + X_tail = self._generate_dummy_forecasting(X) return np.tile(X_tail, (1, self.n_prediction_steps)).astype(np.float32).squeeze() def predict(self, X: Union[np.ndarray, pd.DataFrame], batch_size: int = 1000) -> np.ndarray: - X_tail = np.asarray(self._genreate_dummy_forecasting(X)) + X_tail = np.asarray(self._generate_dummy_forecasting(X)) if X_tail.ndim == 1: X_tail = np.expand_dims(X_tail, -1) return np.tile(X_tail, (1, self.n_prediction_steps)).astype(np.float32).squeeze() diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 9c5f349a7..299cd4810 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -110,30 +110,30 @@ class ExecuteTaFuncWithQueue(AbstractTAFunc): """ def __init__( - self, - backend: Backend, - seed: int, - metric: autoPyTorchMetric, - cost_for_crash: float, - abort_on_first_run_crash: bool, - pynisher_context: str, - multi_objectives: List[str], - pipeline_config: Optional[Dict[str, Any]] = None, - initial_num_run: int = 1, - stats: Optional[Stats] = None, - run_obj: str = 'quality', - par_factor: int = 1, - output_y_hat_optimization: bool = True, - include: Optional[Dict[str, Any]] = None, - exclude: Optional[Dict[str, Any]] = None, - memory_limit: Optional[int] = None, - disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, - init_params: Dict[str, Any] = None, - budget_type: str = None, - ta: Optional[Callable] = None, - logger_port: int = None, - all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + self, + backend: Backend, + seed: int, + metric: autoPyTorchMetric, 
+ cost_for_crash: float, + abort_on_first_run_crash: bool, + pynisher_context: str, + multi_objectives: List[str], + pipeline_config: Optional[Dict[str, Any]] = None, + initial_num_run: int = 1, + stats: Optional[Stats] = None, + run_obj: str = 'quality', + par_factor: int = 1, + output_y_hat_optimization: bool = True, + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + memory_limit: Optional[int] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + init_params: Dict[str, Any] = None, + budget_type: str = None, + ta: Optional[Callable] = None, + logger_port: int = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, ): self.backend = backend @@ -238,8 +238,8 @@ def _check_and_get_default_budget(self) -> float: return budget_choices[budget_type] def run_wrapper( - self, - run_info: RunInfo, + self, + run_info: RunInfo, ) -> Tuple[RunInfo, RunValue]: """ wrapper function for ExecuteTARun.run_wrapper() to cap the target algorithm @@ -297,13 +297,13 @@ def run_wrapper( return run_info, run_value def run( - self, - config: Configuration, - instance: Optional[str] = None, - cutoff: Optional[float] = None, - seed: int = 12345, - budget: float = 0.0, - instance_specific: Optional[str] = None, + self, + config: Configuration, + instance: Optional[str] = None, + cutoff: Optional[float] = None, + seed: int = 12345, + budget: float = 0.0, + instance_specific: Optional[str] = None, ) -> Tuple[StatusType, float, float, Dict[str, Any]]: context = multiprocessing.get_context(self.pynisher_context) diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index b5a986bfe..3c1acd1c0 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -274,21 +274,7 @@ def __init__(self, initial_configurations = [] if STRING_TO_TASK_TYPES.get(self.task_type, -1) == TIMESERIES_FORECASTING: - suggested_init_models: Optional[List[str]] = kwargs.get('suggested_init_models', # type:ignore[assignment] - None) - custom_init_setting_path: Optional[str] = kwargs.get('custom_init_setting_path', # type:ignore[assignment] - None) - # if suggested_init_models is an empty list, and custom_init_setting_path is not provided, we - # do not provide any initial configurations - if suggested_init_models is None or suggested_init_models or custom_init_setting_path is not None: - datamanager: BaseDataset = self.backend.load_datamanager() - dataset_properties = datamanager.get_dataset_properties([]) - initial_configurations = read_forecasting_init_configurations( - config_space=config_space, - suggested_init_models=suggested_init_models, - custom_init_setting_path=custom_init_setting_path, - dataset_properties=dataset_properties - ) + initial_configurations = self.get_init_configs_for_forecasting(config_space, kwargs) # proxy-validation sets self.min_num_test_instances: Optional[int] = kwargs.get('min_num_test_instances', # type:ignore[assignment] None) @@ -437,3 +423,23 @@ def run_smbo(self, func: Optional[Callable] = None raise NotImplementedError(type(smac.solver.tae_runner)) return self.runhistory, self.trajectory, self._budget_type + + def get_init_configs_for_forecasting(self, config_space: ConfigSpace, kwargs: Dict) -> List[Configuration]: + """get initial configurations for forecasting tasks""" + suggested_init_models: Optional[List[str]] = kwargs.get('suggested_init_models', # type:ignore[assignment] + None) + custom_init_setting_path: 
Optional[str] = kwargs.get('custom_init_setting_path', # type:ignore[assignment] + None) + # if suggested_init_models is an empty list, and custom_init_setting_path is not provided, we + # do not provide any initial configurations + if suggested_init_models is None or suggested_init_models or custom_init_setting_path is not None: + datamanager: BaseDataset = self.backend.load_datamanager() + dataset_properties = datamanager.get_dataset_properties([]) + initial_configurations = read_forecasting_init_configurations( + config_space=config_space, + suggested_init_models=suggested_init_models, + custom_init_setting_path=custom_init_setting_path, + dataset_properties=dataset_properties + ) + return initial_configurations + return [] diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py index 65c1cf1f3..ecca60570 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/TimeSeriesTransformer.py @@ -29,11 +29,12 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: """ Creates a column transformer for the chosen tabular preprocessors + Args: X (Dict[str, Any]): fit dictionary Returns: - "TabularColumnTransformer": an instance of self + "TimeSeriesFeatureTransformer": an instance of self """ self.check_requirements(X, y) @@ -71,6 +72,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ Adds the time series transformer to fit dictionary + Args: X (Dict[str, Any]): fit dictionary @@ -111,11 +113,12 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: """ Creates a column transformer for the chosen tabular preprocessors + Args: X (Dict[str, Any]): fit dictionary Returns: - "TabularColumnTransformer": an instance of self + "TimeSeriesTargetTransformer": an instance of self """ self.check_requirements(X, y) diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py index ad327d14c..e924d360d 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/base_time_series_preprocessing.py @@ -29,10 +29,6 @@ class autoPyTorchTimeSeriesTargetPreprocessingComponent(autoPyTorchTargetPreproc # TODO add support for categorical targets! # TODO define inverse transformation for each inversible numerical transformation (log, deseasonalization, etc. ) """ - """ - Provides abstract interface for time series preprocessing algorithms in AutoPyTorch. 
- """ - def __init__(self) -> None: super().__init__() self.preprocessor: Union[Dict[str, Optional[BaseEstimator]], BaseEstimator] = dict( diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py index 0bcdecc39..b08300724 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/NoEncoder.py @@ -34,6 +34,7 @@ def get_properties( def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ Adds the self into the 'X' dictionary and returns it. + Args: X (Dict[str, Any]): 'X' dictionary diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py index 6fa5b69be..a3d64ee92 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py @@ -24,6 +24,7 @@ def __init__(self) -> None: def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ Adds the self into the 'X' dictionary and returns it. + Args: X (Dict[str, Any]): 'X' dictionary diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py index 2dc2891d0..22cb0062c 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/imputation/TimeSeriesImputer.py @@ -57,6 +57,7 @@ def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseEstimator: def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ Adds self into the 'X' dictionary and returns it. + Args: X (Dict[str, Any]): 'X' dictionary @@ -139,6 +140,7 @@ def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseEstimator: def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ Adds self into the 'X' dictionary and returns it. + Args: X (Dict[str, Any]): 'X' dictionary @@ -164,6 +166,7 @@ def get_hyperparameter_search_space( Time series imputor, for the sake of speed, we only allow local imputation here (i.e., the filled value only depends on its neighbours) # TODO: Transformer for mean and median: df.fillna(df.groupby(df.index).agg('mean'))... + Args: dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): dataset properties imputation_strategy: which strategy to use, its content is defined by diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py index ec782abdc..ff030da39 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/scaling/base_scaler.py @@ -43,6 +43,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> 'BaseScaler': def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ Adds the fitted scalar into the 'X' dictionary and returns it. 
+ Args: X (Dict[str, Any]): 'X' dictionary diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py index f376ceef2..22252f0dd 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/utils.py @@ -11,8 +11,10 @@ def get_time_series_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstim Creates a dictionary with two keys, numerical- containing list of numerical preprocessors categorical- containing list of categorical preprocessors + Args: X: fit dictionary + Returns: (Dict[str, List[BaseEstimator]]): dictionary with list of numerical and categorical preprocessors """ @@ -37,8 +39,10 @@ def get_time_series_target_preprocessers(X: Dict[str, Any]) -> Dict[str, List[Ba Creates a dictionary with two keys, numerical- containing list of numerical preprocessors categorical- containing list of categorical preprocessors + Args: X: fit dictionary + Returns: (Dict[str, List[BaseEstimator]]): dictionary with list of numerical and categorical preprocessors """ diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py index e9e421447..59035869e 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py @@ -35,6 +35,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: While after transformation, the order of the dataset is: [numerical_columns, categorical_columns, unknown_columns] we need to change feature_names and feature_shapes accordingly + Args: X(Dict): fit dictionary diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py index 5cd4941a7..830beced9 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py @@ -51,6 +51,7 @@ def time_series_preprocess(dataset: pd.DataFrame, transforms: torchvision.transf """ preprocess time series data (both features and targets). Dataset should be pandas DataFrame whose index identifies which series the data belongs to. 
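As a purely illustrative aside to the layout described above (the column name and values are made up; only the "index identifies the series" convention comes from the docstring):

    import pandas as pd

    # one frame holding two series; the index says which series each row belongs to
    dataset = pd.DataFrame(
        {'target': [1.0, 1.2, 1.1, 5.0, 5.3, 5.1]},
        index=[0, 0, 0, 1, 1, 1],
    )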
+ Args: dataset (pd.DataFrame): a dataset contains multiple series, its index identifies the series number transforms (torchvision.transforms.Compose): transformation applied to dataset diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py index dc20fe313..cf5d4590b 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py @@ -54,12 +54,12 @@ def get_available_components( include/exclude directives, as well as the dataset properties Args: - include (Optional[Dict[str, Any]]): what hyper-parameter configurations - to honor when creating the configuration space - exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations - to remove from the configuration space - dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): Caracteristics - of the dataset to guide the pipeline choices of components + include (Optional[Dict[str, Any]]): + what hyper-parameter configurations to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): + what hyper-parameter configurations to remove from the configuration space + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): + Characteristics of the dataset to guide the pipeline choices of components Returns: Dict[str, autoPyTorchComponent]: A filtered dict of learning @@ -123,15 +123,18 @@ def get_hyperparameter_search_space( """Returns the configuration space of the current chosen components Args: - dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on - default (Optional[str]): Default component to use - include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive - list, and will exclusively use this components. - exclude: Optional[Dict[str, Any]]: which components to skip + dataset_properties (Optional[Dict[str, str]]): + Describes the dataset to work on + default (Optional[str]): + Default component to use + include: Optional[Dict[str, Any]]: + what components to include. It is an exhaustive list, and will exclusively use this components. 
+ exclude: Optional[Dict[str, Any]]: + which components to skip Returns: - ConfigurationSpace: the configuration space of the hyper-parameters of the - chosen component + ConfigurationSpace: + the configuration space of the hyper-parameters of the chosen component """ cs = ConfigurationSpace() diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py index bdae83823..768d0eb20 100644 --- a/autoPyTorch/pipeline/components/setup/network/base_network.py +++ b/autoPyTorch/pipeline/components/setup/network/base_network.py @@ -46,10 +46,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: Returns: A instance of self """ - # Make sure that input dictionary X has the required # information to fit this stage self.check_requirements(X, y) + self.network = torch.nn.Sequential(X['network_embedding'], X['network_backbone'], X['network_head']) # Properly set the network training device diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 9d655d76d..585f4d1b8 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -49,23 +49,23 @@ def get_lagged_subsequences( outside the sliding windows. This implementation is similar to gluonTS's implementation the only difference is that we pad the sequence that is not long enough - Parameters - ---------- - sequence : Tensor - the sequence from which lagged subsequences should be extracted. - Shape: (N, T, C). - subsequences_length : int - length of the subsequences to be extracted. - lags_seq: Optional[List[int]] - lags of the sequence, indicating the sequence that needs to be extracted - lag_mask: Optional[torch.Tensor] - a mask tensor indicating - - Returns - -------- - lagged : Tensor - a tensor of shape (N, S, I * C), where S = subsequences_length and - I = len(indices), containing lagged subsequences. + Args: + sequence (torch.Tensor): + the sequence from which lagged subsequences should be extracted, Shape: (N, T, C). + subsequences_length (int): + length of the subsequences to be extracted. + lags_seq (Optional[List[int]]): + lags of the sequence, indicating the sequence that needs to be extracted + mask (Optional[torch.Tensor]): + a mask tensor indicating, it is a cached mask tensor that allows the model to quickly extract the desired + lagged values + + Returns: + lagged (Tensor) + A tensor of shape (N, S, I * C), where S = subsequences_length and I = len(indices), + containing lagged subsequences. + mask (torch.Tensor): + cached mask """ batch_size = sequence.shape[0] num_features = sequence.shape[2] @@ -114,9 +114,20 @@ def get_lagged_subsequences_inference( lags_seq: List[int]) -> torch.Tensor: """ this function works exactly the same as get_lagged_subsequences. However, this implementation is faster when no - cached value is available, thus it more suitable during inference times. - - designed for doing inference for DeepAR, the core idea is to use + cached value is available, thus it is applied during inference times. + + Args: + sequence (torch.Tensor): + the sequence from which lagged subsequences should be extracted, Shape: (N, T, C). + subsequences_length (int): + length of the subsequences to be extracted. 
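As a self-contained sketch of the (N, S, I * C) layout documented above (function and variable names are illustrative; the patched implementation additionally works with a cached mask tensor):

    import torch

    def lagged_subsequences_sketch(sequence: torch.Tensor,
                                   subsequences_length: int,
                                   lags_seq: list) -> torch.Tensor:
        # sequence: (N, T, C)  ->  (N, S, I * C), S = subsequences_length, I = len(lags_seq)
        n_batch, seq_len, n_feat = sequence.shape
        lagged = []
        for lag in lags_seq:
            begin, end = seq_len - lag - subsequences_length, seq_len - lag
            window = sequence.new_zeros((n_batch, subsequences_length, n_feat))
            if end > 0:
                valid = sequence[:, max(begin, 0): end, :]
                # sequences that are too short are left-padded with zeros
                window[:, subsequences_length - valid.shape[1]:, :] = valid
            lagged.append(window)
        return torch.cat(lagged, dim=-1)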
+ lags_seq (Optional[List[int]]): + lags of the sequence, indicating the sequence that needs to be extracted + + Returns: + lagged (Tensor) + A tensor of shape (N, S, I * C), where S = subsequences_length and I = len(indices), + containing lagged subsequences. """ sequence_length = sequence.shape[1] batch_size = sequence.shape[0] @@ -174,21 +185,33 @@ def __init__(self, This structure is active when the decoder is a MLP with auto_regressive set as false Args: - network_structure (NetworkStructure): network structure information - network_embedding (nn.Module): network embedding - network_encoder (Dict[str, EncoderBlockInfo]): Encoder network, could be selected to return a sequence or a - network_decoder (Dict[str, DecoderBlockInfo]): network decoder - temporal_fusion Optional[TemporalFusionLayer]: Temporal Fusion Layer - network_head (nn.Module): network head, maps the output of decoder to the final output - dataset_properties (Dict): dataset properties - auto_regressive (bool): if the overall model is auto-regressive model - output_type (str): the form that the network outputs. It could be regression, distribution and - quantile - forecast_strategy (str): only valid if output_type is distribution or quantile, how the network transforms + network_structure (NetworkStructure): + network structure information + network_embedding (nn.Module): + network embedding + network_encoder (Dict[str, EncoderBlockInfo]): + Encoder network, could be selected to return a sequence or a 2D Matrix + network_decoder (Dict[str, DecoderBlockInfo]): + network decoder + temporal_fusion Optional[TemporalFusionLayer]: + Temporal Fusion Layer + network_head (nn.Module): + network head, maps the output of decoder to the final output + dataset_properties (Dict): + dataset properties + auto_regressive (bool): + if the model is auto-regressive model + output_type (str): + the form that the network outputs. It could be regression, distribution or quantile + forecast_strategy (str): + only valid if output_type is distribution or quantile, how the network transforms its output to predicted values, could be mean or sample - num_samples (int): only valid if output_type is not regression and forecast_strategy is sample. this - indicates the number of the points to sample when doing prediction - aggregation (str): how the samples are aggregated. We could take their mean or median values. + num_samples (int): + only valid if output_type is not regression and forecast_strategy is sample. This indicates the + number of the points to sample when doing prediction + aggregation (str): + only valid if output_type is not regression and forecast_strategy is sample. The way that the samples + are aggregated. We could take their mean or median values. 
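A minimal, self-contained illustration of the sample/aggregation semantics spelled out above (the distribution and shapes are placeholders, not the network's actual head output):

    import torch

    def aggregate_samples_sketch(dist: torch.distributions.Distribution,
                                 num_samples: int = 50,
                                 aggregation: str = 'mean') -> torch.Tensor:
        samples = dist.sample((num_samples,))   # (num_samples, batch, horizon, n_targets)
        if aggregation == 'mean':
            return samples.mean(dim=0)
        if aggregation == 'median':
            return samples.median(dim=0).values
        raise ValueError(f"unknown aggregation {aggregation}")

    # a unit normal standing in for the predicted distribution
    point_prediction = aggregate_samples_sketch(
        torch.distributions.Normal(torch.zeros(8, 5, 1), 1.0))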
""" super().__init__() self.network_structure = network_structure diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 2e17d2f73..909ad0f4a 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -51,28 +51,6 @@ def __init__( FitRequirement('time_feature_names', (Iterable,), user_defined=True, dataset_property=True), ]) - @property - def _required_fit_requirements(self) -> List[FitRequirement]: - return [ - FitRequirement('dataset_properties', (Dict,), user_defined=False, dataset_property=True), - FitRequirement('window_size', (int,), user_defined=False, dataset_property=False), - FitRequirement('network_structure', (Dict,), user_defined=False, dataset_property=False), - FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), - FitRequirement("network_encoder", (Dict,), user_defined=False, - dataset_property=False), - FitRequirement("network_decoder", (Dict,), user_defined=False, - dataset_property=False), - FitRequirement("network_head", (Optional[torch.nn.Module],), user_defined=False, dataset_property=False), - FitRequirement("auto_regressive", (bool,), user_defined=False, dataset_property=False), - FitRequirement("target_scaler", (BaseTargetScaler,), user_defined=False, dataset_property=False), - FitRequirement("net_output_type", (str,), user_defined=False, dataset_property=False), - FitRequirement("feature_names", (Iterable,), user_defined=False, dataset_property=True), - FitRequirement("feature_shapes", (Iterable,), user_defined=False, dataset_property=True), - FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), - FitRequirement('static_features', (tuple,), user_defined=True, dataset_property=True), - FitRequirement('time_feature_names', (Iterable,), user_defined=True, dataset_property=True), - ] - def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: # Make sure that input dictionary X has the required # information to fit this stage From 8574c6f70e174d1b8f5293998213f9ac88cd7a38 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 15 Jun 2022 18:33:03 +0200 Subject: [PATCH 326/347] SMAC4HPO instead of SMAC4AC in smbo (will be reverted further if study shows that SMAC4HPO is superior to SMAC4AC) --- autoPyTorch/optimizer/smbo.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 3c1acd1c0..f130ec060 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -8,11 +8,11 @@ import dask.distributed -from smac.facade.smac_hpo_facade import SMAC4HPO +from smac.facade.smac_ac_facade import SMAC4AC from smac.intensification.hyperband import Hyperband from smac.intensification.intensification import Intensifier from smac.runhistory.runhistory import RunHistory -from smac.runhistory.runhistory2epm import RunHistory2EPM4LogScaledCost +from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost from smac.scenario.scenario import Scenario from smac.tae.dask_runner import DaskParallelRunner from smac.tae.serial_runner import SerialRunner @@ -50,7 +50,7 @@ def get_smac_object( max_budget: Union[int, float], dask_client: Optional[dask.distributed.Client], initial_configurations: Optional[List[Configuration]] = None, -) -> SMAC4HPO: +) -> SMAC4AC: """ This 
function returns an SMAC object that is gonna be used as optimizer of pipelines @@ -73,6 +73,7 @@ def get_smac_object( """ if initial_budget == max_budget: + # This allows vanilla BO optimization intensifier = Intensifier intensifier_kwargs: Dict[str, Any] = {'deterministic': True, } @@ -80,9 +81,9 @@ def get_smac_object( intensifier = Hyperband intensifier_kwargs = {'initial_budget': initial_budget, 'max_budget': max_budget, 'eta': 3, 'min_chall': 1, 'instance_order': 'shuffle_once'} - rh2EPM = RunHistory2EPM4LogScaledCost + rh2EPM = RunHistory2EPM4LogCost - return SMAC4HPO( + return SMAC4AC( scenario=Scenario(scenario_dict), rng=seed, runhistory2epm=rh2EPM, From 86e39bc5a024c37ffe17aa1d34769a6fac222cff Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 15 Jun 2022 19:48:43 +0200 Subject: [PATCH 327/347] fixed docstrign for RNN and Transformer Decoder --- .../forecasting_decoder/RNNDecoder.py | 9 ++------- .../forecasting_decoder/TransformerDecoder.py | 15 +++++++-------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py index 58b19e202..848a2a4cd 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/RNNDecoder.py @@ -52,7 +52,7 @@ def forward(self, class ForecastingRNNDecoder(BaseForecastingDecoder): """ - Standard searchable RNN decoder for time series data, only works when the encoder is + Standard searchable RNN decoder for time series data, only works when the encoder is an RNN encoder """ def __init__(self, **kwargs: Any): @@ -60,12 +60,7 @@ def __init__(self, **kwargs: Any): # RNN is naturally auto-regressive. However, we will not consider it as a decoder for deep AR model self.rnn_kwargs: Optional[Dict] = None self.lagged_value = [1, 2, 3, 4, 5, 6, 7] - - @property - def _required_fit_requirements(self) -> List[FitRequirement]: - fit_requirement = super(ForecastingRNNDecoder, self)._required_fit_requirements - fit_requirement.append(FitRequirement('rnn_kwargs', (Dict,), user_defined=False, dataset_property=False)) - return fit_requirement + self.add_fit_requirements([FitRequirement('rnn_kwargs', (Dict,), user_defined=False, dataset_property=False)]) def _build_decoder(self, encoder_output_shape: Tuple[int, ...], diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index ef20c9e06..522325b2a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -17,7 +17,7 @@ from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( PositionalEncoding, build_transformer_layers) -from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ +from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder. 
\ base_forecasting_decoder import BaseForecastingDecoder, DecoderProperties from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.components import \ DecoderNetwork @@ -79,11 +79,17 @@ def forward(self, class ForecastingTransformerDecoder(BaseForecastingDecoder): + """ + Standard searchable Transformer decoder for time series data, only works when the encoder is a + Transformer Encoder + """ def __init__(self, **kwargs: Any): super().__init__(**kwargs) # RNN is naturally auto-regressive. However, we will not consider it as a decoder for deep AR model self.transformer_encoder_kwargs: Optional[dict] = None self.lagged_value = [1, 2, 3, 4, 5, 6, 7] + self.add_fit_requirements([FitRequirement('transformer_encoder_kwargs', (Dict,), user_defined=False, + dataset_property=False)]) def _build_decoder(self, encoder_output_shape: Tuple[int, ...], @@ -108,13 +114,6 @@ def _build_decoder(self, return decoder, d_model - @property - def _required_fit_requirements(self) -> List[FitRequirement]: - fit_requirement = super(ForecastingTransformerDecoder, self)._required_fit_requirements - fit_requirement.append(FitRequirement('transformer_encoder_kwargs', (Dict,), user_defined=False, - dataset_property=False)) - return fit_requirement - @staticmethod def decoder_properties() -> DecoderProperties: return DecoderProperties(recurrent=True, From 21fbcb26a254f00b1e94e7e1a67ff2fc69f31f74 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 15 Jun 2022 19:49:48 +0200 Subject: [PATCH 328/347] uniformed docstrings for smbo and base task --- autoPyTorch/api/base_task.py | 11 +++++++---- autoPyTorch/optimizer/smbo.py | 13 +++++++------ 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 6a5ce1e14..84eb3e1a0 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1055,11 +1055,14 @@ def _search( Additionally, the keyword 'greedy' is supported, which would use the default portfolio from `AutoPyTorch Tabular `_ - time_series_forecasting: bool - if time series forecasting task is implemented. kwargs: Any - additional arguments - + additional arguments that are customed by some specific task. + For instance, forecasting tasks require: + min_num_test_instances (int): minimal number of instances used to initialize a proxy validation set + suggested_init_models (List[str]): A set of initial models suggested by the users. Their + hyperparameters are determined by the default configurations + custom_init_setting_path (str): The path to the initial hyperparameter configurations set by + the users Returns: self diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index f130ec060..53eae4696 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -210,12 +210,13 @@ def __init__(self, task_type (str): task type. Forecasting tasks require special process kwargs (Any): - Additional Arguments for forecasting tasks. It includes: - min_num_test_instances (int): minimal number of instances used to initialize a proxy validation set - suggested_init_models (List[str]): A set of initial models suggested by the users. - Their hyperparameters are still determined by the default configurations - custom_init_setting_path (str): the path to the initial hyperparameter configurations set by the - users + additional arguments that are customed by some specific task. 
+ For instance, forecasting tasks require: + min_num_test_instances (int): minimal number of instances used to initialize a proxy validation set + suggested_init_models (List[str]): A set of initial models suggested by the users. Their + hyperparameters are determined by the default configurations + custom_init_setting_path (str): The path to the initial hyperparameter configurations set by + the users """ super(AutoMLSMBO, self).__init__() From ee66c253fb29901e2abf8b6dbfbb7e72cc18e871 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 15 Jun 2022 20:41:27 +0200 Subject: [PATCH 329/347] correct encoder to decoder in decoder.init --- .../forecasting_backbone/forecasting_decoder/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py index 345dd39da..62fb78240 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/__init__.py @@ -13,5 +13,5 @@ decoder_addons = ThirdPartyComponents(BaseForecastingDecoder) -def add_decoder(encoder: BaseForecastingDecoder) -> None: - decoder_addons.add_component(encoder) +def add_decoder(decoder: BaseForecastingDecoder) -> None: + decoder_addons.add_component(decoder) From 877a12481b6beafb235447e9521aedded9660838 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 15 Jun 2022 21:02:42 +0200 Subject: [PATCH 330/347] fix doc strings --- ...time_series_forecasting_train_evaluator.py | 90 ++++++++-- .../forecasting_training_loss/__init__.py | 12 +- .../setup/network/forecasting_architecture.py | 157 +++++++++++++----- .../forecasting_backbone/__init__.py | 39 +++-- .../forecasting_decoder/MLPDecoder.py | 38 +++-- .../forecasting_decoder/NBEATSDecoder.py | 125 ++++++++++---- .../forecasting_decoder/TransformerDecoder.py | 24 +++ .../base_forecasting_decoder.py | 26 ++- .../forecasting_encoder/__init__.py | 16 +- .../base_forecasting_encoder.py | 2 +- .../seq_encoder/InceptionTimeEncoder.py | 36 ++-- .../seq_encoder/TransformerEncoder.py | 34 +++- .../seq_encoder/__init__.py | 58 ++++--- .../forecasting_network_head/NBEATS_head.py | 4 +- 14 files changed, 475 insertions(+), 186 deletions(-) diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 9f02f9305..e78dd8030 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -22,6 +22,86 @@ class TimeSeriesForecastingTrainEvaluator(TrainEvaluator): + """ + This class is similar to the TrainEvaluator. Except that given the specific + + Attributes: + backend (Backend): + An object to interface with the disk storage. In particular, allows to + access the train and test datasets + queue (Queue): + Each worker available will instantiate an evaluator, and after completion, + it will return the evaluation result via a multiprocessing queue + metric (autoPyTorchMetric): + A scorer object that is able to evaluate how good a pipeline was fit. It + is a wrapper on top of the actual score method (a wrapper on top of scikit + lean accuracy for example) that formats the predictions accordingly. 
+ budget: (float): + The amount of epochs/time a configuration is allowed to run. + budget_type (str): + The budget type, which can be epochs or time + pipeline_config (Optional[Dict[str, Any]]): + Defines the content of the pipeline being evaluated. For example, it + contains pipeline specific settings like logging name, or whether or not + to use tensorboard. + configuration (Union[int, str, Configuration]): + Determines the pipeline to be constructed. A dummy estimator is created for + integer configurations, a traditional machine learning pipeline is created + for string based configuration, and NAS is performed when a configuration + object is passed. + seed (int): + A integer that allows for reproducibility of results + output_y_hat_optimization (bool): + Whether this worker should output the target predictions, so that they are + stored on disk. Fundamentally, the resampling strategy might shuffle the + Y_train targets, so we store the split in order to re-use them for ensemble + selection. + num_run (Optional[int]): + An identifier of the current configuration being fit. This number is unique per + configuration. + include (Optional[Dict[str, Any]]): + An optional dictionary to include components of the pipeline steps. + exclude (Optional[Dict[str, Any]]): + An optional dictionary to exclude components of the pipeline steps. + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. + init_params (Optional[Dict[str, Any]]): + Optional argument that is passed to each pipeline step. It is the equivalent of + kwargs for the pipeline steps. + logger_port (Optional[int]): + Logging is performed using a socket-server scheme to be robust against many + parallel entities that want to write to the same file. This integer states the + socket port for the communication channel. If None is provided, a traditional + logger is used. + all_supported_metrics (bool): + Whether all supported metric should be calculated for every configuration. + search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + An object used to fine tune the hyperparameter search space of the pipeline + max_budget (float): + maximal budget value available for the optimizer. This is applied to compute the size of the proxy + validation sets + min_num_test_instances (Optional[int]): + minimal number of instances to be validated. 
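Purely as an illustration of how these attributes could interact (the exact formula below is an assumption and is not taken from this patch):

    from typing import Optional

    def proxy_val_size_sketch(budget: float, max_budget: float,
                              num_val_instances: int,
                              min_num_test_instances: Optional[int]) -> int:
        # assumption: evaluate a budget-proportional share of the validation instances,
        # but never fewer than min_num_test_instances and never more than available
        if min_num_test_instances is None or budget >= max_budget:
            return num_val_instances
        return min(num_val_instances,
                   max(min_num_test_instances,
                       int(num_val_instances * budget / max_budget)))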
We do so to ensure that there are enough instances in + the validation set + + """ def __init__(self, backend: Backend, queue: Queue, metric: autoPyTorchMetric, budget: float, @@ -41,16 +121,6 @@ def __init__(self, backend: Backend, queue: Queue, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, max_budget: float = 1.0, min_num_test_instances: Optional[int] = None) -> None: - """ - Attributes: - max_budget (Optional[float]): - maximal budget the optimizer could allocate - min_num_test_instances: Optional[int] - minimal number of validation instances to be evaluated, if the size of the validation set is greater - than this value, then less instances from validation sets will be evaluated. The other predictions - will be filled with dummy predictor - - """ super(TimeSeriesForecastingTrainEvaluator, self).__init__( backend=backend, queue=queue, diff --git a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py index cf5d4590b..f9e2b0789 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_training_loss/__init__.py @@ -54,12 +54,12 @@ def get_available_components( include/exclude directives, as well as the dataset properties Args: - include (Optional[Dict[str, Any]]): - what hyper-parameter configurations to honor when creating the configuration space - exclude (Optional[Dict[str, Any]]): - what hyper-parameter configurations to remove from the configuration space - dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): - Characteristics of the dataset to guide the pipeline choices of components + include (Optional[Dict[str, Any]]): + what hyper-parameter configurations to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): + what hyper-parameter configurations to remove from the configuration space + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): + Characteristics of the dataset to guide the pipeline choices of components Returns: Dict[str, autoPyTorchComponent]: A filtered dict of learning diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py index 585f4d1b8..fc7ac3ae1 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_architecture.py @@ -154,6 +154,41 @@ def get_lagged_subsequences_inference( class AbstractForecastingNet(nn.Module): + """ + This is a basic forecasting network. It is only composed of a embedding net, an encoder and a head (including + MLP decoder and the final head). 
+ + This structure is active when the decoder is a MLP with auto_regressive set as false + + Attributes: + network_structure (NetworkStructure): + network structure information + network_embedding (nn.Module): + network embedding + network_encoder (Dict[str, EncoderBlockInfo]): + Encoder network, could be selected to return a sequence or a 2D Matrix + network_decoder (Dict[str, DecoderBlockInfo]): + network decoder + temporal_fusion Optional[TemporalFusionLayer]: + Temporal Fusion Layer + network_head (nn.Module): + network head, maps the output of decoder to the final output + dataset_properties (Dict): + dataset properties + auto_regressive (bool): + if the model is auto-regressive model + output_type (str): + the form that the network outputs. It could be regression, distribution or quantile + forecast_strategy (str): + only valid if output_type is distribution or quantile, how the network transforms + its output to predicted values, could be mean or sample + num_samples (int): + only valid if output_type is not regression and forecast_strategy is sample. This indicates the + number of the points to sample when doing prediction + aggregation (str): + only valid if output_type is not regression and forecast_strategy is sample. The way that the samples + are aggregated. We could take their mean or median values. + """ future_target_required = False dtype = torch.float @@ -178,41 +213,6 @@ def __init__(self, num_samples: int = 50, aggregation: str = 'mean' ): - """ - This is a basic forecasting network. It is only composed of a embedding net, an encoder and a head (including - MLP decoder and the final head). - - This structure is active when the decoder is a MLP with auto_regressive set as false - - Args: - network_structure (NetworkStructure): - network structure information - network_embedding (nn.Module): - network embedding - network_encoder (Dict[str, EncoderBlockInfo]): - Encoder network, could be selected to return a sequence or a 2D Matrix - network_decoder (Dict[str, DecoderBlockInfo]): - network decoder - temporal_fusion Optional[TemporalFusionLayer]: - Temporal Fusion Layer - network_head (nn.Module): - network head, maps the output of decoder to the final output - dataset_properties (Dict): - dataset properties - auto_regressive (bool): - if the model is auto-regressive model - output_type (str): - the form that the network outputs. It could be regression, distribution or quantile - forecast_strategy (str): - only valid if output_type is distribution or quantile, how the network transforms - its output to predicted values, could be mean or sample - num_samples (int): - only valid if output_type is not regression and forecast_strategy is sample. This indicates the - number of the points to sample when doing prediction - aggregation (str): - only valid if output_type is not regression and forecast_strategy is sample. The way that the samples - are aggregated. We could take their mean or median values. 
- """ super().__init__() self.network_structure = network_structure self.embedding = network_embedding @@ -305,6 +305,23 @@ def rescale_output(self, loc: Optional[torch.Tensor], scale: Optional[torch.Tensor], device: torch.device = torch.device('cpu')) -> ALL_NET_OUTPUT: + """ + rescale the network output to its raw scale + + Args: + outputs (ALL_NET_OUTPUT): + network head output + loc (Optional[torch.Tensor]): + scaling location value + scale (Optional[torch.Tensor]): + scaling scale value + device (torch.device): + which device the output is stored + + Return: + ALL_NET_OUTPUT: + rescaleed network output + """ if isinstance(outputs, List): return [self.rescale_output(output, loc, scale, device) for output in outputs] if loc is not None or scale is not None: @@ -323,17 +340,34 @@ def rescale_output(self, return outputs def scale_value(self, - outputs: torch.Tensor, + raw_value: torch.Tensor, loc: Optional[torch.Tensor], scale: Optional[torch.Tensor], device: torch.device = torch.device('cpu')) -> torch.Tensor: + """ + scale the outputs + + Args: + raw_value (torch.Tensor): + network head output + loc (Optional[torch.Tensor]): + scaling location value + scale (Optional[torch.Tensor]): + scaling scale value + device (torch.device): + which device the output is stored + + Return: + torch.Tensor: + scaled input value + """ if loc is not None or scale is not None: if loc is None: - outputs = outputs / scale.to(device) # type: ignore[union-attr] + outputs = raw_value / scale.to(device) # type: ignore[union-attr] elif scale is None: - outputs = outputs - loc.to(device) + outputs = raw_value - loc.to(device) else: - outputs = (outputs - loc.to(device)) / scale.to(device) + outputs = (raw_value - loc.to(device)) / scale.to(device) return outputs @abstractmethod @@ -349,6 +383,17 @@ def forward(self, @abstractmethod def pred_from_net_output(self, net_output: ALL_NET_OUTPUT) -> torch.Tensor: + """ + This function is applied to transform the network head output to torch tensor to create the point prediction + + Args: + net_output (ALL_NET_OUTPUT): + network head output + + Return: + torch.Tensor: + point prediction + """ raise NotImplementedError @abstractmethod @@ -364,6 +409,23 @@ def repeat_intermediate_values(self, intermediate_values: List[Optional[Union[torch.Tensor, Tuple[torch.Tensor]]]], is_hidden_states: List[bool], repeats: int) -> List[Optional[Union[torch.Tensor, Tuple[torch.Tensor]]]]: + """ + This function is often applied for auto-regressive model where we sample multiple points to form several + trajectories and we need to repeat the intermediate values to ensure that the batch sizes match + + Args: + intermediate_values (List[Optional[Union[torch.Tensor, Tuple[torch.Tensor]]]]) + a list of intermediate values to be repeated + is_hidden_states (List[bool]): + if the intermediate_value is hidden states in RNN-form network, we need to consider the + hidden states differently + repeats (int): + number of repeats + + Return: + List[Optional[Union[torch.Tensor, Tuple[torch.Tensor]]]]: + repeated values + """ for i, (is_hx, inter_value) in enumerate(zip(is_hidden_states, intermediate_values)): if isinstance(inter_value, torch.Tensor): repeated_value = inter_value.repeat_interleave(repeats=repeats, dim=1 if is_hx else 0) @@ -375,6 +437,19 @@ def repeat_intermediate_values(self, return intermediate_values def pad_tensor(self, tensor_to_be_padded: torch.Tensor, target_length: int) -> torch.Tensor: + """ + pad tensor to meet the required length + + Args: + tensor_to_be_padded (torch.Tensor) + 
tensor to be padded + target_length (int): + target length + + Return: + torch.Tensor: + padded tensors + """ tensor_shape = tensor_to_be_padded.shape padding_size = [tensor_shape[0], target_length - tensor_shape[1], tensor_shape[-1]] tensor_to_be_padded = torch.cat([tensor_to_be_padded.new_zeros(padding_size), tensor_to_be_padded], dim=1) @@ -1174,6 +1249,9 @@ def forward(self, # type: ignore[override] past_observed_targets: Optional[torch.BoolTensor] = None, decoder_observed_values: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + + # Unlike other networks, NBEATS network is required to predict both past and future targets. + # Thereby, we return two tensors for backcast and forecast if past_observed_targets is None: past_observed_targets = torch.ones_like(past_targets, dtype=torch.bool) @@ -1194,6 +1272,7 @@ def forward(self, # type: ignore[override] forecast = torch.zeros(forcast_shape).to(self.device).flatten(1) backcast, _ = self.encoder(past_targets, [None]) backcast = backcast[0] + # nbeats network only has one decoder block (flat decoder) for block in self.decoder.decoder['block_1']: backcast_block, forecast_block = block([None], backcast) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py index 1da656f16..e0417f587 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/__init__.py @@ -64,16 +64,18 @@ def get_available_components( # type: ignore[override] include/exclude directives, as well as the dataset properties Args: - include (Optional[Dict[str, Any]]): what hyper-parameter configurations - to honor when creating the configuration space - exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations - to remove from the configuration space - dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): Caracteristics - of the dataset to guide the pipeline choices of components + include (Optional[Dict[str, Any]]): + what hyper-parameter configurations to honor when creating the configuration space. It can also include + nested components, for instance, flat_encoder:MLPEncoder + exclude (Optional[Dict[str, Any]]): + what hyper-parameter configurations to remove from the configuration space. It can also include + nested components, for instance, flat_encoder:MLPEncoder + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): + Characteristics of the dataset to guide the pipeline choices of components Returns: - Dict[str, autoPyTorchComponent]: A filtered dict of learning - rate backbones + Dict[str, autoPyTorchComponent]: + A filtered dict of learning rate backbones """ if dataset_properties is None: @@ -157,18 +159,19 @@ def get_hyperparameter_search_space( """Returns the configuration space of the current chosen components Args: - dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on - default (Optional[str]): Default backbone to use - include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive - list, and will exclusively use this components. 
- exclude: Optional[Dict[str, Any]]: which components to skip - network_type: type of the network, it determines how to handle the sequential data: flat networks - (FFNN and NBEATS) simply flat the input to a 2D input, whereas seq network receives sequential 3D inputs: - thus, seq networks could be stacked to form a larger network that is composed of different parts. + dataset_properties (Optional[Dict[str, str]]): + Describes the dataset to work on + default (Optional[str]): + Default backbone to use + include: Optional[Dict[str, Any]]: + what components to include. It is an exhaustive list, and will exclusively use this components. + It can also include nested components, for instance, flat_encoder:MLPEncoder + exclude: Optional[Dict[str, Any]]: + which components to skip. It can also include nested components, for instance, flat_encoder:MLPEncoder Returns: - ConfigurationSpace: the configuration space of the hyper-parameters of the - chosen component + ConfigurationSpace: + the configuration space of the hyper-parameters of the chosen component """ if dataset_properties is None: dataset_properties = {} diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py index c67e98b98..b5d9eead8 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/MLPDecoder.py @@ -146,22 +146,30 @@ def get_hyperparameter_search_space( https://arxiv.org/abs/1704.04110 Args: - dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): Dataset Properties - can_be_auto_regressive (bool): if this decoder is allowed to be auto-regressive - is_top_layer (bool) if this mlp decoder is at the top layer as seq decoders. Only top layer MLP allows - deactivating local layers. (Otherwise the decoder cannot output a sequence) - num_layers (HyperparameterSearchSpace): number of decoder layers (the last layer is not included, thus it - could start from 0) - units_layer (HyperparameterSearchSpace): number of units of each layer (except for the last layer) - activation (HyperparameterSearchSpace): activation function - auto_regressive (bool): if the model acts as a DeepAR model, the corresponding hyperparaemter is - controlled by seq_encoder - has_local_layer (HyperparameterSearchSpace): if local MLP layer is applied, if not, the output of the - network will be directly attached with different heads - units_local_layer (HyperparameterSearchSpace): number of units of local layer. The size of this layer is - smaller as it needs to be expanded to adapt to the number of predictions + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): + Dataset Properties + can_be_auto_regressive (bool): + if this decoder is allowed to be auto-regressive + is_top_layer (bool): + if this mlp decoder is at the top layer as seq decoders. Only top layer MLP allows deactivating local + layers. 
(Otherwise, the decoder cannot output a sequence) + num_layers (HyperparameterSearchSpace): + number of decoder layers (the last layer is not included, thus it starts from 0) + units_layer (HyperparameterSearchSpace): + number of units of each layer (except for the last layer) + activation (HyperparameterSearchSpace): + activation function + auto_regressive (HyperparameterSearchSpace): + if the model acts as a DeepAR model, the corresponding hyperparaemter is controlled by seq_encoder + has_local_layer (HyperparameterSearchSpace): + if local MLP layer is applied, if not, the output of the network will be directly attached + with different heads + units_local_layer (HyperparameterSearchSpace): + number of units of local layer. The size of this layer is smaller as it needs to be + expanded to adapt to the number of predictions Returns: - cs (ConfigurationSpace): ConfigurationSpace + cs (ConfigurationSpace): + ConfigurationSpace """ if dataset_properties is not None: encoder_can_be_auto_regressive = dataset_properties.get('encoder_can_be_auto_regressive', False) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index e7bc605e4..44ced8be9 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -28,7 +28,47 @@ ) -class NBEATSBLock(DecoderNetwork): +class NBEATSBlock(DecoderNetwork): + """ + An N-BEATS block. An N-BEATS network is stacked by multiple Blocks. + For detail, we refer to + Oreshkin et al., N-BEATS: Neural basis expansion analysis for interpretable time series forecasting + https://arxiv.org/abs/1905.10437 + + The hyperaprameter defination are quite simialr to + https://github.com/jdb78/pytorch-forecasting/tree/master/pytorch_forecasting/models/nbeats + + However, we only construct the forecast/ backcast head under + autoPyTorch.pipeline.components.setup.network_head.forecasting_network_head. As we only get to know the + output shape and forecasting horizon there + + Attributes: + n_in_features (int): + number of input features + stack_idx (int): + index of the current stack + stack_type (str): + type of this stack. Could be one of 'generic', 'seasonality', 'trend' + num_blocks (int): + number of blocks exist in this stack + num_layers (int): + number of network layer inside each block + width (int): + network width (number of features) + normalization (str): + normalization type, could be BN or LN + activation (str): + activation function type + weight_sharing (bool): + if weights are shared for this block + expansion_coefficient_length (int): + expansion_coefficient_length + use_dropout (bool): + if dropout is applied + dropout_rate (Optional[float]). 
+ dropout rate + """ + def __init__(self, n_in_features: int, stack_idx: int, @@ -108,18 +148,18 @@ def _build_decoder(self, encoder_output_shape: Tuple[int, ...], future_variable_input: Tuple[int, ...], n_prediction_heads: int, - dataset_properties: Dict) -> Tuple[List[List[NBEATSBLock]], int]: + dataset_properties: Dict) -> Tuple[List[List[NBEATSBlock]], int]: in_features = encoder_output_shape[-1] n_beats_type = self.config['n_beats_type'] if n_beats_type == 'G': - stacks: List[List[NBEATSBLock]] = [[] for _ in range(self.config['num_stacks_g'])] + stacks: List[List[NBEATSBlock]] = [[] for _ in range(self.config['num_stacks_g'])] for stack_idx in range(1, self.config['num_stacks_g'] + 1): for block_idx in range(self.config['num_blocks_g']): if self.config['weight_sharing_g'] and block_idx > 0: # for weight sharing, we only create one instance break ecl = self.config['expansion_coefficient_length_g'] - stacks[stack_idx - 1].append(NBEATSBLock(in_features, + stacks[stack_idx - 1].append(NBEATSBlock(in_features, stack_idx=stack_idx, stack_type='generic', num_blocks=self.config['num_blocks_g'], @@ -134,7 +174,7 @@ def _build_decoder(self, )) elif n_beats_type == 'I': - stacks: List[List[NBEATSBLock]] = [[] for _ in range(self.config['num_stacks_i'])] # type:ignore + stacks: List[List[NBEATSBlock]] = [[] for _ in range(self.config['num_stacks_i'])] # type:ignore for stack_idx in range(1, self.config['num_stacks_i'] + 1): for block_idx in range(self.config['num_blocks_i_%d' % stack_idx]): if self.config['weight_sharing_i_%d' % stack_idx] and block_idx > 0: @@ -150,7 +190,7 @@ def _build_decoder(self, else: raise ValueError(f"Unsupported stack_type {stack_type}") - stacks[stack_idx - 1].append(NBEATSBLock( + stacks[stack_idx - 1].append(NBEATSBlock( in_features, stack_idx=stack_idx, stack_type=stack_type, @@ -296,38 +336,61 @@ def get_hyperparameter_search_space( The design of the configuration space follows pytorch-forecasting: https://github.com/jdb78/pytorch-forecasting/tree/master/pytorch_forecasting/models/nbeats Give that N-BEATS-I and N-BEATS-G's default hyperparameter configuration that totally different, we consider - them as two seperate configuration space: N-BEATS-G that only contains generic blocks and thus could be scaled + them as two separate configuration space: N-BEATS-G that only contains generic blocks and thus could be scaled up to 32 stacks, while each stacks share the same number of blocks/ width/ dropout rate. While N-BEATS-I is is restricted to be a network with a much smaller number of stacks. 
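Whichever of the two variants is selected, the resulting blocks are combined with the doubly residual scheme of N-BEATS (Oreshkin et al., cited in the block docstring above); a toy, self-contained version of that combination rule, with stand-in block internals rather than the real NBEATSBlock:

    import torch
    from torch import nn

    class ToyBlock(nn.Module):
        # stand-in for an N-BEATS block: maps the current backcast residual
        # to a (backcast, forecast) pair
        def __init__(self, window_size: int, horizon: int):
            super().__init__()
            self.backcast_head = nn.Linear(window_size, window_size)
            self.forecast_head = nn.Linear(window_size, horizon)

        def forward(self, x: torch.Tensor):
            return self.backcast_head(x), self.forecast_head(x)

    window_size, horizon = 20, 5
    blocks = [ToyBlock(window_size, horizon) for _ in range(3)]
    backcast = torch.randn(8, window_size)    # flattened, scaled past targets
    forecast = torch.zeros(8, horizon)
    for block in blocks:
        backcast_block, forecast_block = block(backcast)
        backcast = backcast - backcast_block  # each block removes the part it explained
        forecast = forecast + forecast_block  # and adds its contribution to the forecast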
However, the block type of N-BEATS-G at each stack can be freely selected - freely selected + Args: - dataset_properties: - n_beats_type: type of nbeats network, could be I (N-BEATS-I) or G (N-BEATS-G) - num_stacks_g: number of stacks - num_blocks_g: number of blocks per stack - num_layers_g: number of fc layers per block, this value is the same across all the blocks within one stack - width_g: fc layer width, this value is the same across all the blocks within one stack - num_stacks_i: number of stacks - num_blocks_i: number of blocks per stack - num_layers_i: number of fc layers per block, this value is the same across all the blocks within one stack - width_i: fc layer width, this value is the same across all the blocks within one stack - weight_sharing: if weights are shared inside one block - stack_type: stack type, used to define the final output - expansion_coefficient_length_generic: expansion_coefficient_length, activate if stack_type is 'generic' - expansion_coefficient_length_seasonality: expansion_coefficient_length, activate if stack_type is + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): + dataset properties + n_beats_type (str): + type of nbeats network, could be I (N-BEATS-I) or G (N-BEATS-G) + num_stacks_g (int): + number of stacks for N-BEATS G + num_blocks_g (int): + number of blocks per stack for n-BEATS G + num_layers_g (int): + number of fc layers per block for N-BEATS G, this value is the same across all the blocks + within one stack + width_g (int): + fc layer width for N-BEATS G, this value is the same across all the blocks within one stack + num_stacks_i (int): + number of stacks for N-BEATS I + num_blocks_i (int): + number of blocks per stack for N-BEATS I + num_layers_i (int): + number of fc layers per block for N-BEATS I, this value is the same across all the + blocks within one stack + width_i (int): + fc layer width for N-BEATS I, this value is the same across all the blocks within one stack + weight_sharing (bool): + if weights are shared inside one block + stack_type (str): + stack type, used to define the final output + expansion_coefficient_length_generic (int): + expansion_coefficient_length for N-BEATS G, activate if stack_type is 'generic' + expansion_coefficient_length_seasonality (int): + expansion_coefficient_length for N-BEATS I, activate if stack_type is 'seasonality' (n_dim = expansion_coefficient_length_interpretable * n_prediciton_steps) - expansion_coefficient_length_trend: expansion_coefficient_length, activate if stack_type is 'trend' (it + expansion_coefficient_length_trend (int): + expansion_coefficient_length for N-BEATS I, activate if stack_type is 'trend' (it corresponds to the degree of the polynomial) - activation: activation function across fc layers - use_dropout: if dropout is applied - normalization: if normalization is applied - dropout: dropout value, if use_dropout is set as True - backcast_loss_ration: weight of backcast in comparison to forecast when calculating the loss. - A weight of 1.0 means that forecast and backcast loss is weighted the same (regardless of backcast and - forecast lengths). Defaults to 0.0, i.e. no weight. + activation (str): + activation function across fc layers + use_dropout (bool): + if dropout is applied + normalization (str): + normalization type, could be BN, LN or no normalization + dropout (float): + dropout value + backcast_loss_ration (float): + weight of backcast in comparison to forecast when calculating the loss. 
A weight of 1.0 indicates that + forecast and backcast loss is weighted the same (regardless of backcast and forecast lengths). + Defaults to 0.0, i.e. no weight. Returns: - Configuration Space + ConfigurationSpace: + Configuration Space """ cs = ConfigurationSpace() diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py index 522325b2a..021846e8a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/TransformerDecoder.py @@ -192,6 +192,30 @@ def get_hyperparameter_search_space( get hyperparameter search space for Transformer, Given that d_model must be a multiple of n_head_log, we consider their log value (with base 2) as the hyperparameters + Args: + num_layers (int): + number of transformer layers + n_head_log (int): + log value (base 2, this should work for all the following hyperparameters with logs) of number of head + d_feed_forward_log (int): + log values of feed forward network width + norm_first (bool): + if ``True``, layer norm is done prior to attention and feedforward operations, respectivaly. + Otherwise, it's done after. Default: ``False`` (after). + layer_norm_eps (float): + eps for layer norm + use_layer_norm_output (bool): + if layer norm output is applied + activation (str): + activation function type + use_dropout (bool): + if dropout is applied + dropout (float): + dropout rate + + Returns: + ConfigurationSpace: + configuration space """ cs = CS.ConfigurationSpace() diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py index 6b8f3fd93..8d816a413 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/base_forecasting_decoder.py @@ -15,8 +15,8 @@ class BaseForecastingDecoder(autoPyTorchComponent): """ - Base class for network heads used for forecasting. - Holds the head module and the config which was used to create it. + Base class for network decoder used for forecasting. Holds the decoder module and the config which was used to + create it. 
""" _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"] @@ -26,18 +26,7 @@ def __init__(self, **kwargs: Any): super().__init__() self.block_number = block_number - self.add_fit_requirements(self._required_fit_requirements) - self.auto_regressive = auto_regressive - self.config = kwargs - self.decoder: Optional[nn.Module] = None - self.n_decoder_output_features: Optional[int] = None - self.decoder_input_shape: Optional[Tuple[int, ...]] = None - self.n_prediction_heads = 1 - self.is_last_decoder: Optional[bool] = False - - @property - def _required_fit_requirements(self) -> List[FitRequirement]: - return [ + self.add_fit_requirements([ FitRequirement('known_future_features', (tuple,), user_defined=False, dataset_property=True), FitRequirement('feature_shapes', (Dict,), user_defined=False, dataset_property=True), FitRequirement('network_encoder', (OrderedDict,), user_defined=False, dataset_property=False), @@ -45,7 +34,14 @@ def _required_fit_requirements(self) -> List[FitRequirement]: FitRequirement('network_structure', (NetworkStructure,), user_defined=False, dataset_property=False), FitRequirement('transform_time_features', (bool,), user_defined=False, dataset_property=False), FitRequirement('time_feature_transform', (Iterable,), user_defined=False, dataset_property=True) - ] + ]) + self.auto_regressive = auto_regressive + self.config = kwargs + self.decoder: Optional[nn.Module] = None + self.n_decoder_output_features: Optional[int] = None + self.decoder_input_shape: Optional[Tuple[int, ...]] = None + self.n_prediction_heads = 1 + self.is_last_decoder: Optional[bool] = False @property def fitted_encoder(self) -> List[str]: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py index 56cd21e68..e4b905fee 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/__init__.py @@ -155,11 +155,15 @@ def get_hyperparameter_search_space( """Returns the configuration space of the current chosen components Args: - dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on - default (Optional[str]): Default backbone to use - include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive - list, and will exclusively use this components. - exclude: Optional[Dict[str, Any]]: which components to skip + dataset_properties (Optional[Dict[str, str]]): + Describes the dataset to work on + default (Optional[str]): + Default encoder to use + include: Optional[Dict[str, Any]]: + what components to include. It is an exhaustive list, and will exclusively use this components. It + allows nested encoder such as flat_encoder:MLPEncoder + exclude: Optional[Dict[str, Any]]: + which components to skip. 
It allows nested encoder as such flat_encoder:MLPEncoder Returns: ConfigurationSpace: the configuration space of the hyper-parameters of the @@ -170,7 +174,7 @@ def get_hyperparameter_search_space( if dataset_properties is None: dataset_properties = {} - # Compile a list of legal preprocessors for this problem + # Compile a list of legal components for this problem available_encoders = self.get_available_components( dataset_properties=dataset_properties, include=include, exclude=exclude) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py index b2977a760..a82db4f95 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/base_forecasting_encoder.py @@ -20,7 +20,7 @@ class BaseForecastingEncoder(autoPyTorchComponent): """ - Base class for network backbones. Holds the backbone module and the config which was used to create it. + Base class for network backbones. Holds the encoder module and the config which was used to create it. """ _required_properties = ["name", "shortname", "handles_tabular", "handles_image", "handles_time_series"] diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py index f31de69af..b3decba68 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/InceptionTimeEncoder.py @@ -173,26 +173,26 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @staticmethod def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", - value_range=(1, 5), - default_value=3, - ), - num_filters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_filters", - value_range=(4, 64), + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + num_blocks: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_blocks", + value_range=(1, 5), + default_value=3, + ), + num_filters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="num_filters", + value_range=(4, 64), + default_value=32, + log=True, + ), + kernel_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="kernel_size", + value_range=(4, 64), + default_value=32, + log=True, + ), + bottleneck_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="bottleneck_size", + value_range=(16, 64), default_value=32, - log=True, + log=True ), - kernel_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="kernel_size", - value_range=(4, 64), - default_value=32, - log=True, - ), - bottleneck_size: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="bottleneck_size", - value_range=(16, 64), - default_value=32, - log=True - ), ) -> 
ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py index 262288d7b..521efc7df 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/TransformerEncoder.py @@ -12,7 +12,9 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import BaseEstimator from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.components_util import ( - PositionalEncoding, build_transformer_layers) + PositionalEncoding, + build_transformer_layers +) from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.\ base_forecasting_encoder import BaseForecastingEncoder, EncoderProperties from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_encoder.components import \ @@ -197,6 +199,36 @@ def get_hyperparameter_search_space( get hyperparameter search space for Transformer, Given that d_model must be a multiple of n_head_log, we consider their log value (with base 2) as the hyperparameters + Args: + num_layers (int): + number of transformer layers + n_head_log (int): + log value (base 2, this should work for all the following hyperparameters with logs) of number of head + d_model_log (int): + log values of input of dimensions passed to feed forward network + d_feed_forward_log (int): + log values of feed forward network width + norm_first (bool): + if ``True``, layer norm is done prior to attention and feedforward operations, respectivaly. + Otherwise, it's done after. Default: ``False`` (after). 
+ layer_norm_eps (float): + eps for layer norm + use_positional_encoder (bool): + if positional encoder is applied + use_layer_norm_output (bool): + if layer norm output is applied + activation (str): + activation function type + use_dropout (bool): + if dropout is applied + dropout (float): + dropout rate + decoder_type (str): + type of decoder, could be MLPDecoder (DeepAR) or TransformerDecoder (seq2seq) + + Returns: + ConfigurationSpace: + configuration space """ cs = CS.ConfigurationSpace() diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py index 0d2b23aee..6a4b85a8b 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_encoder/seq_encoder/__init__.py @@ -133,31 +133,41 @@ def get_hyperparameter_search_space( # type: ignore[override] """Returns the configuration space of the current chosen components Args: - dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on - num_blocks (HyperparameterSearchSpace): number of encoder-decoder structure blocks - variable_selection (HyperparameterSearchSpace): if variable selection is applied, if True, then the first - block will be attached with a variable selection block while the following will be enriched with static - features. - variable_selection_use_dropout (HyperparameterSearchSpace): if variable selection network uses dropout - variable_selection_dropout_rate (HyperparameterSearchSpace): dropout rate of variable selection network - share_single_variable_networks (HyperparameterSearchSpace): if single variable networks are shared between - encoder and decoder - skip_connection: HyperparameterSearchSpace: if skip connection is applied - use_temporal_fusion (HyperparameterSearchSpace): if temporal fusion layer is applied - skip_connection_type (HyperparameterSearchSpace): skip connection type, it could be directly added or a grn - network ( - Lim et al, Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting: + dataset_properties (Optional[Dict[str, str]]): + Describes the dataset to work on + num_blocks (int): + number of encoder-decoder structure blocks + variable_selection (bool): + if variable selection is applied, if True, then the first block will be attached with a variable + selection block while the following will be enriched with static features. 
+ variable_selection_use_dropout (bool): + if variable selection network uses dropout + variable_selection_dropout_rate (float): + dropout rate of variable selection network + share_single_variable_networks (bool): + if single variable networks are shared between encoder and decoder + skip_connection (int): + if skip connection is applied + use_temporal_fusion (int): + if temporal fusion layer is applied + skip_connection_type (str): + skip connection type, it could be directly added or a GRN network + (Lim et al, Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting: https://arxiv.org/abs/1912.09363) TODO consider hidden size of grn as a new HP - grn_use_dropout (HyperparameterSearchSpace): if dropout layer is applied to GRN, since variable selection - network also contains GRN, this parameter also influence variable selection network - grn_dropout_rate (HyperparameterSearchSpace): dropout rate of GRN, same as above, this variable also - influence variable selection network - decoder_auto_regressive: HyperparameterSearchSpace: if decoder is auto_regressive, e.g., if the decoder - receives the output as its input, this only works for auto_regressive decoder models - default (Optional[str]): Default backbone to use - include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive - list, and will exclusively use this components. - exclude: Optional[Dict[str, Any]]: which components to skip + grn_use_dropout (bool): + if dropout layer is applied to GRN, since variable selection network also contains GRN, + this parameter also influence variable selection network + grn_dropout_rate (float): + dropout rate of GRN, same as above, this variable also influence variable selection network + decoder_auto_regressive (int): + if decoder is auto_regressive, e.g., if the decoder receives the output as its input, + this only works for auto_regressive decoder models + default (Optional[str]): + Default backbone to use + include: Optional[Dict[str, Any]]: + what components to include. It is an exhaustive list, and will exclusively use this components. 
+ exclude: Optional[Dict[str, Any]]: + which components to skip Returns: ConfigurationSpace: the configuration space of the hyper-parameters of the diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py index b23929175..0871c1bd3 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py @@ -9,7 +9,7 @@ from torch import nn from autoPyTorch.pipeline.components.setup.network_backbone.forecasting_backbone.forecasting_decoder.\ - NBEATSDecoder import NBEATSBLock + NBEATSDecoder import NBEATSBlock class TransposeLinear(nn.Module): @@ -95,7 +95,7 @@ def get_frequencies(n: int) -> np.ndarray: return backcast_head, forecast_head -def build_NBEATS_network(nbeats_decoder: List[List[NBEATSBLock]], +def build_NBEATS_network(nbeats_decoder: List[List[NBEATSBlock]], output_shape: Tuple[int]) -> nn.ModuleList: nbeats_blocks = [] for stack_idx, stack in enumerate(nbeats_decoder): From 1d3a74ea6f6327bd03236bd0637d2d2d0241e16b Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 16 Jun 2022 17:25:53 +0200 Subject: [PATCH 331/347] add license and docstrings for NBEATS heads --- .../forecasting_network_head/NBEATS_head.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py index 0871c1bd3..8ca713882 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/forecasting_network_head/NBEATS_head.py @@ -1,3 +1,25 @@ +# THE MIT License + +# Copyright 2020 Jan Beitner + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + # This part of implementation follows pytorch-forecasting: # https://github.com/jdb78/pytorch-forecasting/blob/master/pytorch_forecasting/models/nbeats/sub_modules.py @@ -22,6 +44,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def linspace(backcast_length: int, forecast_length: int, centered: bool = False) -> Tuple[np.ndarray, np.ndarray]: + """ + a function to generate a linear space to encode the positions of the components. For details. We refer to + Oreshkin et al. 
N-BEATS: Neural basis expansion analysis for interpretable time series forecasting + https://arxiv.org/abs/1905.10437 + """ if centered: norm = max(backcast_length, forecast_length) start = -backcast_length From 25168595398b9c8c80f51d4455959eee30f53962 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 16 Jun 2022 18:19:13 +0200 Subject: [PATCH 332/347] allow memory limit to be None --- autoPyTorch/api/time_series_forecasting.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 688b67240..87a3d1d92 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -18,7 +18,6 @@ BaseDatasetPropertiesType ) from autoPyTorch.datasets.resampling_strategy import ( - CrossValTypes, HoldoutValTypes, ResamplingStrategies ) @@ -422,6 +421,8 @@ def search( self._dataset_compression = get_dataset_compression_mapping( memory_limit, dataset_compression ) + else: + self._dataset_compression = None self.dataset, self.input_validator = self._get_dataset_input_validator( X_train=X_train, From fe5e5870a97232acbcbaba10227647f8a3a6c7f4 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 16 Jun 2022 19:49:03 +0200 Subject: [PATCH 333/347] relax test load for forecasting --- test/test_api/test_api.py | 46 ++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 49aca1c37..87a442df6 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -15,6 +15,7 @@ import pytest + import sklearn import sklearn.datasets from sklearn.base import BaseEstimator, clone @@ -405,29 +406,30 @@ def test_tabular_regression(openml_name, resampling_strategy, backend, resamplin assert 'Estimator' in representation -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_train_function', new=dummy_eval_train_function) -@pytest.mark.parametrize('forecasting_toy_dataset', ['multi_variant_wo_missing'], indirect=True) +@pytest.mark.parametrize('forecasting_toy_dataset', ['uni_variant_wo_missing'], indirect=True) @pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', ((HoldoutValTypes.time_series_hold_out_validation, None), - (CrossValTypes.time_series_cross_validation, {'num_splits': CV_NUM_SPLITS}), )) def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, backend, resampling_strategy_args): forecast_horizon = 3 freq = '1Y' X, Y = forecasting_toy_dataset - X_train = [] - X_test = [] - - for x in X: - if hasattr(x, 'iloc'): - X_train.append(x.iloc[:-forecast_horizon].copy()) - X_test.append(x.iloc[-forecast_horizon:].copy()) - else: - X_train.append(x[:-forecast_horizon].copy()) - X_test.append(x[-forecast_horizon:].copy()) - known_future_features = tuple(X[0].columns) if isinstance(X[0], pd.DataFrame) else \ - np.arange(X[0].shape[-1]).tolist() + if X is not None: + X_train = [] + X_test = [] + for x in X: + if hasattr(x, 'iloc'): + X_train.append(x.iloc[:-forecast_horizon].copy()) + X_test.append(x.iloc[-forecast_horizon:].copy()) + else: + X_train.append(x[:-forecast_horizon].copy()) + X_test.append(x[-forecast_horizon:].copy()) + known_future_features = tuple(X[0].columns) if isinstance(X[0], pd.DataFrame) else \ + np.arange(X[0].shape[-1]).tolist() + else: + X_train = None + X_test = None y_train = [] y_test = [] @@ -441,11 +443,13 @@ def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, 
b y_test.append(y[-forecast_horizon:].copy()) # Search for a good configuration + # patch.mock is not applied to partial func. We only test lightweight FFNN networks estimator = TimeSeriesForecastingTask( backend=backend, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, seed=42, + include_components={'network_backbone': {'flat_encoder:MLPEncoder'}} ) with unittest.mock.patch.object(estimator, '_do_dummy_prediction', new=dummy_do_dummy_prediction): @@ -454,12 +458,13 @@ def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, b y_train=y_train, X_test=X_test, y_test=y_test, - optimize_metric='mean_MASE_forecasting', + memory_limit=None, + optimize_metric='mean_MSE_forecasting', n_prediction_steps=forecast_horizon, freq=freq, total_walltime_limit=50, func_eval_time_limit_secs=20, - known_future_features=known_future_features, + #known_future_features=known_future_features, ) # Internal dataset has expected settings @@ -540,7 +545,7 @@ def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, b 'test', estimator.seed, successful_num_run, run_key.budget)) assert os.path.exists(test_prediction), test_prediction - assert np.shape(np.load(test_prediction, allow_pickle=True))[0] == forecast_horizon * np.shape(X_test)[0] + assert np.shape(np.load(test_prediction, allow_pickle=True))[0] == forecast_horizon * np.shape(y_test)[0] # Also, for ensemble builder, the OOF predictions should be there and match # the Ground truth that is also physically printed to disk @@ -566,6 +571,11 @@ def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, b assert np.shape(y_pred) == np.shape(y_test) + # Test refit on dummy data + estimator.refit(dataset=backend.load_datamanager()) + # Make sure that a configuration space is stored in the estimator + assert isinstance(estimator.get_search_space(), CS.ConfigurationSpace) + @pytest.mark.parametrize('openml_id', ( 1590, # Adult to test NaN in categorical columns From 2c6f66fe0524ca1552d6fe1685367d6fd0daf664 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 16 Jun 2022 19:54:51 +0200 Subject: [PATCH 334/347] fix docs --- autoPyTorch/api/base_task.py | 9 ++- autoPyTorch/api/time_series_forecasting.py | 2 +- .../base_target_scaler.py | 4 +- .../setup/network_initializer/SparseInit.py | 2 - .../time_series_forecasting_data_loader.py | 41 +++++++----- .../training/data_loader/time_series_util.py | 62 ++++++++++++------- .../components/training/metrics/base.py | 49 +++++++-------- .../components/training/metrics/utils.py | 14 +++-- .../pipeline/create_searchspace_util.py | 1 - .../pipeline/time_series_forecasting.py | 16 ----- test/conftest.py | 23 +++---- test/test_api/utils.py | 4 +- .../test_forecasting_target_scaling.py | 1 - .../test_pipeline/components/training/base.py | 3 +- .../training/test_feature_data_loader.py | 4 +- 15 files changed, 122 insertions(+), 113 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 84eb3e1a0..b765f7c74 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1342,8 +1342,13 @@ def _get_fit_dictionary( dataset: BaseDataset, split_id: int = 0 ) -> Dict[str, Any]: - X_test = dataset.test_tensors[0].copy() if dataset.test_tensors is not None else None - y_test = dataset.test_tensors[1].copy() if dataset.test_tensors is not None else None + if dataset.test_tensors is not None: + X_test = dataset.test_tensors[0].copy() if dataset.test_tensors[0] is not None else None + 
y_test = dataset.test_tensors[1].copy() if dataset.test_tensors[1] is not None else None + else: + X_test = None + y_test = None + X_train = dataset.train_tensors[0].copy() if dataset.train_tensors[0] is not None else None y_train = dataset.train_tensors[1].copy() X: Dict[str, Any] = dict({'dataset_properties': dataset_properties, diff --git a/autoPyTorch/api/time_series_forecasting.py b/autoPyTorch/api/time_series_forecasting.py index 87a3d1d92..b2221b45a 100644 --- a/autoPyTorch/api/time_series_forecasting.py +++ b/autoPyTorch/api/time_series_forecasting.py @@ -537,7 +537,7 @@ def predict( return forecasting * std + mean return forecasting - def update_sliding_window_size(self, n_prediction_steps: int): + def update_sliding_window_size(self, n_prediction_steps: int) -> None: """ the size of the sliding window is heavily dependent on the dataset, so we only update them when we get the information from the diff --git a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py index 89557b402..a8c31081b 100644 --- a/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py +++ b/autoPyTorch/pipeline/components/setup/forecasting_target_scaling/base_target_scaler.py @@ -1,6 +1,6 @@ from typing import Any, Dict, Optional, Union -from ConfigSpace import ConfigurationSpace, CategoricalHyperparameter +from ConfigSpace import CategoricalHyperparameter, ConfigurationSpace import numpy as np @@ -12,7 +12,7 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent from autoPyTorch.pipeline.components.setup.forecasting_target_scaling.utils import TargetScaler -from autoPyTorch.utils.common import add_hyperparameter, HyperparameterSearchSpace +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter class BaseTargetScaler(autoPyTorchComponent): diff --git a/autoPyTorch/pipeline/components/setup/network_initializer/SparseInit.py b/autoPyTorch/pipeline/components/setup/network_initializer/SparseInit.py index 4cd3dd72c..b048293e9 100644 --- a/autoPyTorch/pipeline/components/setup/network_initializer/SparseInit.py +++ b/autoPyTorch/pipeline/components/setup/network_initializer/SparseInit.py @@ -11,7 +11,6 @@ class SparseInit(BaseNetworkInitializerComponent): """ Fills the 2D input Tensor as a sparse matrix """ - def weights_init(self) -> Callable: """Returns the actual PyTorch model, that is dynamically created from a self.config object. 
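The second SparseInit hunk that follows keeps the torch.nn.init.sparse_ call intact and only drops a blank line. For readers unfamiliar with this initializer, a minimal self-contained sketch of how such a closure is typically applied to a network (the isinstance guard, module and network names are illustrative, not taken from autoPyTorch):

    import torch
    from torch import nn

    def initialization(m: nn.Module) -> None:
        # torch.nn.init.sparse_ only accepts 2D tensors, so restrict it to Linear layers
        if isinstance(m, nn.Linear):
            torch.nn.init.sparse_(m.weight.data, 0.9)   # zero out 90% of the weights
            if m.bias is not None:
                torch.nn.init.constant_(m.bias.data, 0.0)

    network = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 1))
    network.apply(initialization)  # applies the closure to every submodule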
@@ -28,5 +27,4 @@ def initialization(m: torch.nn.Module) -> None: torch.nn.init.sparse_(m.weight.data, 0.9) if m.bias is not None and self.bias_strategy == 'Zero': torch.nn.init.constant_(m.bias.data, 0.0) - return initialization diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index 826396352..b54468cca 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -4,8 +4,10 @@ from ConfigSpace.conditions import EqualsCondition from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import (CategoricalHyperparameter, - UniformIntegerHyperparameter) +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformIntegerHyperparameter +) from gluonts.time_feature import TimeFeature @@ -543,23 +545,28 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = {}, https://arxiv.org/abs/1905.10437) Currently back_cast_period is only activate when back_cast is activate Args: - dataset_properties (Optional[Dict]): dataset properties - batch_size (int): batch size - window_size (int): window size, (if activate) this value directly determines the window_size of the - data loader - num_batches_per_epoch (int): how many batches are trained at each iteration - sample_strategy(str): how samples are distributed. if it is LengthUnifrom, then every single data point - has the same probability to be sampled, in which case longer sequence will occupy more - samples. If it is SeqUniform, then every sequence has the same probability to be - sampled regardless of their length - backcast (bool): if back_cast module is activate (in which case window size is a - multiple of n_prediction_steps) - backcast_period (int): activate if backcast is activate, the window size is then computed with - backcast_period * n_prediction_steps - transform_time_features (bool) if time feature trasnformation is applied + dataset_properties (Optional[Dict]): + dataset properties + batch_size (int): + batch size + window_size (int): + window size, (if activate) this value directly determines the window_size of the data loader + num_batches_per_epoch (int): + how many batches are trained at each iteration + sample_strategy(str): + how samples are distributed. if it is LengthUnifrom, then every single data point has the same + probability to be sampled, in which case longer sequence will occupy more samples. 
If it is + SeqUniform, then every sequence has the same probability to be sampled regardless of their length + backcast (bool): + if back_cast module is activate (in which case window size is a multiple of n_prediction_steps) + backcast_period (int): + activate if backcast is activate, the window size is then computed with backcast_period * n_prediction_steps + transform_time_features (bool) + if time feature trasnformation is applied Returns: - cs: Configuration Space + cs: + Configuration Space """ cs = ConfigurationSpace() diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py index bae840ef7..20c83b396 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_util.py @@ -127,36 +127,35 @@ def __call__(self, batch: Sequence[torch.Tensor], sample_interval: int = 1, class TimeSeriesSampler(SubsetRandomSampler): + """ + A sampler designed for time series sequence. For the sake of efficiency, it will not sample each possible + sequences from indices. Instead, it samples 'num_instances_per_seqs' for each sequence. This sampler samples + the instances in a Latin-Hypercube likewise way: we divide each sequence in to num_instances_per_seqs interval + and randomly sample one instance from each interval. If num_instances_per_seqs is not an integral, then the + first interval is selected with a certain probability: + for instance, if we want to sample 1.3 instance from a sequence [0,1,2,3,4,5], then we first divide the seuqence + into two parts: [0, 3] and [3, 6], one sample is sampled from the second part, while an expected value of 0.3 is + sampled from the first part (This part will be sampled in the very end with torch.multinomial) + + Attributes: + indices (Sequence[int]): + The set of all the possible indices that can be sampled from + seq_lengths (Union[Sequence[int], np.ndarray]): + lengths of each sequence, applied to unsqueeze indices + num_instances_per_seqs (Optional[List[int]]): + expected number of instances to be sampled in each sequence, if it is None, all the sequences will be + sampled + min_start (int): + how many first time steps we want to skip (the first few sequences need to be padded with 0) + generator (Optional[torch.Generator]): + pytorch generator to control the randomness + """ def __init__(self, indices: Sequence[int], seq_lengths: Union[Sequence[int], np.ndarray], num_instances_per_seqs: Optional[Union[List[float], np.ndarray]] = None, min_start: int = 0, generator: Optional[torch.Generator] = None) -> None: - """ - A sampler designed for time series sequence. For the sake of efficiency, it will not sample each possible - sequences from indices. Instead, it samples 'num_instances_per_seqs' for each sequence. This sampler samples - the instances in a Latin-Hypercube likewise way: we divide each sequence in to num_instances_per_seqs interval - and randomly sample one instance from each interval. 
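The stratification described in the TimeSeriesSampler docstring above can be illustrated in isolation. The sketch below only mirrors the integral case (one index drawn per equally sized interval); the fractional case handled via torch.multinomial and the sampler's remaining bookkeeping are omitted, and the helper name is made up for the example:

    import numpy as np

    def stratified_indices(seq_length: int, num_instances: int,
                           rng: np.random.Generator) -> np.ndarray:
        # Split [0, seq_length) into num_instances equally sized intervals and
        # draw one index from each interval (Latin-hypercube-like stratification).
        bounds = np.linspace(0, seq_length, num_instances + 1)
        lower = np.floor(bounds[:-1]).astype(int)
        upper = np.ceil(bounds[1:]).astype(int)
        return np.array([rng.integers(lo, hi) for lo, hi in zip(lower, upper)])

    rng = np.random.default_rng(seed=0)
    print(stratified_indices(seq_length=6, num_instances=2, rng=rng))
    # one index from [0, 3) and one from [3, 6), cf. the [0, 3] / [3, 6] split in the docstring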
If num_instances_per_seqs is not an integral, then the - first interval is selected with a certain probability: - for instance, if we want to sample 1.3 instance from a sequence [0,1,2,3,4,5], then we first divide the seuqence - into two parts: [0, 3] and [3, 6], one sample is sampled from the second part, while an expected value of 0.3 is - sampled from the first part (This part will be sampled in the very end with torch.multinomial) - - Parameters - ---------- - indices: Sequence[int] - The set of all the possible indices that can be sampled from - seq_lengths: Union[Sequence[int], np.ndarray] - lengths of each sequence, applied to unsqueeze indices - num_instances_per_seqs: Optional[List[int]]=None - expected number of instances to be sampled in each sequence, if it is None, all the sequences will be - sampled - min_start: int - the how many first instances we want to skip (the first few sequences need to be padded with 0) - generator: Optional[torch.Generator] - pytorch generator to control the randomness - """ super().__init__(indices, generator) if num_instances_per_seqs is None: self.iter_all_seqs = True @@ -246,6 +245,21 @@ def __len__(self) -> int: class SequentialSubSetSampler(SequentialSampler): + """ + Sampler for validation set that allows to sample only a fraction of the datasetset. For those datasets that + have a big amount of datapoints. This function helps to reduce the inference time during validation after each + epoch + + + Attributes: + data_source (Dataset): + dataset to sample from, it is composed of several TimeSeriesSequence. for each TimeSeriesSequence only 1 + sample is allowed + num_samples (int): + number of samples to be sampled from the dataset source + generator (Optional[torch.Generator]): + torch random generator + """ data_source: Sized def __init__(self, data_source: Sized, num_samples: int, generator: Optional[torch.Generator] = None) -> None: diff --git a/autoPyTorch/pipeline/components/training/metrics/base.py b/autoPyTorch/pipeline/components/training/metrics/base.py index d58124789..0cac3c560 100644 --- a/autoPyTorch/pipeline/components/training/metrics/base.py +++ b/autoPyTorch/pipeline/components/training/metrics/base.py @@ -201,28 +201,27 @@ def __call__( # type: ignore[override] sample_weight: Optional[List[float]] = None, **kwarg: Any, ) -> float: - """Evaluate time series forecasting losses given input data + """ + Evaluate time series forecasting losses given input data The description is nearly the same as the one defined under https://www.sktime.org/en/stable/api_reference/performance_metrics.html - Parameters - ---------- - y_true : array-like, [n_seq x n_prediction_steps, n_output] - Ground truth (correct) target values. - - y_pred : array-like, [n_seq x n_prediction_steps, n_output] - Forecasted values. - - sp: int + Args: + y_true (np.ndarray): + array-like ([n_seq x n_prediction_steps, n_output]). Ground truth (correct) target values. + y_pred (np.ndarray): + array-like ([n_seq x n_prediction_steps, n_output]). Forecasted values. + sp (int): Seasonal periodicity of training data. - - horizon_weight : array-like, optional (default=None) + horizon_weight (Optional[List[float]]): Forecast horizon weights. - TODO consider weights for each individual prediction, i.e., we could mask the unobserved values + sample_weight (Optional[List[float]]): + weights w.r.t. each sample + Returns ------- - score : float - Score function applied to prediction of estimator on X. + score (float): + Score function applied to prediction of estimator on X. 
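The reshape in the metric body that follows regroups the flat [n_seq x n_prediction_steps, n_output] arrays so that axis 0 indexes the forecasting horizon before the per-step losses are aggregated. A stripped-down sketch of that pattern, with a plain squared error standing in for the sktime-style metric function (sp, horizon_weight and the configurable aggregation are left out):

    import numpy as np

    def mean_squared_forecasting_loss(y_true: np.ndarray, y_pred: np.ndarray,
                                      n_prediction_steps: int) -> float:
        # regroup so that axis 0 indexes the forecasting horizon
        y_true = y_true.reshape((n_prediction_steps, -1))
        y_pred = y_pred.reshape((n_prediction_steps, -1))
        per_step = ((y_true - y_pred) ** 2).mean(axis=1)  # one loss value per horizon step
        return float(per_step.mean())                     # 'mean' aggregation over the horizon

    y_true = np.arange(12, dtype=float).reshape(-1, 1)    # 4 sequences x 3 steps, 1 output
    print(mean_squared_forecasting_loss(y_true, y_true + 0.5, n_prediction_steps=3))  # 0.25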
""" agg = self._kwargs['aggregation'] @@ -250,7 +249,7 @@ def __call__( # type: ignore[override] # shape is [n_prediction_steps, n_sequence * n_outputs] y_true = y_true.reshape((n_prediction_steps, -1)) y_pred = y_pred.reshape((n_prediction_steps, -1)) - + # TODO consider weights for each individual prediction, i.e., we could mask the unobserved values losses_all: np.ndarray = self._metric_func(y_true=y_true, y_pred=y_pred, sp=sp, @@ -275,15 +274,15 @@ def __call__( # type: ignore[override] def make_metric( - name: str, - score_func: Callable, - optimum: float = 1.0, - worst_possible_result: float = 0.0, - greater_is_better: bool = True, - needs_proba: bool = False, - needs_threshold: bool = False, - do_forecasting: bool = False, - **kwargs: Any + name: str, + score_func: Callable, + optimum: float = 1.0, + worst_possible_result: float = 0.0, + greater_is_better: bool = True, + needs_proba: bool = False, + needs_threshold: bool = False, + do_forecasting: bool = False, + **kwargs: Any ) -> autoPyTorchMetric: """ Make a autoPyTorchMetric from a performance metric or loss function. diff --git a/autoPyTorch/pipeline/components/training/metrics/utils.py b/autoPyTorch/pipeline/components/training/metrics/utils.py index 080862555..80adfbe73 100644 --- a/autoPyTorch/pipeline/components/training/metrics/utils.py +++ b/autoPyTorch/pipeline/components/training/metrics/utils.py @@ -22,7 +22,7 @@ def sanitize_array(array: np.ndarray) -> np.ndarray: """ Replace NaN and Inf (there should not be any!) - :param array:z + :param array: :return: """ a = np.ravel(array) @@ -117,11 +117,12 @@ def get_metrics(dataset_properties: Dict[str, Any], def calculate_score( - target: np.ndarray, - prediction: np.ndarray, - task_type: int, - metrics: Iterable[autoPyTorchMetric], - **score_kwargs: Any) -> Dict[str, float]: + target: np.ndarray, + prediction: np.ndarray, + task_type: int, + metrics: Iterable[autoPyTorchMetric], + **score_kwargs: Any +) -> Dict[str, float]: score_dict = dict() if task_type in FORECASTING_TASKS: cprediction = sanitize_array(prediction) @@ -150,6 +151,7 @@ def calculate_score( continue else: raise e + else: for metric_ in metrics: try: diff --git a/autoPyTorch/pipeline/create_searchspace_util.py b/autoPyTorch/pipeline/create_searchspace_util.py index 7b13542df..640a787e2 100644 --- a/autoPyTorch/pipeline/create_searchspace_util.py +++ b/autoPyTorch/pipeline/create_searchspace_util.py @@ -64,7 +64,6 @@ def find_active_choices( ) -> List[str]: if not hasattr(node, "get_available_components"): raise ValueError() - available_components = node.get_available_components(dataset_properties, include=include, exclude=exclude) diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 1edcc1d03..27c4bfd53 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -76,7 +76,6 @@ class TimeSeriesForecastingPipeline(RegressorMixin, BasePipeline): random_state is the random number generator Attributes: - Examples """ def __init__(self, @@ -194,20 +193,6 @@ def _get_hyperparameter_search_space(self, except IndexError: raise ValueError("Cannot find a legal default configuration") cs.get_hyperparameter('network_embedding:__choice__').default_value = default - """ - # in this case we cannot deactivate the hps, we might need to think about this - if 'RegressionLoss' in hp_loss.choices: - forbidden_hp_regression_loss = ForbiddenEqualsClause(hp_loss, 'RegressionLoss') - for hp_dist in 
hp_distribution_children: - forbidden_hp_dist = ForbiddenEqualsClause(hp_dist, True) - forbidden_hp_dist = AndConjunction(forbidden_hp_dist, - forbidden_hp_regression_loss) - forbidden_regression_losses_all.append(forbidden_hp_dist) - else: - for hp_dist in hp_distribution_children: - forbidden_hp_dist = ForbiddenEqualsClause(hp_dist, True) - forbidden_regression_losses_all.append(forbidden_hp_dist) - """ if 'network_backbone:flat_encoder:__choice__' in cs: hp_flat_encoder = cs.get_hyperparameter('network_backbone:flat_encoder:__choice__') @@ -354,7 +339,6 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L ("preprocessing", TimeSeriesEarlyPreprocessing(random_state=self.random_state)), ]) - # TODO consider the correct way of doing imputer for time series forecasting tasks. steps.extend([ ("target_imputer", TimeSeriesTargetImputer(random_state=self.random_state)), ("target_preprocessing", TimeSeriesTargetEarlyPreprocessing(random_state=self.random_state)), diff --git a/test/conftest.py b/test/conftest.py index e686b4904..2bc292fff 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -32,6 +32,7 @@ from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates from autoPyTorch.utils.pipeline import get_dataset_requirements + N_SAMPLES = 300 @@ -45,19 +46,19 @@ def callattr_ahead_of_alltests(request): """ tasks_used = [ 146818, # Australian - 2295, # cholesterol - 2075, # abalone - 2071, # adult - 3, # kr-vs-kp - 9981, # cnae-9 + 2295, # cholesterol + 2075, # abalone + 2071, # adult + 3, # kr-vs-kp + 9981, # cnae-9 146821, # car 146822, # Segment - 2, # anneal - 53, # vehicle - 5136, # tecator - 4871, # sensory - 4857, # boston - 3916, # kc1 + 2, # anneal + 53, # vehicle + 5136, # tecator + 4871, # sensory + 4857, # boston + 3916, # kc1 ] # Populate the cache diff --git a/test/test_api/utils.py b/test/test_api/utils.py index f4cd1c2d9..2228e88a7 100644 --- a/test/test_api/utils.py +++ b/test/test_api/utils.py @@ -106,10 +106,8 @@ def dummy_eval_train_function( ) -> None: if evaluator_class is None: evaluator_class = DummyTrainEvaluator - elif isinstance(evaluator_class, FORECASTING_TASKS): + else: evaluator_class = DummyForecastingEvaluator - import pdb - pdb.set_trace() evaluator = evaluator_class( backend=backend, diff --git a/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py b/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py index 33fb37960..a415e2e22 100644 --- a/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py +++ b/test/test_pipeline/components/setup/forecasting/test_forecasting_target_scaling.py @@ -1,4 +1,3 @@ -import copy import unittest import torch diff --git a/test/test_pipeline/components/training/base.py b/test/test_pipeline/components/training/base.py index b5feba9f1..b4db199e1 100644 --- a/test/test_pipeline/components/training/base.py +++ b/test/test_pipeline/components/training/base.py @@ -11,7 +11,8 @@ CONTINUOUS, OUTPUT_TYPES_TO_STRING, REGRESSION_TASKS, - TASK_TYPES_TO_STRING) + TASK_TYPES_TO_STRING +) from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker diff --git a/test/test_pipeline/components/training/test_feature_data_loader.py b/test/test_pipeline/components/training/test_feature_data_loader.py index 77cf82152..7d4c9d80d 100644 --- 
a/test/test_pipeline/components/training/test_feature_data_loader.py +++ b/test/test_pipeline/components/training/test_feature_data_loader.py @@ -3,7 +3,9 @@ import torchvision -from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import FeatureDataLoader +from autoPyTorch.pipeline.components.training.data_loader.feature_data_loader import ( + FeatureDataLoader +) class TestFeatureDataLoader(unittest.TestCase): From bb7f5c5648c16f61cba7d412de30546650060ce3 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Thu, 16 Jun 2022 20:01:02 +0200 Subject: [PATCH 335/347] fix pre-commit --- .../pipeline/components/setup/network/forecasting_network.py | 2 +- .../data_loader/time_series_forecasting_data_loader.py | 3 ++- test/test_api/test_api.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py index 909ad0f4a..2750348a5 100644 --- a/autoPyTorch/pipeline/components/setup/network/forecasting_network.py +++ b/autoPyTorch/pipeline/components/setup/network/forecasting_network.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Dict, Iterable, Optional import numpy as np diff --git a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py index b54468cca..3ddd66b2a 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/time_series_forecasting_data_loader.py @@ -560,7 +560,8 @@ def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = {}, backcast (bool): if back_cast module is activate (in which case window size is a multiple of n_prediction_steps) backcast_period (int): - activate if backcast is activate, the window size is then computed with backcast_period * n_prediction_steps + activate if backcast is activate, the window size is then computed with + backcast_period * n_prediction_steps transform_time_features (bool) if time feature trasnformation is applied diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 87a442df6..9a16bd5b1 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -430,6 +430,7 @@ def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, b else: X_train = None X_test = None + known_future_features = None y_train = [] y_test = [] @@ -464,7 +465,7 @@ def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, b freq=freq, total_walltime_limit=50, func_eval_time_limit_secs=20, - #known_future_features=known_future_features, + known_future_features=known_future_features, ) # Internal dataset has expected settings From 9d728b58ec4a71f05b0873eda26524f35a2db9b9 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 17 Jun 2022 11:08:09 +0200 Subject: [PATCH 336/347] make test compatible with py37 --- test/test_evaluation/test_forecasting_evaluators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_evaluation/test_forecasting_evaluators.py b/test/test_evaluation/test_forecasting_evaluators.py index 677a268e1..580402d5c 100644 --- a/test/test_evaluation/test_forecasting_evaluators.py +++ b/test/test_evaluation/test_forecasting_evaluators.py @@ -273,4 +273,4 @@ def test_finish_up(self, pipeline_mock, queue_mock): 
status=StatusType.SUCCESS, **metric_kwargs ) - self.assertTrue('test_loss' in queue_mock.put.call_args.args[0]['additional_run_info']) + self.assertTrue('test_loss' in queue_mock.put.call_args[0][0]['additional_run_info']) From a331093bdc9daa8edd1921a4a02fe2d10ad83546 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 17 Jun 2022 13:49:56 +0200 Subject: [PATCH 337/347] maint docstring --- .../forecasting_backbone/forecasting_decoder/NBEATSDecoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py index 44ced8be9..419c9ef34 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/forecasting_backbone/forecasting_decoder/NBEATSDecoder.py @@ -35,7 +35,7 @@ class NBEATSBlock(DecoderNetwork): Oreshkin et al., N-BEATS: Neural basis expansion analysis for interpretable time series forecasting https://arxiv.org/abs/1905.10437 - The hyperaprameter defination are quite simialr to + The hyperaprameter definitions are quite similar to https://github.com/jdb78/pytorch-forecasting/tree/master/pytorch_forecasting/models/nbeats However, we only construct the forecast/ backcast head under From 8a5a91b0c71a33a73bd2d382e0c4f9e6462a55bb Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 17 Jun 2022 15:39:03 +0200 Subject: [PATCH 338/347] split forecasting_eval_train_function from eval_train_function --- autoPyTorch/evaluation/tae.py | 5 +- ...time_series_forecasting_train_evaluator.py | 115 ++++++++++++++++++ autoPyTorch/evaluation/train_evaluator.py | 9 +- test/test_api/test_api.py | 11 +- test/test_api/utils.py | 55 +++++++-- 5 files changed, 174 insertions(+), 21 deletions(-) diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 299cd4810..b144da76f 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -34,7 +34,7 @@ NoResamplingStrategyTypes ) from autoPyTorch.evaluation.test_evaluator import eval_test_function -from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import TimeSeriesForecastingTrainEvaluator +from autoPyTorch.evaluation.time_series_forecasting_train_evaluator import forecasting_eval_train_function from autoPyTorch.evaluation.train_evaluator import eval_train_function from autoPyTorch.evaluation.utils import ( DisableFileOutputParameters, @@ -152,8 +152,7 @@ def __init__( self.resampling_strategy_args = dm.resampling_strategy_args if STRING_TO_TASK_TYPES.get(dm.task_type, -1) == TIMESERIES_FORECASTING: - eval_function: Callable = functools.partial(eval_train_function, - evaluator_class=TimeSeriesForecastingTrainEvaluator) + eval_function: Callable = forecasting_eval_train_function if isinstance(self.resampling_strategy, (HoldoutValTypes, CrossValTypes)): self.output_y_hat_optimization = output_y_hat_optimization elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index e78dd8030..8f83ac09d 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -441,3 +441,118 @@ def _predict(self, 
pipeline: BaseEstimator, test_pred = None return np.empty(1), opt_pred, valid_pred, test_pred + + +# create closure for evaluating an algorithm +def forecasting_eval_train_function( + backend: Backend, + queue: Queue, + metric: autoPyTorchMetric, + budget: float, + config: Optional[Configuration], + seed: int, + output_y_hat_optimization: bool, + num_run: int, + include: Optional[Dict[str, Any]], + exclude: Optional[Dict[str, Any]], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + pipeline_config: Optional[Dict[str, Any]] = None, + budget_type: str = None, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + instance: str = None, + max_budget: float = 1.0, + min_num_test_instances: Optional[int] = None +) -> None: + """ + This closure allows the communication between the ExecuteTaFuncWithQueue and the + pipeline trainer (TrainEvaluator). + + Fundamentally, smac calls the ExecuteTaFuncWithQueue.run() method, which internally + builds a TrainEvaluator. The TrainEvaluator builds a pipeline, stores the output files + to disc via the backend, and puts the performance result of the run in the queue. + + + Attributes: + backend (Backend): + An object to interface with the disk storage. In particular, allows to + access the train and test datasets + queue (Queue): + Each worker available will instantiate an evaluator, and after completion, + it will return the evaluation result via a multiprocessing queue + metric (autoPyTorchMetric): + A scorer object that is able to evaluate how good a pipeline was fit. It + is a wrapper on top of the actual score method (a wrapper on top of scikit + lean accuracy for example) that formats the predictions accordingly. + budget: (float): + The amount of epochs/time a configuration is allowed to run. + budget_type (str): + The budget type, which can be epochs or time + pipeline_config (Optional[Dict[str, Any]]): + Defines the content of the pipeline being evaluated. For example, it + contains pipeline specific settings like logging name, or whether or not + to use tensorboard. + config (Union[int, str, Configuration]): + Determines the pipeline to be constructed. + seed (int): + A integer that allows for reproducibility of results + output_y_hat_optimization (bool): + Whether this worker should output the target predictions, so that they are + stored on disk. Fundamentally, the resampling strategy might shuffle the + Y_train targets, so we store the split in order to re-use them for ensemble + selection. + num_run (Optional[int]): + An identifier of the current configuration being fit. This number is unique per + configuration. + include (Optional[Dict[str, Any]]): + An optional dictionary to include components of the pipeline steps. + exclude (Optional[Dict[str, Any]]): + An optional dictionary to exclude components of the pipeline steps. + disable_file_output (Union[bool, List[str]]): + By default, the model, it's predictions and other metadata is stored on disk + for each finished configuration. This argument allows the user to skip + saving certain file type, for example the model, from being written to disk. + init_params (Optional[Dict[str, Any]]): + Optional argument that is passed to each pipeline step. It is the equivalent of + kwargs for the pipeline steps. 
+ logger_port (Optional[int]): + Logging is performed using a socket-server scheme to be robust against many + parallel entities that want to write to the same file. This integer states the + socket port for the communication channel. If None is provided, a traditional + logger is used. + instance (str): + An instance on which to evaluate the current pipeline. By default we work + with a single instance, being the provided X_train, y_train of a single dataset. + This instance is a compatibility argument for SMAC, that is capable of working + with multiple datasets at the same time. + max_budget (float): + maximal budget value available for the optimizer. This is applied to compute the size of the proxy + validation sets + min_num_test_instances (Optional[int]): + minimal number of instances to be validated. We do so to ensure that there are enough instances in + the validation set + """ + evaluator = TimeSeriesForecastingTrainEvaluator( + backend=backend, + queue=queue, + metric=metric, + configuration=config, + seed=seed, + num_run=num_run, + output_y_hat_optimization=output_y_hat_optimization, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates, + max_budget=max_budget, + min_num_test_instances=min_num_test_instances, + ) + evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index e761cc77b..c72745723 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -427,8 +427,6 @@ def eval_train_function( all_supported_metrics: bool = True, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, instance: str = None, - evaluator_class: Type[TrainEvaluator] = TrainEvaluator, - **evaluator_kwargs: Any, ) -> None: """ This closure allows the communication between the ExecuteTaFuncWithQueue and the @@ -491,12 +489,8 @@ def eval_train_function( with a single instance, being the provided X_train, y_train of a single dataset. This instance is a compatibility argument for SMAC, that is capable of working with multiple datasets at the same time. 
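The docstring above only states that max_budget and min_num_test_instances steer the size of the proxy validation set; the exact rule lives inside TimeSeriesForecastingTrainEvaluator and is not shown in this patch. One plausible sizing rule, purely for illustration and not the evaluator's actual implementation:

    def proxy_validation_size(num_val_instances: int, budget: float,
                              max_budget: float, min_num_test_instances: int) -> int:
        # Illustrative only: validate a budget-proportional fraction of the sequences,
        # but never fewer than the configured minimum and never more than all of them.
        if num_val_instances <= min_num_test_instances:
            return num_val_instances
        scaled = int(num_val_instances * budget / max_budget)
        return min(num_val_instances, max(min_num_test_instances, scaled))

    print(proxy_validation_size(1000, budget=10.0, max_budget=50.0, min_num_test_instances=100))  # 200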
- evaluator_class (Type[AbstractEvaluator]): - the class name of evaluator, when not specified, it is set as vanilla TrainEvaluator - evaluator_kwargs: Any - additionally evaluation kwargs """ - evaluator = evaluator_class( + evaluator = TrainEvaluator( backend=backend, queue=queue, metric=metric, @@ -514,6 +508,5 @@ def eval_train_function( all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, search_space_updates=search_space_updates, - **evaluator_kwargs # type: ignore ) evaluator.fit_predict_and_loss() diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 9a16bd5b1..c02459552 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -4,7 +4,11 @@ import pickle import tempfile import unittest -from test.test_api.utils import dummy_do_dummy_prediction, dummy_eval_train_function +from test.test_api.utils import ( + dummy_do_dummy_prediction, + dummy_eval_train_function, + dummy_forecasting_eval_train_function +) import ConfigSpace as CS from ConfigSpace.configuration_space import Configuration @@ -407,8 +411,11 @@ def test_tabular_regression(openml_name, resampling_strategy, backend, resamplin @pytest.mark.parametrize('forecasting_toy_dataset', ['uni_variant_wo_missing'], indirect=True) +@unittest.mock.patch('autoPyTorch.evaluation.time_series_forecasting_train_evaluator.forecasting_eval_train_function', + new=dummy_forecasting_eval_train_function) @pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', ((HoldoutValTypes.time_series_hold_out_validation, None), + (CrossValTypes.k_fold_cross_validation, {'num_splits': CV_NUM_SPLITS}) )) def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, backend, resampling_strategy_args): forecast_horizon = 3 @@ -459,7 +466,7 @@ def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, b y_train=y_train, X_test=X_test, y_test=y_test, - memory_limit=None, + memory_limit=8192, optimize_metric='mean_MSE_forecasting', n_prediction_steps=forecast_horizon, freq=freq, diff --git a/test/test_api/utils.py b/test/test_api/utils.py index 2228e88a7..968fafe5b 100644 --- a/test/test_api/utils.py +++ b/test/test_api/utils.py @@ -101,15 +101,53 @@ def dummy_eval_train_function( all_supported_metrics=True, search_space_updates=None, instance: str = None, - evaluator_class=None, - **evaluator_kwargs, ) -> None: - if evaluator_class is None: - evaluator_class = DummyTrainEvaluator - else: - evaluator_class = DummyForecastingEvaluator + evaluator = DummyTrainEvaluator( + backend=backend, + queue=queue, + metric=metric, + configuration=config, + seed=seed, + num_run=num_run, + output_y_hat_optimization=output_y_hat_optimization, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates, + ) + evaluator.fit_predict_and_loss() + - evaluator = evaluator_class( +# create closure for evaluating an algorithm +def dummy_forecasting_eval_train_function( + backend, + queue, + metric, + budget: float, + config, + seed: int, + output_y_hat_optimization: bool, + num_run: int, + include, + exclude, + disable_file_output, + pipeline_config=None, + budget_type=None, + init_params=None, + logger_port=None, + all_supported_metrics=True, + search_space_updates=None, + instance: str = None, + max_budget=1.0, + 
min_num_test_instances=None +) -> None: + evaluator = DummyForecastingEvaluator( backend=backend, queue=queue, metric=metric, @@ -127,7 +165,8 @@ def dummy_eval_train_function( all_supported_metrics=all_supported_metrics, pipeline_config=pipeline_config, search_space_updates=search_space_updates, - **evaluator_kwargs + max_budget=max_budget, + min_num_test_instances=min_num_test_instances, ) evaluator.fit_predict_and_loss() From acddd2284fc783589488dfbe7011212eb92e9142 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 17 Jun 2022 15:45:21 +0200 Subject: [PATCH 339/347] fix namespace for test_api from train_evaluator to tae --- test/test_api/test_api.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index c02459552..732720270 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -48,7 +48,7 @@ # Test # ==== -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_train_function', +@unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function', new=dummy_eval_train_function) @pytest.mark.parametrize('openml_id', (40981, )) @pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', @@ -225,7 +225,7 @@ def test_tabular_classification(openml_id, resampling_strategy, backend, resampl @pytest.mark.parametrize('openml_name', ("boston", )) -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_train_function', +@unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function', new=dummy_eval_train_function) @pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', ((HoldoutValTypes.holdout_validation, None), @@ -411,7 +411,7 @@ def test_tabular_regression(openml_name, resampling_strategy, backend, resamplin @pytest.mark.parametrize('forecasting_toy_dataset', ['uni_variant_wo_missing'], indirect=True) -@unittest.mock.patch('autoPyTorch.evaluation.time_series_forecasting_train_evaluator.forecasting_eval_train_function', +@unittest.mock.patch('autoPyTorch.evaluation.tae.forecasting_eval_train_function', new=dummy_forecasting_eval_train_function) @pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', ((HoldoutValTypes.time_series_hold_out_validation, None), @@ -677,7 +677,7 @@ def test_do_dummy_prediction(dask_client, fit_dictionary_tabular): del estimator -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_train_function', +@unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function', new=dummy_eval_train_function) @pytest.mark.parametrize('openml_id', (40981, )) def test_portfolio_selection(openml_id, backend, n_samples): @@ -719,7 +719,7 @@ def test_portfolio_selection(openml_id, backend, n_samples): assert any(successful_config in portfolio_configs for successful_config in successful_configs) -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_train_function', +@unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function', new=dummy_eval_train_function) @pytest.mark.parametrize('openml_id', (40981, )) def test_portfolio_selection_failure(openml_id, backend, n_samples): From b18ce927a2790ec26136a9227d58a4c375738bb9 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 17 Jun 2022 16:27:21 +0200 Subject: [PATCH 340/347] maint test api for forecasting --- test/test_api/test_api.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 732720270..022357f26 100644 --- a/test/test_api/test_api.py 
+++ b/test/test_api/test_api.py @@ -415,7 +415,7 @@ def test_tabular_regression(openml_name, resampling_strategy, backend, resamplin new=dummy_forecasting_eval_train_function) @pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', ((HoldoutValTypes.time_series_hold_out_validation, None), - (CrossValTypes.k_fold_cross_validation, {'num_splits': CV_NUM_SPLITS}) + (CrossValTypes.time_series_cross_validation, {'num_splits': CV_NUM_SPLITS}), )) def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, backend, resampling_strategy_args): forecast_horizon = 3 @@ -457,7 +457,6 @@ def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, b resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, seed=42, - include_components={'network_backbone': {'flat_encoder:MLPEncoder'}} ) with unittest.mock.patch.object(estimator, '_do_dummy_prediction', new=dummy_do_dummy_prediction): @@ -466,12 +465,12 @@ def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, b y_train=y_train, X_test=X_test, y_test=y_test, - memory_limit=8192, + memory_limit=None, optimize_metric='mean_MSE_forecasting', n_prediction_steps=forecast_horizon, freq=freq, - total_walltime_limit=50, - func_eval_time_limit_secs=20, + total_walltime_limit=30, + func_eval_time_limit_secs=10, known_future_features=known_future_features, ) From 0700e61125b7272912ef60cb80a43074427ba7e8 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 17 Jun 2022 16:31:28 +0200 Subject: [PATCH 341/347] decrease number of ensemble size of test_time_series_forecasting to reduce test time --- test/test_api/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 022357f26..465d74c6b 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -456,6 +456,7 @@ def test_time_series_forecasting(forecasting_toy_dataset, resampling_strategy, b backend=backend, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, + ensemble_size=2, seed=42, ) From e4328ee932ecd19ed3259f82034e09f4cc0bf9cd Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 17 Jun 2022 16:31:55 +0200 Subject: [PATCH 342/347] flatten all the prediction for forecasting pipelines --- autoPyTorch/evaluation/abstract_evaluator.py | 4 ++-- autoPyTorch/pipeline/time_series_forecasting.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 44a0b4e99..f5f10f664 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -353,14 +353,14 @@ def _generate_dummy_forecasting(self, X: List[Union[TimeSeriesSequence, np.ndarr def predict_proba(self, X: Union[np.ndarray, pd.DataFrame], batch_size: int = 1000) -> np.ndarray: X_tail = self._generate_dummy_forecasting(X) - return np.tile(X_tail, (1, self.n_prediction_steps)).astype(np.float32).squeeze() + return np.tile(X_tail, (1, self.n_prediction_steps)).astype(np.float32).flatten() def predict(self, X: Union[np.ndarray, pd.DataFrame], batch_size: int = 1000) -> np.ndarray: X_tail = np.asarray(self._generate_dummy_forecasting(X)) if X_tail.ndim == 1: X_tail = np.expand_dims(X_tail, -1) - return np.tile(X_tail, (1, self.n_prediction_steps)).astype(np.float32).squeeze() + return np.tile(X_tail, (1, self.n_prediction_steps)).astype(np.float32).flatten() @staticmethod def 
get_default_pipeline_options() -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/time_series_forecasting.py b/autoPyTorch/pipeline/time_series_forecasting.py index 27c4bfd53..53143e4df 100644 --- a/autoPyTorch/pipeline/time_series_forecasting.py +++ b/autoPyTorch/pipeline/time_series_forecasting.py @@ -436,6 +436,6 @@ def predict(self, warnings.warn('| WARNING: ran out of memory, retrying batch') torch.cuda.empty_cache() batch_size = batch_size // 2 - return self.predict(X, batch_size=batch_size // 2) + return self.predict(X, batch_size=batch_size // 2).flatten() else: raise e From b6baef1aa2067c17e3eba07a15555e7c7ead35a9 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Fri, 17 Jun 2022 16:35:21 +0200 Subject: [PATCH 343/347] pre-commit fix --- autoPyTorch/evaluation/train_evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index c72745723..142af6bcc 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -1,5 +1,5 @@ from multiprocessing.queues import Queue -from typing import Any, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration From 0771c8e011a1e19fa204bfd949b8a259b153b7c8 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Mon, 20 Jun 2022 16:49:50 +0200 Subject: [PATCH 344/347] fix docstrings and typing --- .../data/time_series_forecasting_validator.py | 3 +++ autoPyTorch/datasets/time_series_dataset.py | 17 +++++++---------- .../time_series_forecasting_train_evaluator.py | 2 +- .../components/training/metrics/metrics.py | 3 ++- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/autoPyTorch/data/time_series_forecasting_validator.py b/autoPyTorch/data/time_series_forecasting_validator.py index d6bcfb12e..c19224b70 100644 --- a/autoPyTorch/data/time_series_forecasting_validator.py +++ b/autoPyTorch/data/time_series_forecasting_validator.py @@ -22,6 +22,7 @@ class TimeSeriesForecastingInputValidator(TabularInputValidator): As a time series forecasting dataset might contain several time sequence with different length, we will transform all the data to DataFrameGroupBy whereas each group represents a series TODO for multiple output: target names and shapes + TODO check if we can compress time series forecasting datasets """ def __init__( @@ -56,6 +57,7 @@ def fit( # type: ignore[override] ) -> BaseEstimator: """ fit the validator with the training data, (optionally) start times and other information + Args: X_train (Optional[Union[List, pd.DataFrame]]): training features, could be None for uni-variant forecasting tasks @@ -175,6 +177,7 @@ def transform( # type: ignore[override] ) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame], np.ndarray]: """ transform the data with the fitted validator + Args: X: Optional[Union[List, pd.DataFrame]] time features diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 022e6360e..5e477897f 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -38,6 +38,7 @@ HoldOutFuncs, HoldoutValTypes, NoResamplingStrategyTypes, + ResamplingStrategies ) from autoPyTorch.pipeline.components.training.metrics.metrics import compute_mase_coefficient from autoPyTorch.utils.common import FitRequirement @@ -472,7 +473,7 @@ class TimeSeriesForecastingDataset(BaseDataset, ConcatDataset): 
gluonts.time_feature freq (Optional[Union[str, int, List[int]]]): the frequency that the data is sampled. It needs to keep consistent within one dataset - resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]]) + resampling_strategy (Optional[ResamplingStrategies]) resampling strategy. We designed several special resampling resampling_strategy for forecasting tasks. Please refer to autoPyTorch.datasets.resampling_strategy resampling_strategy_args (Optional[Dict[str, Any]]): @@ -509,9 +510,7 @@ def __init__(self, known_future_features: Optional[Union[Tuple[Union[str, int]], Tuple[()]]] = None, time_feature_transform: Optional[List[TimeFeature]] = None, freq: Optional[Union[str, int, List[int]]] = None, - resampling_strategy: Optional[ - Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes] - ] = HoldoutValTypes.time_series_hold_out_validation, + resampling_strategy: Optional[ResamplingStrategies] = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, seed: Optional[int] = 42, train_transforms: Optional[torchvision.transforms.Compose] = None, @@ -1090,11 +1089,9 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> def get_split_strategy(sequence_lengths: List[int], n_prediction_steps: int, freq_value: Union[float, int], - resampling_strategy: Union[ - CrossValTypes, HoldoutValTypes, - NoResamplingStrategyTypes] = HoldoutValTypes.time_series_hold_out_validation, + resampling_strategy: ResamplingStrategies = HoldoutValTypes.time_series_hold_out_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, ) -> \ - Tuple[Optional[Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]], Optional[Dict[str, Any]]]: + Tuple[ResamplingStrategies, Optional[Dict[str, Any]]]: """ Determines the most possible sampling strategy for the datasets: the lengths of each sequence might not be long enough to support cross-validation split, thus we need to carefully compute the number of folds @@ -1105,12 +1102,12 @@ def get_split_strategy(sequence_lengths: List[int], forecasting horizon freq_value (Union[float, int]): period of the dataset, determined by its sampling frequency - resampling_strategy(Optional[Union[CrossValTypes, HoldoutValTypes]]): + resampling_strategy(ResamplingStrategies): resampling strategy to be checked resampling_strategy_args (Optional[Dict[str, Any]]): resampling strategy arguments to be checked Returns: - resampling_strategy(Optional[Union[CrossValTypes, HoldoutValTypes]]): + resampling_strategy(ResamplingStrategies): resampling strategy resampling_strategy_args (Optional[Dict[str, Any]]): resampling strategy arguments diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py index 8f83ac09d..cd3e074b3 100644 --- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py +++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py @@ -308,7 +308,7 @@ def fit_predict_and_loss(self) -> None: ]) if self.y_valid is not None: - warnings.warn('valid_pred is current unsuported for fore casting tasks!') + warnings.warn('valid_pred is currently unsupported for fore casting tasks!') Y_valid_preds = None if self.y_test is not None: diff --git a/autoPyTorch/pipeline/components/training/metrics/metrics.py b/autoPyTorch/pipeline/components/training/metrics/metrics.py index d2c8d98eb..51921dffb 100644 --- 
a/autoPyTorch/pipeline/components/training/metrics/metrics.py +++ b/autoPyTorch/pipeline/components/training/metrics/metrics.py @@ -60,7 +60,8 @@ def compute_mase_coefficient(past_target: Union[List, np.ndarray], sp: int) -> np.ndarray: """ compute mase coefficient, then mase value is computed as mase_coefficient * mse_error, - this function aims at reducing the memroy requirement + this function aims at reducing the memory requirement + Args: past_target (Optional[List, np.ndarray]): past target observations From d066fda1f79991dc4c46fe1b53a66ac5aa343165 Mon Sep 17 00:00:00 2001 From: dengdifan Date: Wed, 22 Jun 2022 12:01:49 +0200 Subject: [PATCH 345/347] maint time series dataset docstrings --- autoPyTorch/datasets/time_series_dataset.py | 123 +++++++++++--------- 1 file changed, 68 insertions(+), 55 deletions(-) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 5e477897f..9cea28562 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -49,6 +49,7 @@ def extract_feature_index(feature_shapes: Dict[str, int], queried_features: Union[Tuple[Union[str, int]], Tuple[()]]) -> Tuple[int]: """ extract the index of a set of queried_features from the extracted feature_shapes + Args: feature_shapes (dict): feature_shapes recoding the shape of each features @@ -93,6 +94,7 @@ def compute_time_features(start_time: pd.DatetimeIndex, class TimeSeriesSequence(Dataset): """ A dataset representing a time series sequence. It returns all the previous observations once it is asked for an item + Args: X (Optional[np.ndarray]): past features @@ -450,51 +452,52 @@ class TimeSeriesForecastingDataset(BaseDataset, ConcatDataset): """ Dataset class for time series forecasting used in AutoPyTorch. It consists of multiple TimeSeriesSequence. Train and test tensors are stored as pd.DataFrame whereas their index indicates which series the data belongs to + Args: - X (Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]]): - time series features. can be None if we work with a uni-variant forecasting task - Y (Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]): - forecasting targets. Must be given - X_test (Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]]): - known future features. It is a collection of series that has the same amount of data as X. It - is designed to be at the tail of X. If no feature is known in the future, this value can be omitted. - Y_test (Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None): - future targets. It is a collection of series that has the same data of series as Y. It is designed to be at - the tail of Y after the timestamps that need to be predicted. - start_times (Optional[List[pd.DatetimeIndex]]): - starting time of each series when they are sampled. If it is not given, we simply start with a fixed timestamp. - series_idx (Optional[Union[List[Union[str, int]], str, int]]): - (only works if X is stored as pd.DataFrame). This value is applied to identify towhich series the data belongs - if the data is presented as a "chunk" dataframe - known_future_features (Optional[Union[Tuple[Union[str, int]], Tuple[()]]]): - future features that are known in advance. For instance, holidays. - time_feature_transform (Optional[List[TimeFeature]]): - A list of time feature transformation methods implemented in gluonts. 
For more information, please check - gluonts.time_feature - freq (Optional[Union[str, int, List[int]]]): - the frequency that the data is sampled. It needs to keep consistent within one dataset - resampling_strategy (Optional[ResamplingStrategies]) - resampling strategy. We designed several special resampling resampling_strategy for forecasting tasks. Please - refer to autoPyTorch.datasets.resampling_strategy - resampling_strategy_args (Optional[Dict[str, Any]]): - arguments passed to resampling_strategy - seed (int): - random seeds - train_transforms (Optional[torchvision.transforms.Compose]): - Transformation applied to training data before it is fed to the dataloader - val_transforms (Optional[torchvision.transforms.Compose]): - Transformation applied to validation data before it is fed to the dataloader - validator (Optional[TimeSeriesForecastingInputValidator]): - Input Validator - lagged_value (Optional[List[int]]) - We could consider past targets as additional features for the current timestep. This item indicates the number - timesteps in advanced that we want to apply the targets as our current features - n_prediction_steps (int): - The number of steps you want to forecast into the future (forecast horizon) - dataset_name (Optional[str]): - dataset name - normalize_y(bool): - if targets are normalized within each series + X (Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]]): + time series features. can be None if we work with a uni-variant forecasting task + Y (Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]): + forecasting targets. Must be given + X_test (Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]]): + known future features. It is a collection of series that has the same amount of data as X. It + is designed to be at the tail of X. If no feature is known in the future, this value can be omitted. + Y_test (Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] = None): + future targets. It is a collection of series that has the same data of series as Y. It is designed to be at + the tail of Y after the timestamps that need to be predicted. + start_times (Optional[List[pd.DatetimeIndex]]): + starting time of each series when they are sampled. If it is not given, we simply start with a fixed timestamp. + series_idx (Optional[Union[List[Union[str, int]], str, int]]): + (only works if X is stored as pd.DataFrame). This value is applied to identify towhich series the data belongs + if the data is presented as a "chunk" dataframe + known_future_features (Optional[Union[Tuple[Union[str, int]], Tuple[()]]]): + future features that are known in advance. For instance, holidays. + time_feature_transform (Optional[List[TimeFeature]]): + A list of time feature transformation methods implemented in gluonts. For more information, please check + gluonts.time_feature + freq (Optional[Union[str, int, List[int]]]): + the frequency that the data is sampled. It needs to keep consistent within one dataset + resampling_strategy (Optional[ResamplingStrategies]) + resampling strategy. We designed several special resampling resampling_strategy for forecasting tasks. 
Please + refer to autoPyTorch.datasets.resampling_strategy + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments passed to resampling_strategy + seed (int): + random seeds + train_transforms (Optional[torchvision.transforms.Compose]): + Transformation applied to training data before it is fed to the dataloader + val_transforms (Optional[torchvision.transforms.Compose]): + Transformation applied to validation data before it is fed to the dataloader + validator (Optional[TimeSeriesForecastingInputValidator]): + Input Validator + lagged_value (Optional[List[int]]) + We could consider past targets as additional features for the current timestep. This item indicates the number + timesteps in advanced that we want to apply the targets as our current features + n_prediction_steps (int): + The number of steps you want to forecast into the future (forecast horizon) + dataset_name (Optional[str]): + dataset name + normalize_y(bool): + if targets are normalized within each series """ datasets: List[TimeSeriesSequence] @@ -821,6 +824,7 @@ def transform_data_into_time_series_sequence(self, """ Transform the raw data into a list of TimeSeriesSequence that can be processed by AutoPyTorch Time Series build a series time sequence datasets + Args: X: Optional[Union[np.ndarray, List[Union[pd.DataFrame, np.ndarray]]]] features, if is_test_set is True, then its length of @@ -834,7 +838,8 @@ def transform_data_into_time_series_sequence(self, Y_test: np.ndarray (N_all_test, N_target) flattened test target array with size N_all (the sum of all the series sequences) and number of targets is_test_set: Optional[List[pd.DatetimeIndex]] - if the genereated sequecne used for test + if the generated sequence used for test + Returns: sequence_datasets : List[TimeSeriesSequence] a list of datasets @@ -891,6 +896,7 @@ def make_sequences_datasets(X: Optional[pd.DataFrame], ]: """ build a series time sequence datasets + Args: X: pd.DataFrame (N_all, N_feature) flattened train feature DataFrame with size N_all (the sum of all the series sequences) and N_feature, @@ -910,6 +916,7 @@ def make_sequences_datasets(X: Optional[pd.DataFrame], if the generated sequence used for test sequences_kwargs: Dict additional arguments for test sets + Returns: sequence_datasets : List[TimeSeriesSequence] a list of datasets @@ -1094,7 +1101,10 @@ def get_split_strategy(sequence_lengths: List[int], Tuple[ResamplingStrategies, Optional[Dict[str, Any]]]: """ Determines the most possible sampling strategy for the datasets: the lengths of each sequence might not be long - enough to support cross-validation split, thus we need to carefully compute the number of folds + enough to support cross-validation split, thus we need to carefully compute the number of folds. Additionally, + each fold might contain multiple forecasting instances (each with length n_prediction_steps and there is no + overlapping between the test instances). 
This value is considered as 'n_repeats' + Args: sequence_lengths (List[int]): lengths of each sequence @@ -1106,6 +1116,7 @@ def get_split_strategy(sequence_lengths: List[int], resampling strategy to be checked resampling_strategy_args (Optional[Dict[str, Any]]): resampling strategy arguments to be checked + Returns: resampling_strategy(ResamplingStrategies): resampling strategy @@ -1120,6 +1131,7 @@ def get_split_strategy(sequence_lengths: List[int], if resampling_strategy_args is not None: num_splits = resampling_strategy_args.get('num_splits', num_splits) + # Check if all the series can be properly split, if not, we reduce the number of split if resampling_strategy != CrossValTypes.time_series_ts_cross_validation: while minimal_seq_length - n_prediction_steps * num_splits <= 0: num_splits -= 1 @@ -1154,17 +1166,19 @@ def get_split_strategy(sequence_lengths: List[int], if resampling_strategy_args is not None and "n_repeats" in resampling_strategy_args: n_repeats = resampling_strategy_args["n_repeats"] else: - n_repeats = None - if (num_seqs < 100 and minimal_seq_length > 10 * n_prediction_steps) or \ - minimal_seq_length > 50 * n_prediction_steps: - if n_repeats is None: + # we want to keep the amount of forecasting instances large enough to generalize well or make full use of + # the information from the training set + # if there are not enough series in the dataset or the minimal length of the sequence is large enough + # to support multiple predictions + if (num_seqs < 100 and minimal_seq_length > 10 * n_prediction_steps) or \ + minimal_seq_length > 50 * n_prediction_steps: if num_seqs < 100: n_repeats = int(np.ceil(100.0 / num_seqs)) else: n_repeats = int(np.round(minimal_seq_length / (50 * n_prediction_steps))) + else: + n_repeats = 1 - if n_repeats is None: - n_repeats = 1 if resampling_strategy == CrossValTypes.time_series_cross_validation: n_repeats = min(n_repeats, minimal_seq_length // (5 * n_prediction_steps * num_splits)) elif resampling_strategy == CrossValTypes.time_series_ts_cross_validation: @@ -1182,9 +1196,6 @@ def get_split_strategy(sequence_lengths: List[int], n_repeats = max(n_repeats, 1) - if n_repeats is None: - n_repeats = 1 - if resampling_strategy_args is None: resampling_strategy_args = {'n_repeats': n_repeats} else: @@ -1201,6 +1212,7 @@ def create_cross_val_splits( This function creates the cross validation split for the given task. It is done once per dataset to have comparable results among pipelines + Args: cross_val_type (CrossValTypes): cross validation type @@ -1261,6 +1273,7 @@ def create_holdout_val_split( This function creates the holdout split for the given task. 
It is done once per dataset to have comparable results among pipelines
+
     Args:
         holdout_val_type (HoldoutValTypes):
             holdout type

From f701df3a690a29a62efcc6cae475538af440b8e9 Mon Sep 17 00:00:00 2001
From: dengdifan
Date: Wed, 22 Jun 2022 12:08:33 +0200
Subject: [PATCH 346/347] maint warning message in
 time_series_forecasting_train_evaluator

---
 .../evaluation/time_series_forecasting_train_evaluator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py
index cd3e074b3..729399321 100644
--- a/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py
+++ b/autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py
@@ -431,7 +431,7 @@ def _predict(self, pipeline: BaseEstimator,
             opt_pred = opt_pred.reshape(-1, self.num_targets)

         if self.y_valid is not None:
-            warnings.warn('valid_pred is current unsuported for fore casting tasks!')
+            warnings.warn('valid_pred is currently unsupported for forecasting tasks!')
             valid_pred = None

         if self.y_test is not None and self.eval_test_tensors:

From 5e970f645108c47f2f73a322d284c550bae788f5 Mon Sep 17 00:00:00 2001
From: dengdifan
Date: Wed, 22 Jun 2022 13:58:11 +0200
Subject: [PATCH 347/347] fix lines that are overlength

---
 autoPyTorch/datasets/time_series_dataset.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py
index 9cea28562..fe5ddd234 100644
--- a/autoPyTorch/datasets/time_series_dataset.py
+++ b/autoPyTorch/datasets/time_series_dataset.py
@@ -465,10 +465,11 @@ class TimeSeriesForecastingDataset(BaseDataset, ConcatDataset):
         future targets. It is a collection of series that has the same data of series as Y. It is designed to be at
         the tail of Y after the timestamps that need to be predicted.
     start_times (Optional[List[pd.DatetimeIndex]]):
-        starting time of each series when they are sampled. If it is not given, we simply start with a fixed timestamp.
+        starting time of each series when they are sampled. If it is not given, we simply start with a fixed
+        timestamp.
     series_idx (Optional[Union[List[Union[str, int]], str, int]]):
-        (only works if X is stored as pd.DataFrame). This value is applied to identify towhich series the data belongs
-        if the data is presented as a "chunk" dataframe
+        (only works if X is stored as pd.DataFrame). This value is applied to identify to which series the data
+        belongs if the data is presented as a "chunk" dataframe
     known_future_features (Optional[Union[Tuple[Union[str, int]], Tuple[()]]]):
         future features that are known in advance. For instance, holidays.
     time_feature_transform (Optional[List[TimeFeature]]):
@@ -477,8 +478,8 @@ class TimeSeriesForecastingDataset(BaseDataset, ConcatDataset):
     freq (Optional[Union[str, int, List[int]]]):
         the frequency that the data is sampled. It needs to keep consistent within one dataset
     resampling_strategy (Optional[ResamplingStrategies])
-        resampling strategy. We designed several special resampling resampling_strategy for forecasting tasks. Please
-        refer to autoPyTorch.datasets.resampling_strategy
+        resampling strategy. We designed several special resampling resampling_strategy for forecasting tasks.
+        Please refer to autoPyTorch.datasets.resampling_strategy
     resampling_strategy_args (Optional[Dict[str, Any]]):
         arguments passed to resampling_strategy
     seed (int):
@@ -490,8 +491,8 @@ class TimeSeriesForecastingDataset(BaseDataset, ConcatDataset):
     validator (Optional[TimeSeriesForecastingInputValidator]):
         Input Validator
     lagged_value (Optional[List[int]])
-        We could consider past targets as additional features for the current timestep. This item indicates the number
-        timesteps in advanced that we want to apply the targets as our current features
+        We could consider past targets as additional features for the current timestep. This item indicates the
+        number of timesteps in advance that we want to apply the targets as our current features
     n_prediction_steps (int):
         The number of steps you want to forecast into the future (forecast horizon)
     dataset_name (Optional[str]):