diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py
index 87aa2be317..0487594dc8 100644
--- a/autosklearn/estimators.py
+++ b/autosklearn/estimators.py
@@ -76,7 +76,7 @@ def __init__(
ensemble_size : int, optional (default=50)
Number of models added to the ensemble built by *Ensemble
selection from libraries of models*. Models are drawn with
- replacement.
+ replacement. If set to ``0``, no ensemble is fit.
ensemble_nbest : int, optional (default=50)
Only consider the ``ensemble_nbest`` models when building an
@@ -96,10 +96,14 @@ def __init__(
memory_limit : int, optional (3072)
Memory limit in MB for the machine learning algorithm.
`auto-sklearn` will stop fitting the machine learning algorithm if
- it tries to allocate more than `memory_limit` MB.
- If None is provided, no memory limit is set.
- In case of multi-processing, `memory_limit` will be per job.
- This memory limit also applies to the ensemble creation process.
+ it tries to allocate more than ``memory_limit`` MB.
+
+ **Important notes:**
+
+ * If ``None`` is provided, no memory limit is set.
+ * In case of multi-processing, ``memory_limit`` will be *per job*, so the total usage is
+ ``n_jobs x memory_limit``.
+ * The memory limit also applies to the ensemble creation process.
include : dict, optional (None)
If None, all possible algorithms are used. Otherwise specifies
@@ -145,10 +149,10 @@ def __init__(
* 'cv-iterative-fit': {'folds': int}
* 'partial-cv': {'folds': int, 'shuffle': bool}
* BaseCrossValidator or _RepeatedSplits or BaseShuffleSplit object: all arguments
- required by chosen class as specified in scikit-learn documentation.
- If arguments are not provided, scikit-learn defaults are used.
- If no defaults are available, an exception is raised.
- Refer to the 'n_splits' argument as 'folds'.
+ required by chosen class as specified in scikit-learn documentation.
+ If arguments are not provided, scikit-learn defaults are used.
+ If no defaults are available, an exception is raised.
+ Refer to the 'n_splits' argument as 'folds'.
tmp_folder : string, optional (None)
folder to store configuration output and log files, if ``None``
@@ -160,13 +164,15 @@ def __init__(
n_jobs : int, optional, experimental
The number of jobs to run in parallel for ``fit()``. ``-1`` means
- using all processors. By default, Auto-sklearn uses a single core
- for fitting the machine learning model and a single core for fitting
- an ensemble. Ensemble building is not affected by ``n_jobs`` but
- can be controlled by the number of models in the ensemble. In
- contrast to most scikit-learn models, ``n_jobs`` given in the
- constructor is not applied to the ``predict()`` method. If
- ``dask_client`` is None, a new dask client is created.
+ using all processors.
+
+ **Important notes:**
+
+ * By default, Auto-sklearn uses one core.
+ * Ensemble building is not affected by ``n_jobs`` but can be controlled by the number
+ of models in the ensemble.
+ * ``predict()`` is not affected by ``n_jobs`` (in contrast to most scikit-learn models).
+ * If ``dask_client`` is ``None``, a new dask client is created.
dask_client : dask.distributed.Client, optional
User-created dask client, can be used to start a dask cluster and then
@@ -182,7 +188,7 @@ def __init__(
* ``'y_optimization'`` : do not save the predictions for the
optimization/validation set, which would later on be used to build
an ensemble.
* ``'model'`` : do not save any model files
smac_scenario_args : dict, optional (None)
Additional arguments inserted into the scenario of SMAC. See the
@@ -559,7 +565,7 @@ def leaderboard(
Gives an overview of all models trained during the search process along
with various statistics about their training.
- The availble statistics are:
+ The available statistics are:
**Simple**:
diff --git a/autosklearn/experimental/askl2.py b/autosklearn/experimental/askl2.py
index 7cbeebc9d0..c01282fc47 100644
--- a/autosklearn/experimental/askl2.py
+++ b/autosklearn/experimental/askl2.py
@@ -218,7 +218,7 @@ def __init__(
ensemble_size : int, optional (default=50)
Number of models added to the ensemble built by *Ensemble
selection from libraries of models*. Models are drawn with
- replacement.
+ replacement. If set to ``0``, no ensemble is fit.
ensemble_nbest : int, optional (default=50)
Only consider the ``ensemble_nbest`` models when building an
@@ -238,10 +238,14 @@ def __init__(
memory_limit : int, optional (3072)
Memory limit in MB for the machine learning algorithm.
`auto-sklearn` will stop fitting the machine learning algorithm if
- it tries to allocate more than `memory_limit` MB.
- If None is provided, no memory limit is set.
- In case of multi-processing, `memory_limit` will be per job.
- This memory limit also applies to the ensemble creation process.
+ it tries to allocate more than ``memory_limit`` MB.
+
+ **Important notes:**
+
+ * If ``None`` is provided, no memory limit is set.
+ * In case of multi-processing, ``memory_limit`` will be *per job*, so the total usage is
+ ``n_jobs x memory_limit``.
+ * The memory limit also applies to the ensemble creation process.
tmp_folder : string, optional (None)
folder to store configuration output and log files, if ``None``
@@ -253,13 +257,15 @@ def __init__(
n_jobs : int, optional, experimental
The number of jobs to run in parallel for ``fit()``. ``-1`` means
- using all processors. By default, Auto-sklearn uses a single core
- for fitting the machine learning model and a single core for fitting
- an ensemble. Ensemble building is not affected by ``n_jobs`` but
- can be controlled by the number of models in the ensemble. In
- contrast to most scikit-learn models, ``n_jobs`` given in the
- constructor is not applied to the ``predict()`` method. If
- ``dask_client`` is None, a new dask client is created.
+ using all processors.
+
+ **Important notes:**
+
+ * By default, Auto-sklearn uses one core.
+ * Ensemble building is not affected by ``n_jobs`` but can be controlled by the number
+ of models in the ensemble.
+ * ``predict()`` is not affected by ``n_jobs`` (in contrast to most scikit-learn models).
+ * If ``dask_client`` is ``None``, a new dask client is created.
dask_client : dask.distributed.Client, optional
User-created dask client, can be used to start a dask cluster and then
@@ -275,7 +281,7 @@ def __init__(
* ``'y_optimization'`` : do not save the predictions for the
optimization/validation set, which would later on be used to build
an ensemble.
* ``'model'`` : do not save any model files
smac_scenario_args : dict, optional (None)
Additional arguments inserted into the scenario of SMAC. See the
diff --git a/doc/conf.py b/doc/conf.py
index a63ca70ce3..a7bfae8cd9 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -43,6 +43,7 @@
'sphinx.ext.doctest', 'sphinx.ext.coverage',
'sphinx.ext.mathjax', 'sphinx.ext.viewcode',
'sphinx_gallery.gen_gallery', 'sphinx.ext.autosectionlabel',
+ 'sphinx_toolbox.collapse',
# sphinx.ext.autosexctionlabel raises duplicate label warnings
# because same section headers are used multiple times throughout
# the documentation.
@@ -180,7 +181,7 @@
('Start', 'index'),
('Releases', 'releases'),
('Installation', 'installation'),
- ('Manual', 'manual'),
+ #('Manual', 'manual'),
('Examples', 'examples/index'),
('API', 'api'),
('Extending', 'extending'),
diff --git a/doc/faq.rst b/doc/faq.rst
index d562eadc06..439e5c9be3 100644
--- a/doc/faq.rst
+++ b/doc/faq.rst
@@ -6,267 +6,518 @@
FAQ
===
-Issues
-======
+General
+=======
-Auto-sklearn is extremely memory hungry in a sequential setting
----------------------------------------------------------------
+.. collapse:: Where can I find examples on how to use auto-sklearn?
-Auto-sklearn can appear very memory hungry (i.e. requiring a lot of memory for small datasets) due
-to the use of ``fork`` for creating new processes when running in sequential manner (if this
-happens in a parallel setting or if you pass your own dask client this is due to a different
-issue, see the other issues below).
+ We provide examples on using *auto-sklearn* for multiple use cases ranging from
+ simple classification to advanced uses such as feature importance, parallel runs
+ and customization. They can be found in the :ref:`sphx_glr_examples`.
-Let's go into some more detail and discuss how to fix it:
-Auto-sklearn executes each machine learning algorithm in its own process to be able to apply a
-memory limit and a time limit. To start such a process, Python gives three options: ``fork``,
-``forkserver`` and ``spawn``. The default ``fork`` copies the whole process memory into the
-subprocess. If the main process already uses 1.5GB of main memory and we apply a 3GB memory
-limit to Auto-sklearn, executing a machine learning pipeline is limited to use at most 1.5GB.
-We would have loved to use ``forkserver`` or ``spawn`` as the default option instead, which both
-copy only relevant data into the subprocess and thereby alleaviate the issue of eating up a lot
-of your main memory
-(and also do not suffer from potential deadlocks as ``fork`` does, see
-`here `_),
-but they have the downside that code must be guarded by ``if __name__ == "__main__"`` or executed
-in a notebook, and we decided that we do not want to require this by default.
+.. collapse:: What type of tasks can auto-sklearn tackle?
-There are now two possible solutions:
+ *auto-sklearn* can accept targets for the following tasks (more details on `Sklearn algorithms `_):
-1. Use Auto-sklearn in parallel: if you use Auto-sklean in parallel, it defaults to ``forkserver``
- as the parallelization mechanism itself requires Auto-sklearn the code to be guarded. Please
- find more information on how to do this in the following two examples:
+ * Binary Classification
+ * Multiclass Classification
+ * Multilabel Classification
+ * Regression
+ * Multioutput Regression
- 1. :ref:`sphx_glr_examples_60_search_example_parallel_n_jobs.py`
- 2. :ref:`sphx_glr_examples_60_search_example_parallel_manual_spawning_cli.py`
+ You can provide feature and target training pairs (X_train/y_train) to *auto-sklearn* to fit an
+ ensemble of pipelines as described in the next section. This X_train/y_train dataset must belong
+ to one of the supported formats: np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix and python lists.
+ Optionally, you can measure the ability of this fitted model to generalize to unseen data by
+ providing a testing pair (X_test/y_test). For further details, please refer to the
+ Example :ref:`sphx_glr_examples_40_advanced_example_pandas_train_test.py`.
+ Supported formats for these training and testing pairs are: np.ndarray,
+ pd.DataFrame, scipy.sparse.csr_matrix and python lists.
- .. note::
+ If your data contains categorical values (in the features or targets), autosklearn will automatically encode your
+ data using a `sklearn.preprocessing.LabelEncoder `_
+ for unidimensional data and a `sklearn.preprocessing.OrdinalEncoder `_
+ for multidimensional data.
- This requires all code to be guarded by ``if __name__ == "__main__"``.
+ Regarding the features, there are two methods to guide *auto-sklearn* to properly encode categorical columns:
-2. Pass a `dask client `_. If the user passes
- a dask client, Auto-sklearn can no longer assume that it runs in sequential mode and will use
- a ``forkserver`` to start new processes.
+ * Providing a X_train/X_test numpy array with the optional flag feat_type. For further details, you
+ can check the Example :ref:`sphx_glr_examples_40_advanced_example_feature_types.py`.
+ * You can provide a pandas DataFrame, with properly formatted columns. If a column has numerical
+ dtype, *auto-sklearn* will not encode it and it will be passed directly to scikit-learn. If the
+ column has a categorical/boolean dtype, it will be encoded. If the column is of any other type
+ (Object or Timeseries), an error will be raised. For further details on how to properly encode
+ your data, you can check the Pandas Example
+ `Working with categorical data `_.
+ If you are working with time series, it is recommended that you follow this approach
+ `Working with time data `_.
- .. note::
+ Regarding the targets (y_train/y_test), if the task involves a classification problem, they will be
+ automatically encoded. It is recommended to provide both y_train and y_test during fit, so that a common encoding
+ is created between these splits (if only y_train is provided during fit, the categorical encoder will not be able
+ to handle new classes that are exclusive to y_test). If the task is regression, no encoding happens on the
+ targets.
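+
+ As a minimal sketch (the data and column names are made up for illustration), a
+ DataFrame with a properly typed categorical column could be passed like this:
+
+ .. code:: python
+
+     import pandas as pd
+     import autosklearn.classification
+
+     # "city" uses the pandas ``category`` dtype, so auto-sklearn will encode it;
+     # "age" has a numerical dtype and is passed to scikit-learn as-is.
+     X_train = pd.DataFrame({
+         "age": [20, 35, 52, 47],
+         "city": pd.Series(["NYC", "Berlin", "NYC", "Tokyo"], dtype="category"),
+     })
+     y_train = pd.Series([0, 1, 0, 1])
+
+     automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=60)
+     automl.fit(X_train, y_train)
+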
- This requires all code to be guarded by ``if __name__ == "__main__"``.
+.. collapse:: Where can I find slides and notebooks from talks and tutorials?
-We therefore suggest using one of the above settings by default.
+ We provide resources for talks, tutorials and presentations on *auto-sklearn* under `auto-sklearn-talks `_
-Auto-sklearn is extremely memory hungry in a parallel setting
--------------------------------------------------------------
+.. collapse:: How should I cite auto-sklearn in a scientific publication?
-When running Auto-sklearn in a parallel setting it starts new processes for evaluating machine
-learning models using the ``forkserver`` mechanism. Code that is in the main script and that is
-not guarded by ``if __name__ == "__main__"`` will be executed for each subprocess. If, for example,
-you are loading your dataset outside of the guarded code, your dataset will be loaded for each
-evaluation of a machine learning algorithm and thus blocking your RAM.
+ If you've used auto-sklearn in scientific publications, we would appreciate citations.
-We therefore suggest moving all code inside functions or the main block.
+ .. code-block::
-Auto-sklearn crashes with a segmentation fault
-----------------------------------------------
+ @inproceedings{feurer-neurips15a,
+ title = {Efficient and Robust Automated Machine Learning},
+ author = {Feurer, Matthias and Klein, Aaron and Eggensperger, Katharina and Springenberg, Jost and Blum, Manuel and Hutter, Frank},
+ booktitle = {Advances in Neural Information Processing Systems 28 (2015)},
+ pages = {2962--2970},
+ year = {2015}
+ }
-Please make sure that you have read and followed the :ref:`installation` section! In case
-everything is set up correctly, this is most likely due to the dependency
-`pyrfr `_ not being compiled correctly. If this is the
-case please execute:
+ Or this, if you've used auto-sklearn 2.0 in your work:
-.. code:: python
+ .. code-block::
- import pyrfr.regression as reg
- data = reg.default_data_container(64)
+ @article{feurer-arxiv20a,
+ title = {Auto-Sklearn 2.0: Hands-free AutoML via Meta-Learning},
+ author = {Feurer, Matthias and Eggensperger, Katharina and Falkner, Stefan and Lindauer, Marius and Hutter, Frank},
+ journal = {arXiv:2007.04074 [cs.LG]},
+ year = {2020}
+ }
-If this fails, the pyrfr dependency is most likely not compiled correctly. We advice you to do the
-following:
+.. collapse:: I want to contribute. What can I do?
-1. Check if you can use a pre-compiled version of the pyrfr to avoid compiling it yourself. We
- provide pre-compiled versions of the pyrfr on `pypi `_.
-2. Check if the dependencies specified under :ref:`installation` are correctly installed,
- especially that you have ``swig`` and a ``C++`` compiler.
-3. If you are not yet using Conda, consider using it; it simplifies installation of the correct
- dependencies.
-4. Install correct build dependencies before installing the pyrfr, you can check the following
- github issues for suggestions: `1025 `_,
- `856 `_
+ This sounds great. Please have a look at our `contribution guide `_.
-Log files and output
-====================
+.. collapse:: I have a question which is not answered here. What should I do?
-Where does Auto-sklearn output files by default?
-------------------------------------------------
+ We regularly update this section with questions from our issue tracker. If your question is not
+ answered here, please ask it on the
+ `issue tracker `_.
-*Auto-sklearn* heavily uses the hard drive to store temporary data, models and log files which can
-be used to inspect the behavior of Auto-sklearn. Each run of Auto-sklearn requires
-its own directory. If not provided by the user, *Auto-sklearn* requests a temporary directory from
-Python, which by default is located under ``/tmp`` and starts with ``autosklearn_tmp_`` followed
-by a random string. By default, this directory is deleted when the *Auto-sklearn* object is
-destroyed. If you want to keep these files you can pass the argument
-``delete_tmp_folder_after_terminate=True`` to the *Auto-sklearn* object.
+Resource Management
+===================
-The :class:`autosklearn.classification.AutoSklearnClassifier` and all other *auto-sklearn*
-estimators accept the argument ``tmp_directory`` which change where such output is written to.
+.. collapse:: How should I set the time and memory limits?
-There's an additional argument ``output_directory`` which can be passed to *Auto-sklearn* and it
-controls where test predictions of the ensemble are stored if the test set is passed to ``fit()``.
+ While *auto-sklearn* alleviates manual hyperparameter tuning, the user still
+ has to set memory and time limits. For most datasets a memory limit of 3GB or
+ 6GB as found on most modern computers is sufficient. For the time limits it
+ is harder to give clear guidelines. If possible, a good default is a total
+ time limit of one day, and a time limit of 30 minutes for a single run.
-Auto-sklearn eats up all my disk space
---------------------------------------
+ Further guidelines can be found in
+ `auto-sklearn/issues/142 `_.
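+
+ A minimal sketch of passing such limits to the estimator (the numbers below are
+ just the guideline values from above):
+
+ .. code:: python
+
+     import autosklearn.classification
+
+     automl = autosklearn.classification.AutoSklearnClassifier(
+         time_left_for_this_task=24 * 60 * 60,  # total budget of one day, in seconds
+         per_run_time_limit=30 * 60,            # 30 minutes for a single run, in seconds
+         memory_limit=3072,                     # memory limit per job, in MB
+     )
+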
-*Auto-sklearn* heavily uses the hard drive to store temporary data, models and log files which can
-be used to inspect the behavior of Auto-sklearn. By default, *Auto-sklearn* stores 50
-models and their predictions on the validation data (which is a subset of the training data in
-case of holdout and the full training data in case of cross-validation) on the hard drive.
-Redundant models and their predictions (i.e. when we have more than 50 models) are removed
-everytime the ensemble builder finishes an iteration, which means that the number of models stored
-on disk can temporarily be higher if a model is output while the ensemble builder is running.
+.. collapse:: How many CPU cores does auto-sklearn use by default?
-One can therefore change the number of models that will be stored on disk by passing an integer
-for the argument ``max_models_on_disc`` to *Auto-sklearn*, for example reduce the number of models
-stored on disk if you have space issues.
+ By default, *auto-sklearn* uses **one core**. See also :ref:`parallel` on how to configure this.
-As the number of models is only an indicator of the disk space used it is also possible to pass
-the memory in MB the models are allowed to use as a ``float`` (also via the ``max_models_on_disc``
-arguments). As above, this is rather a guideline on how much memory is used as redundant models
-are only removed from disk when the ensemble builder finishes an iteration.
+.. collapse:: How can I run auto-sklearn in parallel?
-.. note::
+ *auto-sklearn* supports parallel Bayesian optimization via
+ `Dask.distributed `_. By providing the argument ``n_jobs`` to the
+ estimator constructor, one can control the number of cores available to *auto-sklearn*
+ (as shown in the Example :ref:`sphx_glr_examples_60_search_example_parallel_n_jobs.py`).
+ Distributed processes are also supported by providing a custom client object to *auto-sklearn*, as
+ in the Example :ref:`sphx_glr_examples_60_search_example_parallel_manual_spawning_cli.py`. When
+ multiple cores are available, *auto-sklearn* will create a worker per core, and use the available
+ workers to both search for better machine learning models and to build an ensemble with them
+ until the time resource is exhausted.
- Especially when running in parallel it can happen that multiple models are constructed during
- one run of the ensemble builder and thus *Auto-sklearn* can exceed the given limit.
+ **Note:** *auto-sklearn* requires all workers to have access to a shared file system for storing training data and models.
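+
+ For illustration, a minimal sketch of the ``n_jobs`` variant (the time budget is
+ arbitrary and ``X_train``/``y_train`` are assumed to exist):
+
+ .. code:: python
+
+     import autosklearn.classification
+
+     # Four worker processes share the model search and the ensemble building.
+     automl = autosklearn.classification.AutoSklearnClassifier(
+         time_left_for_this_task=600,
+         n_jobs=4,
+     )
+     automl.fit(X_train, y_train)
+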
-.. note::
+ *auto-sklearn* employs `threadpoolctl `_ to control the
+ number of threads used by scientific libraries like numpy or scikit-learn. This is done
+ exclusively while building models, not during inference. In particular, *auto-sklearn* allows each
+ pipeline to use at most 1 thread during training. At prediction and scoring time this limitation
+ is not enforced by *auto-sklearn*. You can control the number of resources employed by the
+ pipelines by setting the following variables in your environment, prior to running *auto-sklearn*:
- These limits do only apply to models and their predictions, but not to other files stored in
- the temporary directory such as the log files.
+ .. code-block:: shell-session
-Available machine learning models
-=================================
+ $ export OPENBLAS_NUM_THREADS=1
+ $ export MKL_NUM_THREADS=1
+ $ export OMP_NUM_THREADS=1
-Will non-scikit-learn models be added to Auto-sklearn?
-------------------------------------------------------
+ For further information about how scikit-learn handles multiprocessing, please check the `Parallelism, resource management, and configuration `_ documentation from the library.
-The short answer: no.
-The long answer answer is a bit more nuanced: maintaining Auto-sklearn requires a lot of time and
-effort, which would grow even larger when depending on more libraries. Also, adding more
-libraries would require us to generate meta-data more often. Lastly, having more choices does not
-guarantee a better performance for most users as having more choices demands a longer search for
-good models and can lead to more overfitting.
+.. collapse:: Auto-sklearn is extremely memory hungry in a sequential setting
-Nevertheless, everyone can still add their favorite model to Auto-sklearn's search space by
-following the `examples on how to extend Auto-sklearn
-`_.
+ Auto-sklearn can appear very memory hungry (i.e. requiring a lot of memory for small datasets) due
+ to the use of ``fork`` for creating new processes when running in sequential manner (if this
+ happens in a parallel setting or if you pass your own dask client this is due to a different
+ issue, see the other issues below).
-If there is interest in creating a Auto-sklearn-contrib repository with 3rd-party models please
-open an issue for that.
+ Let's go into some more detail and discuss how to fix it:
+ Auto-sklearn executes each machine learning algorithm in its own process to be able to apply a
+ memory limit and a time limit. To start such a process, Python gives three options: ``fork``,
+ ``forkserver`` and ``spawn``. The default ``fork`` copies the whole process memory into the
+ subprocess. If the main process already uses 1.5GB of main memory and we apply a 3GB memory
+ limit to Auto-sklearn, executing a machine learning pipeline is limited to use at most 1.5GB.
+ We would have loved to use ``forkserver`` or ``spawn`` as the default option instead, which both
+ copy only relevant data into the subprocess and thereby alleviate the issue of eating up a lot
+ of your main memory
+ (and also do not suffer from potential deadlocks as ``fork`` does, see
+ `here `_),
+ but they have the downside that code must be guarded by ``if __name__ == "__main__"`` or executed
+ in a notebook, and we decided that we do not want to require this by default.
-Can the preprocessing be disabled
----------------------------------
+ There are now two possible solutions:
-Feature preprocessing can be disabled as discussed in the example
-:ref:`restricting_the_searchspace`. Other preprocessing steps such as one hot encoding, missing
-feature imputation and normalization cannot yet be disabled, but we're working on that.
+ 1. Use Auto-sklearn in parallel: if you use Auto-sklearn in parallel, it defaults to ``forkserver``
+ as the parallelization mechanism itself requires the Auto-sklearn code to be guarded. Please
+ find more information on how to do this in the following two examples:
-Usage
-=====
+ 1. :ref:`sphx_glr_examples_60_search_example_parallel_n_jobs.py`
+ 2. :ref:`sphx_glr_examples_60_search_example_parallel_manual_spawning_cli.py`
+
+ .. note::
+
+ This requires all code to be guarded by ``if __name__ == "__main__"``.
+
+ 2. Pass a `dask client `_. If the user passes
+ a dask client, Auto-sklearn can no longer assume that it runs in sequential mode and will use
+ a ``forkserver`` to start new processes.
+
+ .. note::
+
+ This requires all code to be guarded by ``if __name__ == "__main__"``.
+
+ We therefore suggest using one of the above settings by default.
+
+.. collapse:: Auto-sklearn is extremely memory hungry in a parallel setting
+
+ When running Auto-sklearn in a parallel setting it starts new processes for evaluating machine
+ learning models using the ``forkserver`` mechanism. Code that is in the main script and that is
+ not guarded by ``if __name__ == "__main__"`` will be executed for each subprocess. If, for example,
+ you are loading your dataset outside of the guarded code, your dataset will be loaded for each
+ evaluation of a machine learning algorithm, thus blocking your RAM.
+
+ We therefore suggest moving all code inside functions or the main block.
+
+.. collapse:: Auto-sklearn crashes with a segmentation fault
+
+ Please make sure that you have read and followed the :ref:`installation` section! In case
+ everything is set up correctly, this is most likely due to the dependency
+ `pyrfr `_ not being compiled correctly. If this is the
+ case please execute:
+
+ .. code:: python
+
+ import pyrfr.regression as reg
+ data = reg.default_data_container(64)
+
+ If this fails, the pyrfr dependency is most likely not compiled correctly. We advise you to do the
+ following:
+
+ 1. Check if you can use a pre-compiled version of the pyrfr to avoid compiling it yourself. We
+ provide pre-compiled versions of the pyrfr on `pypi `_.
+ 2. Check if the dependencies specified under :ref:`installation` are correctly installed,
+ especially that you have ``swig`` and a ``C++`` compiler.
+ 3. If you are not yet using Conda, consider using it; it simplifies installation of the correct
+ dependencies.
+ 4. Install the correct build dependencies before installing the pyrfr; you can check the following
+ github issues for suggestions: `1025 `_,
+ `856 `_
+
+Results, Log Files and Output
+=============================
+
+.. collapse:: How can I get an overview of the run statistics?
+
+ ``sprint_statistics()`` is a method that prints the name of the dataset, the metric used, and the best validation score
+ obtained by running *auto-sklearn*. It additionally prints the number of both successful and unsuccessful
+ algorithm runs.
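+
+ For example, assuming ``automl`` is a fitted estimator:
+
+ .. code:: python
+
+     print(automl.sprint_statistics())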
+
+.. collapse:: What was the performance over time?
+
+ ``performance_over_time_`` returns a DataFrame containing the models' performance over time, which
+ can be used for plotting directly (here is an example:
+ :ref:`sphx_glr_examples_40_advanced_example_pandas_train_test.py`).
+
+ .. code:: python
+
+ import matplotlib.pyplot as plt
+
+ automl.performance_over_time_.plot(
+ x='Timestamp',
+ kind='line',
+ legend=True,
+ title='Auto-sklearn accuracy over time',
+ grid=True,
+ )
+ plt.show()
+
+.. collapse:: Which models were evaluated?
+
+ You can see all models evaluated using :meth:`automl.leaderboard(ensemble_only=False) `.
+
+.. collapse:: Which models are in the final ensemble?
+
+ Use either :meth:`automl.leaderboard(ensemble_only=True) ` or ``automl.show_models()``
+
+.. collapse:: Is there more data I can look at?
+
+ ``cv_results_`` returns a dict with keys as column headers and values as columns, that can be imported into
+ a pandas DataFrame, e.g. ``df = pd.DataFrame(automl.cv_results_)``
+
+.. collapse:: Where does Auto-sklearn output files by default?
+
+ *Auto-sklearn* heavily uses the hard drive to store temporary data, models and log files which can
+ be used to inspect the behavior of Auto-sklearn. Each run of Auto-sklearn requires
+ its own directory. If not provided by the user, *Auto-sklearn* requests a temporary directory from
+ Python, which by default is located under ``/tmp`` and starts with ``autosklearn_tmp_`` followed
+ by a random string. By default, this directory is deleted when the *Auto-sklearn* object has
+ finished fitting. If you want to keep these files you can pass the argument
+ ``delete_tmp_folder_after_terminate=False`` to the *Auto-sklearn* object.
+
+ The :class:`autosklearn.classification.AutoSklearnClassifier` and all other *auto-sklearn*
+ estimators accept the argument ``tmp_folder`` which changes where such output is written to.
+
+ There's an additional argument ``output_folder`` which can be passed to *Auto-sklearn* and it
+ controls where test predictions of the ensemble are stored if the test set is passed to ``fit()``.
+
+.. collapse:: Auto-sklearn's logfiles eat up all my disk space. What can I do?
+
+ *Auto-sklearn* heavily uses the hard drive to store temporary data, models and log files which can
+ be used to inspect the behavior of Auto-sklearn. By default, *Auto-sklearn* stores 50
+ models and their predictions on the validation data (which is a subset of the training data in
+ case of holdout and the full training data in case of cross-validation) on the hard drive.
+ Redundant models and their predictions (i.e. when we have more than 50 models) are removed
+ every time the ensemble builder finishes an iteration, which means that the number of models stored
+ on disk can temporarily be higher if a model is output while the ensemble builder is running.
+
+ One can therefore change the number of models that will be stored on disk by passing an integer
+ for the argument ``max_models_on_disc`` to *Auto-sklearn*, for example to reduce the number of
+ models stored on disk if you have space issues.
+
+ As the number of models is only an indicator of the disk space used it is also possible to pass
+ the memory in MB the models are allowed to use as a ``float`` (also via the ``max_models_on_disc``
+ argument). As above, this is rather a guideline on how much memory is used as redundant models
+ are only removed from disk when the ensemble builder finishes an iteration.
+
+ .. note::
+
+ Especially when running in parallel it can happen that multiple models are constructed during
+ one run of the ensemble builder and thus *Auto-sklearn* can exceed the given limit.
+
+ .. note::
+
+ These limits only apply to models and their predictions, but not to other files stored in
+ the temporary directory such as the log files.
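+
+ A sketch of both variants of ``max_models_on_disc`` (an integer count or a float
+ memory budget in MB; the values are arbitrary):
+
+ .. code:: python
+
+     import autosklearn.classification
+
+     # Keep at most 10 models on disk ...
+     automl = autosklearn.classification.AutoSklearnClassifier(max_models_on_disc=10)
+
+     # ... or allow the stored models roughly 2048 MB in total.
+     automl = autosklearn.classification.AutoSklearnClassifier(max_models_on_disc=2048.0)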
+
+The Search Space
+================
-Only use interpretable models
------------------------------
+.. collapse:: How can I restrict the searchspace?
-Auto-sklearn can be restricted to only use interpretable models and preprocessing algorithms.
-Please see the Section :ref:`restricting_the_searchspace` to learn how to restrict the models
-which are searched over or see the Example
-:ref:`sphx_glr_examples_40_advanced_example_interpretable_models.py`.
+ The following shows an example of how to exclude all preprocessing methods and restrict the configuration space to
+ only random forests.
-We don't provide a judgement which of the models are interpretable as this is very much up to the
-specific use case, but would like to note that decision trees and linear models usually most
-interpretable.
+ .. code:: python
-Limiting the number of model evaluations
-----------------------------------------
+ import autosklearn.classification
+ automl = autosklearn.classification.AutoSklearnClassifier(
+ include = {
+ 'classifier': ["random_forest"],
+ 'feature_preprocessor': ["no_preprocessing"]
+ },
+ exclude=None
+ )
+ automl.fit(X_train, y_train)
+ predictions = automl.predict(X_test)
-In certain cases, for example for debugging, it can be helpful to limit the number of
-model evaluations. We do not provide this as an argument in the API as we believe that it
-should NOT be used in practice, but that the user should rather provide time limits.
-An example on how to add the number of models to try as an additional stopping condition
-can be found `in this github issue `_.
-Please note that Auto-sklearn will stop when either the time limit or the number of
-models termination condition is reached.
+ **Note:** The strings used to identify estimators and preprocessors are the filenames without *.py*.
-Ensemble contains only a dummy model
-------------------------------------
+ For a full list please have a look at the source code (in `autosklearn/pipeline/components/`):
-This is a symptom of the problem that all runs started by Auto-sklearn failed. Usually, the issue
-is that the runtime or memory limit were too tight. Please check the output of
-``sprint_statistics`` to see the distribution of why runs failed. If there are mostly crashed
-runs, please check the log file for further details. If there are mostly runs that exceed the
-memory or time limit, please increase the respective limit and rerun the optimization.
+ * `Classifiers `_
+ * `Regressors `_
+ * `Preprocessors `_
-Parallel processing and oversubscription
-----------------------------------------
+ We also provide an example on how to restrict the classifiers to search over
+ :ref:`sphx_glr_examples_40_advanced_example_interpretable_models.py`.
-Auto-sklearn wraps scikit-learn and therefore inherits its parallelism implementation. In short,
-scikit-learn uses two modes of parallelizing computations:
+.. collapse:: How can I turn off data preprocessing?
-1. By using joblib to distribute independent function calls on multiple cores.
-2. By using lower level libraries such as OpenMP and numpy to distribute more fine-grained
- computation.
+ Data preprocessing includes One-Hot encoding of categorical features, imputation
+ of missing values and the normalization of features or samples. These ensure that
+ the data that gets to the sklearn models is well formed and can be used for
+ training models.
-This means that Auto-sklearn can use more resources than expected by the user. For technical
-reasons we can only control the 1st way of parallel execution, but not the 2nd. Thus, the user
-needs to make sure that the lower level parallelization libraries only use as many cores as
-allocated (on a laptop or workstation running a single copy of Auto-sklearn it can be fine to not
-adjust this, but when using a compute cluster it is necessary to align the parallelism setting
-with the number of requested CPUs). This can be done by setting the following environment
-variables: ``MKL_NUM_THREADS``, ``OPENBLAS_NUM_THREADS``, ``BLIS_NUM_THREADS`` and
-``OMP_NUM_THREADS``.
+ While this is necessary in general, if you'd like to disable this step, please
+ refer to this :ref:`example `.
-More details can be found in the `scikit-learn docs `_.
+.. collapse:: How can I turn off feature preprocessing?
+
+ Feature preprocessing is a single transformer which implements for example feature
+ selection or transformation of features into a different space (i.e. PCA).
+
+ This can be turned off by setting
+ ``include={'feature_preprocessor': ["no_preprocessing"]}`` as shown in the example above.
+
+.. collapse:: Will non-scikit-learn models be added to Auto-sklearn?
+
+ The short answer: no.
+
+ The long answer is a bit more nuanced: maintaining Auto-sklearn requires a lot of time and
+ effort, which would grow even larger when depending on more libraries. Also, adding more
+ libraries would require us to generate meta-data more often. Lastly, having more choices does not
+ guarantee a better performance for most users as having more choices demands a longer search for
+ good models and can lead to more overfitting.
+
+ Nevertheless, everyone can still add their favorite model to Auto-sklearn's search space by
+ following the `examples on how to extend Auto-sklearn
+ `_.
+
+ If there is interest in creating an Auto-sklearn-contrib repository with 3rd-party models, please
+ open an issue for that.
+
+.. collapse:: How can I only search for interpretable models?
+
+ Auto-sklearn can be restricted to only use interpretable models and preprocessing algorithms.
+ Please see the Section :ref:`space` to learn how to restrict the models
+ which are searched over or see the Example
+ :ref:`sphx_glr_examples_40_advanced_example_interpretable_models.py`.
+
+ We don't provide a judgement on which models are interpretable, as this is very much up to the
+ specific use case, but would like to note that decision trees and linear models are usually the
+ most interpretable.
+
+Ensembling
+==========
+
+.. collapse:: What can I configure regarding the ensemble building process?
+
+ The following hyperparameters control how the ensemble is constructed:
+
+ * ``ensemble_size`` determines the maximal size of the ensemble. If it is set to zero, no ensemble will be constructed.
+ * ``ensemble_nbest`` allows the user to directly specify the number of models considered for the ensemble. This hyperparameter can be an integer *n*, such that only the best *n* models are used in the final ensemble. If a float between 0.0 and 1.0 is provided, ``ensemble_nbest`` would be interpreted as a fraction suggesting the percentage of models to use in the ensemble building process (namely, if ensemble_nbest is a float, library pruning is implemented as described in `Caruana et al. (2006) `_).
+ * ``max_models_on_disc`` defines the maximum number of models that are kept on the disc, as a mechanism to control the amount of disc space consumed by *auto-sklearn*. Throughout the automl process, different individual models are optimized, and their predictions (and other metadata) is stored on disc. The user can set the upper bound on how many models are acceptable to keep on disc, yet this variable takes priority in the definition of the number of models used by the ensemble builder (that is, the minimum of ``ensemble_size``, ``ensemble_nbest`` and ``max_models_on_disc`` determines the maximal amount of models used in the ensemble). If set to None, this feature is disabled.
+
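+ A minimal sketch combining these options (the values are illustrative only):
+
+ .. code:: python
+
+     import autosklearn.classification
+
+     automl = autosklearn.classification.AutoSklearnClassifier(
+         ensemble_size=50,       # maximal size of the ensemble
+         ensemble_nbest=0.2,     # only consider the best 20% of models
+         max_models_on_disc=50,  # cap on the number of models kept on disk
+     )
+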
+.. collapse:: Which models are in the final ensemble?
+
+ The results obtained from the final ensemble can be printed by calling ``show_models()`` or ``leaderboard()``.
+ The *auto-sklearn* ensemble is composed of scikit-learn models that can be inspected as exemplified
+ in the Example :ref:`sphx_glr_examples_40_advanced_example_get_pipeline_components.py`.
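+
+ For example, assuming ``automl`` has been fitted:
+
+ .. code:: python
+
+     print(automl.leaderboard())
+     print(automl.show_models())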
+
+.. collapse:: Can I also fit an ensemble post-hoc?
+
+ It is possible to build ensembles post-hoc. An example on how to do this (first searching for individual models, and then building an ensemble from them) can be seen in :ref:`sphx_glr_examples_60_search_example_sequential.py`.
+
+Configuring the Search Procedure
+================================
+
+.. collapse:: Can I change the resampling strategy?
+
+ Examples for using holdout and cross-validation can be found in :ref:`example `
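+
+ As a minimal sketch, 5-fold cross-validation could be requested like this
+ (``X_train``/``y_train`` are assumed to exist; with ``cv``, ``refit()`` has to be
+ called before predicting):
+
+ .. code:: python
+
+     import autosklearn.classification
+
+     automl = autosklearn.classification.AutoSklearnClassifier(
+         resampling_strategy="cv",
+         resampling_strategy_arguments={"folds": 5},
+     )
+     automl.fit(X_train, y_train)
+     # Refit the individual models on the whole training set before predicting.
+     automl.refit(X_train, y_train)
+     predictions = automl.predict(X_test)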
+
+.. collapse:: Can I use a custom metric?
+
+ Examples for using a custom metric can be found in :ref:`example `
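+
+ As a minimal sketch (everything below is illustrative, not a built-in metric), a
+ custom error-rate metric could look roughly like this:
+
+ .. code:: python
+
+     import numpy as np
+     import autosklearn.classification
+     import autosklearn.metrics
+
+     def error_rate(y_true, y_pred):
+         # Fraction of misclassified samples; lower is better.
+         return np.mean(y_true != y_pred)
+
+     error_scorer = autosklearn.metrics.make_scorer(
+         name="error_rate",
+         score_func=error_rate,
+         optimum=0,
+         greater_is_better=False,
+     )
+
+     automl = autosklearn.classification.AutoSklearnClassifier(metric=error_scorer)
+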
Meta-Learning
=============
-Which datasets are used for meta-learning?
-------------------------------------------
+.. collapse:: Which datasets are used for meta-learning?
+
+ We updated the list of datasets used for meta-learning several times and this list now differs
+ significantly from the original 140 datasets we used in 2015 when the paper and the package were
+ released. An up-to-date list of `OpenML task IDs `_ can be found
+ on `github `_.
+
+.. collapse:: How can datasets from the meta-data be excluded?
+
+ For *Auto-sklearn 1.0* one can pass the dataset name via the ``fit()`` function. If a dataset
+ with the same name is within the meta-data, that dataset will not be used.
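+
+ For example (the dataset name here is purely illustrative):
+
+ .. code:: python
+
+     automl.fit(X_train, y_train, dataset_name="breast_cancer")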
+
+ For *Auto-sklearn 2.0* it is not possible to do so because of the method used to construct the
+ meta-data.
+
+.. collapse:: Which meta-features are used for meta-learning?
-We updated the list of datasets used for meta-learning several times and this list now differs
-significantly from the original 140 datasets we used in 2015 when the paper and the package were
-released. An up-to-date list of `OpenML task IDs `_ can be found
-on `github `_.
+ We do not have a user guide on meta-features but they are all pretty simple and can be found
+ `in the source code `_.
-How can datasets from the meta-data be excluded?
-------------------------------------------------
+.. collapse:: How is the meta-data generated for Auto-sklearn 1.0?
-For *Auto-sklearn 1.0* one can pass the dataset name via the ``fit()`` function. If a dataset
-with the same name is within the meta-data, that datasets will not be used.
+ We currently generate meta-data the following way. First, for each of the datasets mentioned
+ above, we run Auto-sklearn without meta-learning for a total of two days on multiple metrics (for
+ classification these are accuracy, balanced accuracy, log loss and the area under the curve).
+ Second, for each run we then have a look at each model that improved the score, i.e. the
+ trajectory of the best known model over time, and refit it on the whole training data. Third, for
+ each of these models we then compute all scores we're interested in; these also include other
+ ones such as F1 and precision. Finally, for each combination of dataset and metric we store the
+ best model we know of.
+
+.. collapse:: How is the meta-data generated for Auto-sklearn 2.0?
+
+ Please check `our paper `_ for details.
+
+
+Issues and Debugging
+====================
-For *Auto-sklearn 2.0* it is not possible to do so because of the method used to construct the
-meta-data.
+.. collapse:: How can I limit the number of model evaluations for debugging?
+
+ In certain cases, for example for debugging, it can be helpful to limit the number of
+ model evaluations. We do not provide this as an argument in the API as we believe that it
+ should NOT be used in practice, but that the user should rather provide time limits.
+ An example on how to add the number of models to try as an additional stopping condition
+ can be found `in this github issue `_.
+ Please note that Auto-sklearn will stop when either the time limit or the number of
+ models termination condition is reached.
+
+.. collapse:: Why does the final ensemble contain only a dummy model?
+
+ This is a symptom of the problem that all runs started by Auto-sklearn failed. Usually, the issue
+ is that the runtime or memory limit were too tight. Please check the output of
+ ``sprint_statistics()`` to see the distribution of why runs failed. If there are mostly crashed
+ runs, please check the log file for further details. If there are mostly runs that exceed the
+ memory or time limit, please increase the respective limit and rerun the optimization.
+
+.. collapse:: Why does Auto-sklearn use more resources than I allocated?
+
+ Auto-sklearn wraps scikit-learn and therefore inherits its parallelism implementation. In short,
+ scikit-learn uses two modes of parallelizing computations:
+
+ 1. By using joblib to distribute independent function calls on multiple cores.
+ 2. By using lower level libraries such as OpenMP and numpy to distribute more fine-grained
+ computation.
+
+ This means that Auto-sklearn can use more resources than expected by the user. For technical
+ reasons we can only control the 1st way of parallel execution, but not the 2nd. Thus, the user
+ needs to make sure that the lower level parallelization libraries only use as many cores as
+ allocated (on a laptop or workstation running a single copy of Auto-sklearn it can be fine to not
+ adjust this, but when using a compute cluster it is necessary to align the parallelism setting
+ with the number of requested CPUs). This can be done by setting the following environment
+ variables: ``MKL_NUM_THREADS``, ``OPENBLAS_NUM_THREADS``, ``BLIS_NUM_THREADS`` and
+ ``OMP_NUM_THREADS``.
+
+ More details can be found in the `scikit-learn docs `_.
+
+Other
+=====
-Which meta-features are used for meta-learning?
------------------------------------------------
+.. collapse:: Model persistence
-We do not have a user guide on meta-features but they are all pretty simple and can be found
-`in the source code `_.
+ *auto-sklearn* is mostly a wrapper around scikit-learn. Therefore, it is
+ possible to follow the
+ `persistence Example `_
+ from scikit-learn.
-How is the meta-data generated?
--------------------------------
+.. collapse:: Vanilla auto-sklearn
-Auto-sklearn 1.0
-~~~~~~~~~~~~~~~~
+ In order to obtain *vanilla auto-sklearn* as used in `Efficient and Robust Automated Machine Learning
+ `_
+ set ``ensemble_size=1`` and ``initial_configurations_via_metalearning=0``:
-We currently generate meta-data the following way. First, for each of the datasets mentioned
-above, we run Auto-sklearn without meta-learning for a total of two days on multiple metrics (for
-classification these are accuracy, balanced accuracy, log loss and the area under the curce).
-Second, for each run we then have a look at each models that improved the score, i.e. the
-trajectory of the best known model at a time, and refit it on the whole training data. Third, for
-each of these models we then compute all scores we're interested in, these also include other
-ones such F1 and precision. Finally, for each combination of dataset and metric we store the best
-model we know of.
+ .. code:: python
-Auto-sklearn 2.0
-~~~~~~~~~~~~~~~~
+ import autosklearn.classification
+ automl = autosklearn.classification.AutoSklearnClassifier(
+ ensemble_size=1,
+ initial_configurations_via_metalearning=0
+ )
-Please check `our paper `_ for details.
+ An ensemble of size one will result in always choosing the current best model
+ according to its performance on the validation set. Setting the number of initial
+ configurations found via meta-learning to zero makes *auto-sklearn* use the
+ regular SMAC algorithm for suggesting new hyperparameter configurations.
diff --git a/doc/index.rst b/doc/index.rst
index c82cdb0eae..e0690ac8e7 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -22,7 +22,7 @@ replacement for a scikit-learn estimator:
hyperparameter tuning. It leverages recent advantages in *Bayesian
optimization*, *meta-learning* and *ensemble construction*. Learn more about
the technology behind *auto-sklearn* by reading our paper published at
-`NIPS 2015 `_
+`NeurIPS 2015 `_
.
.. topic:: NEW: Auto-sklearn 2.0
@@ -38,6 +38,11 @@ the technology behind *auto-sklearn* by reading our paper published at
A paper describing our advances is available on `arXiv `_.
+.. topic:: NEW: Material from tutorials and presentations
+
+ We provide slides and notebooks from talks and tutorials here: `auto-sklearn-talks `_
+
+
Example
*******
diff --git a/doc/manual.rst b/doc/manual.rst
index 252626666d..2a3df6528b 100644
--- a/doc/manual.rst
+++ b/doc/manual.rst
@@ -6,232 +6,299 @@
Manual
======
-This manual shows how to use several aspects of auto-sklearn. It either
-references the examples where possible or explains certain configurations.
+This manual gives an overview of different aspects of *auto-sklearn*. For each section, we either reference examples or
+give short explanations (click the title to expand the text), e.g.:
-Examples
-========
+.. collapse:: Code examples
-We provide examples on using *auto-sklearn* for multiple use cases ranging from
-simple classification to advanced uses such as feature importance, parallel runs
-and customization. They can be found in the :ref:`sphx_glr_examples`.
+ We provide examples on using *auto-sklearn* for multiple use cases ranging from
+ simple classification to advanced uses such as feature importance, parallel runs
+ and customization. They can be found in the :ref:`sphx_glr_examples`.
-Time and memory limits
-======================
+.. collapse:: Material from talks and presentations
-A crucial feature of *auto-sklearn* is limiting the resources (memory and
-time) which the scikit-learn algorithms are allowed to use. Especially for
-large datasets, on which algorithms can take several hours and make the
-machine swap, it is important to stop the evaluations after some time in order
-to make progress in a reasonable amount of time. Setting the resource limits
-is therefore a tradeoff between optimization time and the number of models
-that can be tested.
+ We provide resources for talks, tutorials and presentations on *auto-sklearn* under `auto-sklearn-talks `_
-While *auto-sklearn* alleviates manual hyperparameter tuning, the user still
-has to set memory and time limits. For most datasets a memory limit of 3GB or
-6GB as found on most modern computers is sufficient. For the time limits it
-is harder to give clear guidelines. If possible, a good default is a total
-time limit of one day, and a time limit of 30 minutes for a single run.
+.. _limits:
-Further guidelines can be found in
-`auto-sklearn/issues/142 `_.
+Resource limits
+===============
-.. _restricting_the_searchspace:
+A crucial feature of *auto-sklearn* is limiting the resources (memory and time) which the scikit-learn algorithms are
+allowed to use. Especially for large datasets, on which algorithms can take several hours and make the machine swap,
+it is important to stop the evaluations after some time in order to make progress in a reasonable amount of time.
+Setting the resource limits is therefore a tradeoff between optimization time and the number of models that can be
+tested.
-Restricting the searchspace
-===========================
+.. collapse:: Time and memory limits
-Instead of using all available estimators, it is possible to restrict
-*auto-sklearn*'s searchspace. The following shows an example of how to exclude
-all preprocessing methods and restrict the configuration space to only
-random forests.
+ While *auto-sklearn* alleviates manual hyperparameter tuning, the user still
+ has to set memory and time limits. For most datasets a memory limit of 3GB or
+ 6GB as found on most modern computers is sufficient. For the time limits it
+ is harder to give clear guidelines. If possible, a good default is a total
+ time limit of one day, and a time limit of 30 minutes for a single run.
-.. code:: python
+ Further guidelines can be found in
+ `auto-sklearn/issues/142 `_.
+
+.. collapse:: CPU cores
- import autosklearn.classification
- automl = autosklearn.classification.AutoSklearnClassifier(
- include = {
- 'classifier': ["random_forest"],
- 'feature_preprocessor': ["no_preprocessing"]
- },
- exclude=None
- )
- automl.fit(X_train, y_train)
- predictions = automl.predict(X_test)
+ By default, *auto-sklearn* uses **one core**. See also :ref:`parallel` on how to configure this.
-**Note:** The strings used to identify estimators and preprocessors are the filenames without *.py*.
+.. _space:
-For a full list please have a look at the source code (in `autosklearn/pipeline/components/`):
+The search space
+================
- * `Classifiers `_
- * `Regressors `_
- * `Preprocessors `_
+*Auto-sklearn* by default searches a large space to find a well performing configuration. However, it is also possible
+to restrict the searchspace:
-We do also provide an example on how to restrict the classifiers to search over
-:ref:`sphx_glr_examples_40_advanced_example_interpretable_models.py`.
+.. collapse:: Restricting the searchspace
-Data preprocessing
-~~~~~~~~~~~~~~~~~~
-Data preprocessing includes One-Hot encoding of categorical features, imputation
-of missing values and the normalization of features or samples. These ensure that
-the data the gets to the sklearn models is well formed and can be used for
-training models.
+ The following shows an example of how to exclude all preprocessing methods and restrict the configuration space to
+ only random forests.
-While this is necessary in general, if you'd like to disable this step, please
-refer to this :ref:`example `.
+ .. code:: python
-Feature preprocessing
-~~~~~~~~~~~~~~~~~~~~~
-Feature preprocessing is a single transformer which implements for example feature
-selection or transformation of features into a different space (i.e. PCA).
+ import autosklearn.classification
+ automl = autosklearn.classification.AutoSklearnClassifier(
+ include = {
+ 'classifier': ["random_forest"],
+ 'feature_preprocessor': ["no_preprocessing"]
+ },
+ exclude=None
+ )
+ automl.fit(X_train, y_train)
+ predictions = automl.predict(X_test)
-This can be turned off by setting
-``include={'feature_preprocessor'=["no_preprocessing"]}`` as shown in the example above.
+ **Note:** The strings used to identify estimators and preprocessors are the filenames without *.py*.
-Resampling strategies
-=====================
+ For a full list please have a look at the source code (in `autosklearn/pipeline/components/`):
-Examples for using holdout and cross-validation can be found in :ref:`auto-sklearn/examples/ `.
+ * `Classifiers `_
+ * `Regressors `_
+ * `Preprocessors `_
-Supported Inputs
-================
-*auto-sklearn* can accept targets for the following tasks (more details on `Sklearn algorithms `_):
+ We also provide an example on how to restrict the classifiers to search over
+ :ref:`sphx_glr_examples_40_advanced_example_interpretable_models.py`.
+
+.. collapse:: Turn off data preprocessing
+
+ Data preprocessing includes One-Hot encoding of categorical features, imputation
+ of missing values and the normalization of features or samples. These ensure that
+ the data that gets to the sklearn models is well formed and can be used for
+ training models.
+
+ While this is necessary in general, if you'd like to disable this step, please
+ refer to this :ref:`example `.
+
+.. collapse:: Turn off feature preprocessing
+
+ Feature preprocessing is a single transformer which implements for example feature
+ selection or transformation of features into a different space (i.e. PCA).
+
+ This can be turned off by setting
+ ``include={'feature_preprocessor': ["no_preprocessing"]}`` as shown in the example above.
+
+.. _bestmodel:
-* Binary Classification
-* Multiclass Classification
-* Multilabel Classification
-* Regression
-* Multioutput Regression
+Model selection
+===============
-You can provide feature and target training pairs (X_train/y_train) to *auto-sklearn* to fit an
-ensemble of pipelines as described in the next section. This X_train/y_train dataset must belong
-to one of the supported formats: np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix and python lists.
-Optionally, you can measure the ability of this fitted model to generalize to unseen data by
-providing an optional testing pair (X_test/Y_test). For further details, please refer to the
-Example :ref:`sphx_glr_examples_40_advanced_example_pandas_train_test.py`.
-Supported formats for these training and testing pairs are: np.ndarray,
-pd.DataFrame, scipy.sparse.csr_matrix and python lists.
+*Auto-sklearn* implements different strategies to identify the best performing model. For some use cases it might be
+necessary to adapt the resampling strategy or define a custom metric:
-If your data contains categorical values (in the features or targets), autosklearn will automatically encode your data using a `sklearn.preprocessing.LabelEncoder `_ for unidimensional data and a `sklearn.preprocessing.OrdinalEncoder `_ for multidimensional data.
+.. collapse:: Use different resampling strategies
-Regarding the features, there are two methods to guide *auto-sklearn* to properly encode categorical columns:
+ Examples for using holdout and cross-validation can be found in :ref:`example `
-* Providing a X_train/X_test numpy array with the optional flag feat_type. For further details, you
- can check the Example :ref:`sphx_glr_examples_40_advanced_example_feature_types.py`.
-* You can provide a pandas DataFrame, with properly formatted columns. If a column has numerical
- dtype, *auto-sklearn* will not encode it and it will be passed directly to scikit-learn. If the
- column has a categorical/boolean class, it will be encoded. If the column is of any other type
- (Object or Timeseries), an error will be raised. For further details on how to properly encode
- your data, you can check the Pandas Example
- `Working with categorical data `_).
- If you are working with time series, it is recommended that you follow this approach
- `Working with time data `_.
+.. collapse:: Use a custom metric
-Regarding the targets (y_train/y_test), if the task involves a classification problem, such features will be automatically encoded. It is recommended to provide both y_train and y_test during fit, so that a common encoding is created between these splits (if only y_train is provided during fit, the categorical encoder will not be able to handle new classes that are exclusive to y_test). If the task is regression, no encoding happens on the targets.
+ Examples for using a custom metric can be found in :ref:`example `.
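+
+ A minimal sketch wrapping a scikit-learn scoring function with ``autosklearn.metrics.make_scorer``
+ and passing it via the ``metric`` argument (the chosen scoring function is illustrative):
+
+ .. code:: python
+
+     import sklearn.metrics
+     import autosklearn.classification
+     import autosklearn.metrics
+
+     # Wrap a plain scikit-learn scoring function as an auto-sklearn metric
+     balanced_accuracy = autosklearn.metrics.make_scorer(
+         name='balanced_accuracy',
+         score_func=sklearn.metrics.balanced_accuracy_score,
+         greater_is_better=True,
+     )
+     automl = autosklearn.classification.AutoSklearnClassifier(metric=balanced_accuracy)
+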
-Ensemble Building Process
-=========================
+.. _ensembles:
-*auto-sklearn* uses ensemble selection by `Caruana et al. (2004) `_
-to build an ensemble based on the models’ prediction for the validation set. The following hyperparameters control how the ensemble is constructed:
+Ensembling
+==========
-* ``ensemble_size`` determines the maximal size of the ensemble. If it is set to zero, no ensemble will be constructed.
-* ``ensemble_nbest`` allows the user to directly specify the number of models considered for the ensemble. This hyperparameter can be an integer *n*, such that only the best *n* models are used in the final ensemble. If a float between 0.0 and 1.0 is provided, ``ensemble_nbest`` would be interpreted as a fraction suggesting the percentage of models to use in the ensemble building process (namely, if ensemble_nbest is a float, library pruning is implemented as described in `Caruana et al. (2006) `_).
-* ``max_models_on_disc`` defines the maximum number of models that are kept on the disc, as a mechanism to control the amount of disc space consumed by *auto-sklearn*. Throughout the automl process, different individual models are optimized, and their predictions (and other metadata) is stored on disc. The user can set the upper bound on how many models are acceptable to keep on disc, yet this variable takes priority in the definition of the number of models used by the ensemble builder (that is, the minimum of ``ensemble_size``, ``ensemble_nbest`` and ``max_models_on_disc`` determines the maximal amount of models used in the ensemble). If set to None, this feature is disabled.
+To get the best performance out of the evaluated models, *auto-sklearn* uses ensemble selection by `Caruana et al. (2004) `_
+to build an ensemble based on the models’ predictions for the validation set.
-.. _inspecting_the_results:
+.. collapse:: Configure the ensemble building process
+
+ The following hyperparameters control how the ensemble is constructed:
+
+ * ``ensemble_size`` determines the maximal size of the ensemble. If it is set to zero, no ensemble will be constructed.
+ * ``ensemble_nbest`` allows the user to directly specify the number of models considered for the ensemble. This hyperparameter can be an integer *n*, in which case only the best *n* models are used in the final ensemble. If a float between 0.0 and 1.0 is provided, ``ensemble_nbest`` is interpreted as the fraction of models to use in the ensemble building process (in this case, library pruning is implemented as described in `Caruana et al. (2006) `_).
+ * ``max_models_on_disc`` defines the maximum number of models that are kept on the disc, as a mechanism to control the amount of disc space consumed by *auto-sklearn*. Throughout the automl process, different individual models are optimized, and their predictions (and other metadata) are stored on disc. The user can set an upper bound on how many models are acceptable to keep on disc; this variable also bounds the number of models used by the ensemble builder (that is, the minimum of ``ensemble_size``, ``ensemble_nbest`` and ``max_models_on_disc`` determines the maximal number of models used in the ensemble). If set to None, this feature is disabled.
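+
+ A minimal sketch combining these controls (the values are illustrative only):
+
+ .. code:: python
+
+     import autosklearn.classification
+
+     automl = autosklearn.classification.AutoSklearnClassifier(
+         ensemble_size=50,       # maximal number of (repeated) models in the ensemble
+         ensemble_nbest=0.2,     # only consider the best 20% of models
+         max_models_on_disc=50,  # keep at most 50 models on disc
+     )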
+
+.. collapse:: Inspect the final ensemble
+
+ The results obtained from the final ensemble can be printed by calling ``show_models()``.
+ The *auto-sklearn* ensemble is composed of scikit-learn models that can be inspected as exemplified
+ in the Example :ref:`sphx_glr_examples_40_advanced_example_get_pipeline_components.py`.
+
+.. collapse:: Fit ensemble post-hoc
+
+ It is also possible to build the ensemble post-hoc, e.g. to let the search itself run on a single core. An
+ example on how to do this (first searching for individual models, and then building an ensemble from them)
+ can be seen in :ref:`sphx_glr_examples_60_search_example_sequential.py`.
+
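+ A minimal sketch, assuming the search was run with ``ensemble_size=0`` and ``automl`` is the fitted
+ estimator (the ensemble size is illustrative):
+
+ .. code:: python
+
+     # Build an ensemble from the models found during the previous search
+     automl.fit_ensemble(y_train, ensemble_size=50)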
+
+.. _inspect:
Inspecting the results
======================
-*auto-sklearn* allows users to inspect the training results and statistics. The following example shows how different
-statistics can be printed for the inspection.
+*auto-sklearn* allows users to inspect the training results and statistics. Assume we have a fitted estimator:
.. code:: python
- import autosklearn.classification
- automl = autosklearn.classification.AutoSklearnClassifier()
- automl.fit(X_train, y_train)
- automl.cv_results_
- automl.performance_over_time_.plot(
- x='Timestamp',
- kind='line',
- legend=True,
- title='Auto-sklearn accuracy over time',
- grid=True,
- )
- plt.show()
-
- automl.sprint_statistics()
- automl.show_models()
-
-``cv_results_`` returns a dict with keys as column headers and values as columns, that can be imported into a pandas DataFrame.
-``performance_over_time_`` returns a DataFrame containing the models performance over time data, which can be used for plotting directly (Here is an example: :ref:`sphx_glr_examples_40_advanced_example_pandas_train_test.py`).
-``sprint_statistics()`` is a method that prints the name of the dataset, the metric used, and the best validation score
-obtained by running *auto-sklearn*. It additionally prints the number of both successful and unsuccessful
-algorithm runs.
-
-The results obtained from the final ensemble can be printed by calling ``show_models()``.
-*auto-sklearn* ensemble is composed of scikit-learn models that can be inspected as exemplified
-in the Example :ref:`sphx_glr_examples_40_advanced_example_get_pipeline_components.py`.
+ import autosklearn.classification
+ automl = autosklearn.classification.AutoSklearnClassifier()
+ automl.fit(X_train, y_train)
+
+*auto-sklearn* offers the following ways to inspect the results:
+
+.. collapse:: Basic statistics
+
+ ``sprint_statistics()`` is a method that prints the name of the dataset, the metric used, and the best validation score
+ obtained by running *auto-sklearn*. It additionally prints the number of both successful and unsuccessful
+ algorithm runs.
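+
+ For example, using the ``automl`` estimator fitted above:
+
+ .. code:: python
+
+     print(automl.sprint_statistics())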
+
+.. collapse:: Performance over Time
+
+ ``performance_over_time_`` returns a DataFrame containing the models' performance over time, which can
+ be used for plotting directly (see the Example :ref:`sphx_glr_examples_40_advanced_example_pandas_train_test.py`).
+
+ .. code:: python
+
+ automl.performance_over_time_.plot(
+ x='Timestamp',
+ kind='line',
+ legend=True,
+ title='Auto-sklearn accuracy over time',
+ grid=True,
+ )
+ plt.show()
+
+.. collapse:: Evaluated models
+
+ The results obtained from the final ensemble can be printed by calling ``show_models()``.
+
+.. collapse:: Leaderboard
+
+ ``automl.leaderboard()`` shows the ensemble members; check the :meth:`docs ` on how to use the leaderboard to get information on *all* runs.
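+
+ For example (``ensemble_only=False`` is assumed here to be the flag described in the linked docs
+ for listing runs outside the ensemble as well):
+
+ .. code:: python
+
+     print(automl.leaderboard())                     # ensemble members only
+     print(automl.leaderboard(ensemble_only=False))  # all evaluated runs
+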
+
+.. collapse:: Other
+
+ ``cv_results_`` returns a dict with keys as column headers and values as columns, which can be imported into a pandas DataFrame.
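+
+ For example:
+
+ .. code:: python
+
+     import pandas as pd
+
+     # Turn the raw results dict into a DataFrame for easier inspection
+     results = pd.DataFrame(automl.cv_results_)
+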
+
+.. _parallel:
Parallel computation
====================
-In it's default mode, *auto-sklearn* already uses two cores. The first one is
-used for model building, the second for building an ensemble every time a new
-machine learning model has finished training. An example on how to do this sequentially (first searching for individual models, and then building an ensemble from them) can be seen in
-:ref:`sphx_glr_examples_60_search_example_sequential.py`.
+In its default mode, *auto-sklearn* uses **one core** and interleaves ensemble building with evaluating new
+configurations.
-Nevertheless, *auto-sklearn* also supports parallel Bayesian optimization via the use of
-`Dask.distributed `_. By providing the arguments ``n_jobs``
-to the estimator construction, one can control the number of cores available to *auto-sklearn*
-(As shown in the Example :ref:`sphx_glr_examples_60_search_example_parallel_n_jobs.py`).
-Distributed processes are also supported by providing a custom client object to *auto-sklearn* like
-in the Example: :ref:`sphx_glr_examples_60_search_example_parallel_manual_spawning_cli.py`. When
-multiple cores are
-available, *auto-sklearn* will create a worker per core, and use the available workers to both search
-for better machine learning models as well as building an ensemble with them until the time resource
-is exhausted.
+.. collapse:: Parallelization with Dask
-**Note:** *auto-sklearn* requires all workers to have access to a shared file system for storing training data and models.
+ *auto-sklearn* also supports parallel Bayesian optimization via the use of
+ `Dask.distributed `_. By providing the argument ``n_jobs``
+ to the estimator constructor, one can control the number of cores available to *auto-sklearn*
+ (as shown in the Example :ref:`sphx_glr_examples_60_search_example_parallel_n_jobs.py`).
+ Distributed processes are also supported by providing a custom client object to *auto-sklearn*, as
+ in the Example :ref:`sphx_glr_examples_60_search_example_parallel_manual_spawning_cli.py`. When
+ multiple cores are available, *auto-sklearn* will create one worker per core, and use the available
+ workers to both search for better machine learning models and build an ensemble with them until the
+ time resource is exhausted.
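+
+ A minimal sketch (the number of jobs is illustrative):
+
+ .. code:: python
+
+     import autosklearn.classification
+
+     # Run the search on four cores; a dask client is created automatically
+     automl = autosklearn.classification.AutoSklearnClassifier(n_jobs=4)
+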
-*auto-sklearn* employs `threadpoolctl `_ to control the number of threads employed by scientific libraries like numpy or scikit-learn. This is done exclusively during the building procedure of models, not during inference. In particular, *auto-sklearn* allows each pipeline to use at most 1 thread during training. At predicting and scoring time this limitation is not enforced by *auto-sklearn*. You can control the number of resources
-employed by the pipelines by setting the following variables in your environment, prior to running *auto-sklearn*:
+ **Note:** *auto-sklearn* requires all workers to have access to a shared file system for storing training data and models.
-.. code-block:: shell-session
+ *auto-sklearn* employs `threadpoolctl `_ to control the number of threads employed by scientific libraries like numpy or scikit-learn. This is done exclusively during the building procedure of models, not during inference. In particular, *auto-sklearn* allows each pipeline to use at most 1 thread during training. At predicting and scoring time this limitation is not enforced by *auto-sklearn*. You can control the number of resources
+ employed by the pipelines by setting the following variables in your environment, prior to running *auto-sklearn*:
- $ export OPENBLAS_NUM_THREADS=1
- $ export MKL_NUM_THREADS=1
- $ export OMP_NUM_THREADS=1
+ .. code-block:: shell-session
+
+ $ export OPENBLAS_NUM_THREADS=1
+ $ export MKL_NUM_THREADS=1
+ $ export OMP_NUM_THREADS=1
-For further information about how scikit-learn handles multiprocessing, please check the `Parallelism, resource management, and configuration `_ documentation from the library.
-Model persistence
-=================
+ For further information about how scikit-learn handles multiprocessing, please check the `Parallelism, resource management, and configuration `_ documentation from the library.
-*auto-sklearn* is mostly a wrapper around scikit-learn. Therefore, it is
-possible to follow the
-`persistence Example `_
-from scikit-learn.
+.. _othermanual:
-Vanilla auto-sklearn
-====================
+Other
+=====
-In order to obtain *vanilla auto-sklearn* as used in `Efficient and Robust Automated Machine Learning
-`_
-set ``ensemble_size=1`` and ``initial_configurations_via_metalearning=0``:
+.. collapse:: Supported input types
-.. code:: python
+ *auto-sklearn* can accept targets for the following tasks (more details on `Sklearn algorithms `_):
+
+ * Binary Classification
+ * Multiclass Classification
+ * Multilabel Classification
+ * Regression
+ * Multioutput Regression
+
+ You can provide feature and target training pairs (X_train/y_train) to *auto-sklearn* to fit an
+ ensemble of pipelines. Both the training pair (X_train/y_train) and the optional testing pair
+ (X_test/y_test) must belong to one of the supported formats: np.ndarray, pd.DataFrame,
+ scipy.sparse.csr_matrix and python lists. Providing the testing pair lets you measure the ability
+ of the fitted model to generalize to unseen data. For further details, please refer to the
+ Example :ref:`sphx_glr_examples_40_advanced_example_pandas_train_test.py`.
+
+ If your data contains categorical values (in the features or targets), autosklearn will automatically encode your
+ data using a `sklearn.preprocessing.LabelEncoder `_
+ for unidimensional data and a `sklearn.preprocessing.OrdinalEncoder `_
+ for multidimensional data.
+
+ Regarding the features, there are two methods to guide *auto-sklearn* to properly encode categorical columns:
+
+ * Providing X_train/X_test numpy arrays together with the optional argument feat_type, as in the
+   sketch after this list. For further details, you can check the Example
+   :ref:`sphx_glr_examples_40_advanced_example_feature_types.py`.
+ * You can provide a pandas DataFrame with properly formatted columns. If a column has a numerical
+   dtype, *auto-sklearn* will not encode it and it will be passed directly to scikit-learn. If the
+   column has a categorical/boolean dtype, it will be encoded. If the column is of any other type
+   (Object or Timeseries), an error will be raised. For further details on how to properly encode
+   your data, you can check the Pandas Example
+   `Working with categorical data `_.
+   If you are working with time series, it is recommended that you follow this approach:
+   `Working with time data `_.
+
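+ A minimal sketch of the first method (the data and the feat_type labels are illustrative):
+
+ .. code:: python
+
+     import numpy as np
+     import autosklearn.classification
+
+     # First column is categorical (already encoded as integers), second is numerical
+     X_train = np.array([[0, 1.0], [1, 0.5], [0, 0.7], [1, 0.9]])
+     y_train = np.array([0, 1, 0, 1])
+
+     automl = autosklearn.classification.AutoSklearnClassifier()
+     automl.fit(X_train, y_train, feat_type=['Categorical', 'Numerical'])
+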
+ Regarding the targets (y_train/y_test), if the task involves a classification problem, the targets will be
+ automatically encoded. It is recommended to provide both y_train and y_test during fit, so that a common encoding
+ is created between these splits (if only y_train is provided during fit, the categorical encoder will not be able
+ to handle new classes that are exclusive to y_test). If the task is regression, no encoding happens on the
+ targets.
+
+.. collapse:: Model persistence
+
+ *auto-sklearn* is mostly a wrapper around scikit-learn. Therefore, it is
+ possible to follow the
+ `persistence Example `_
+ from scikit-learn.
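+
+ A minimal sketch using pickle (joblib works analogously; the file name and usage are illustrative):
+
+ .. code:: python
+
+     import pickle
+
+     # Persist the fitted estimator to disc ...
+     with open('automl.pkl', 'wb') as f:
+         pickle.dump(automl, f)
+
+     # ... and restore it later for prediction
+     with open('automl.pkl', 'rb') as f:
+         restored = pickle.load(f)
+     predictions = restored.predict(X_test)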
+
+.. collapse:: Vanilla auto-sklearn
+
+ In order to obtain *vanilla auto-sklearn* as used in `Efficient and Robust Automated Machine Learning
+ `_
+ set ``ensemble_size=1`` and ``initial_configurations_via_metalearning=0``:
+
+ .. code:: python
- import autosklearn.classification
- automl = autosklearn.classification.AutoSklearnClassifier(
- ensemble_size=1,
- initial_configurations_via_metalearning=0
- )
+ import autosklearn.classification
+ automl = autosklearn.classification.AutoSklearnClassifier(
+ ensemble_size=1,
+ initial_configurations_via_metalearning=0
+ )
-An ensemble of size one will result in always choosing the current best model
-according to its performance on the validation set. Setting the initial
-configurations found by meta-learning to zero makes *auto-sklearn* use the
-regular SMAC algorithm for suggesting new hyperparameter configurations.
+ An ensemble of size one will result in always choosing the current best model
+ according to its performance on the validation set. Setting the number of initial
+ configurations found by meta-learning to zero makes *auto-sklearn* use the
+ regular SMAC algorithm for suggesting new hyperparameter configurations.
diff --git a/doc/releases.rst b/doc/releases.rst
index c89418d851..7692329285 100644
--- a/doc/releases.rst
+++ b/doc/releases.rst
@@ -621,7 +621,7 @@ Version 0.4.0
minimization problem.
* Implements `#271 `_:
XGBoost is available again, even configuring the new dropout functionality.
-* New documentation section :ref:`inspecting_the_results`.
+* New documentation section :ref:`inspect`.
* Fixes `#444 `_:
Auto-sklearn now only loads models for refit which are actually relevant
for the ensemble.
diff --git a/setup.py b/setup.py
index a38fd20948..e355c0d1ec 100644
--- a/setup.py
+++ b/setup.py
@@ -41,7 +41,13 @@
"notebook",
"seaborn",
],
- "docs": ["sphinx", "sphinx-gallery", "sphinx_bootstrap_theme", "numpydoc"],
+ "docs": [
+ "sphinx",
+ "sphinx-gallery<=0.10.0",
+ "sphinx_bootstrap_theme",
+ "numpydoc",
+ "sphinx_toolbox",
+ ],
}
with open(os.path.join(HERE, 'autosklearn', '__version__.py')) as fh: