diff --git a/.travis.yml b/.travis.yml index bbe0c58f..c35efb10 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,7 +17,7 @@ install: # Useful for debugging any issues with conda - conda info -a - - conda create -q -n test-environment python=$PYTHON_VERSION numpy scipy pytest cython scikit-learn six joblib + - conda create -q -n test-environment python=$PYTHON_VERSION numpy scipy pytest cython scikit-learn joblib - source activate test-environment - make all diff --git a/README.rst b/README.rst index dbdfbaa6..a6854452 100644 --- a/README.rst +++ b/README.rst @@ -17,9 +17,9 @@ ranking in Python. Highlights: -- follows the `scikit-learn `_ API conventions +- follows the `scikit-learn `_ API conventions - supports natively both dense and sparse data representations -- computationally demanding parts implemented in `Cython `_ +- computationally demanding parts implemented in `Cython `_ Solvers supported: @@ -66,8 +66,8 @@ penalty on the News20 dataset (c.f., `Blondel et al. 2013 Dependencies ------------ -lightning requires Python >= 2.7, setuptools, Numpy >= 1.3, SciPy >= 0.7 and -scikit-learn >= 0.15. Building from source also requires Cython and a working C/C++ compiler. To run the tests you will also need pytest. +lightning requires Python >= 3.6, setuptools, Numpy >= 1.12, SciPy >= 0.19 and +scikit-learn >= 0.19. Building from source also requires Cython and a working C/C++ compiler. To run the tests you will also need pytest. Installation ------------ @@ -93,7 +93,7 @@ Documentation http://contrib.scikit-learn.org/lightning/ -On Github +On GitHub --------- https://github.com/scikit-learn-contrib/lightning diff --git a/appveyor.yml b/appveyor.yml index 34ebf83f..4f47661f 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,6 +1,5 @@ # AppVeyor.com is a Continuous Integration service to build and run tests under # Windows -# https://ci.appveyor.com/project/fabianp/lightning-bpc6r image: Visual Studio 2019 @@ -52,7 +51,7 @@ install: - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" - "python -m pip --version" - - "python -m pip install --timeout=60 numpy scipy cython pytest scikit-learn wheel six joblib" + - "python -m pip install --timeout=60 numpy scipy cython pytest scikit-learn wheel joblib" - "python setup.py bdist_wheel bdist_wininst" - ps: "ls dist" diff --git a/build_tools/move-conda-package.py b/build_tools/move-conda-package.py index fdc1057a..ecd7cbd4 100644 --- a/build_tools/move-conda-package.py +++ b/build_tools/move-conda-package.py @@ -5,7 +5,7 @@ import shutil from conda_build.config import config -with open(os.path.join(sys.argv[1], 'meta.yaml')) as f: +with open(os.path.join(sys.argv[1], 'meta.yaml'), encoding='utf-8') as f: name = yaml.load(f)['package']['name'] binary_package_glob = os.path.join( diff --git a/doc/sphinxext/gen_rst.py b/doc/sphinxext/gen_rst.py index d3c9bc72..7467e340 100644 --- a/doc/sphinxext/gen_rst.py +++ b/doc/sphinxext/gen_rst.py @@ -7,7 +7,6 @@ Files that generate images should start with 'plot' """ -from __future__ import division, print_function from time import time import ast import os @@ -20,37 +19,19 @@ import posixpath import subprocess import warnings -import six +from io import StringIO +import pickle +import urllib.request +import urllib.error +import urllib.parse +from urllib.error import HTTPError, URLError -# Try Python 2 first, otherwise load from Python 3 -try: - from StringIO import StringIO - import cPickle as pickle - import urllib2 as urllib - from urllib2 import HTTPError, URLError -except ImportError: - 
from io import StringIO - import pickle - import urllib.request - import urllib.error - import urllib.parse - from urllib.error import HTTPError, URLError - - -try: - # Python 2 built-in - execfile -except NameError: - def execfile(filename, global_vars=None, local_vars=None): - with open(filename, encoding='utf-8') as f: - code = compile(f.read(), filename, 'exec') - exec(code, global_vars, local_vars) -try: - basestring -except NameError: - basestring = str +def execfile(filename, global_vars=None, local_vars=None): + with open(filename, encoding='utf-8') as f: + code = compile(f.read(), filename, 'exec') + exec(code, global_vars, local_vars) import token import tokenize @@ -93,13 +74,8 @@ def flush(self): def _get_data(url): """Helper function to get data over http or from a local file""" if url.startswith('http://'): - # Try Python 2, use Python 3 on exception - try: - resp = urllib.urlopen(url) - encoding = resp.headers.dict.get('content-encoding', 'plain') - except AttributeError: - resp = urllib.request.urlopen(url) - encoding = resp.headers.get('content-encoding', 'plain') + resp = urllib.request.urlopen(url) + encoding = resp.headers.get('content-encoding', 'plain') data = resp.read() if encoding == 'plain': pass @@ -427,10 +403,8 @@ def resolve(self, cobj, this_url): def extract_docstring(filename, ignore_heading=False): """ Extract a module-level docstring, if any """ - if six.PY2: - lines = open(filename).readlines() - else: - lines = open(filename, encoding='utf-8').readlines() + with open(filename, encoding='utf-8') as f: + lines = f.readlines() start_row = 0 if lines[0].startswith('#!'): lines.pop(0) @@ -526,10 +500,8 @@ def generate_example_rst(app): def extract_line_count(filename, target_dir): # Extract the line count of a file example_file = os.path.join(target_dir, filename) - if six.PY2: - lines = open(example_file).readlines() - else: - lines = open(example_file, encoding='utf-8').readlines() + with open(example_file, encoding='utf-8') as f: + lines = f.readlines() start_row = 0 if lines and lines[0].startswith('#!'): lines.pop(0) @@ -620,7 +592,7 @@ def generate_dir_rst(directory, fhindex, example_dir, root_dir, plot_gallery, se %s -""" % open(os.path.join(src_dir, 'README.txt')).read()) +""" % open(os.path.join(src_dir, 'README.txt'), encoding='utf-8').read()) if not os.path.exists(target_dir): os.makedirs(target_dir) sorted_listdir = line_count_sort(os.listdir(src_dir), @@ -676,8 +648,8 @@ def make_thumbnail(in_fname, out_fname, width, height): import Image img = Image.open(in_fname) width_in, height_in = img.size - scale_w = width / float(width_in) - scale_h = height / float(height_in) + scale_w = width / width_in + scale_h = height / height_in if height_in * scale_w <= height: scale = scale_w @@ -727,7 +699,7 @@ class NameFinder(ast.NodeVisitor): """ def __init__(self): - super(NameFinder, self).__init__() + super().__init__() self.imported_names = {} self.accessed_names = set() @@ -964,11 +936,8 @@ def generate_file_rst(fname, target_dir, src_dir, root_dir, plot_gallery): f.flush() # save variables so we can later add links to the documentation - if six.PY2: - example_code_obj = identify_names(open(example_file).read()) - else: - example_code_obj = \ - identify_names(open(example_file, encoding='utf-8').read()) + with open(example_file, encoding='utf-8') as f: + example_code_obj = identify_names(f.read()) if example_code_obj: codeobj_fname = example_file[:-3] + '_codeobj.pickle' with open(codeobj_fname, 'wb') as fid: diff --git 
a/doc/sphinxext/numpy_ext/docscrape.py b/doc/sphinxext/numpy_ext/docscrape.py index 93097893..7e1ec56c 100644 --- a/doc/sphinxext/numpy_ext/docscrape.py +++ b/doc/sphinxext/numpy_ext/docscrape.py @@ -7,11 +7,7 @@ import re import pydoc from warnings import warn -# Try Python 2 first, otherwise load from Python 3 -try: - from StringIO import StringIO -except: - from io import StringIO +from io import StringIO class Reader(object): @@ -466,7 +462,7 @@ def __str__(self): out += '.. %s:: %s\n \n\n' % (roles.get(self._role, ''), func_name) - out += super(FunctionDoc, self).__str__(func_role=self._role) + out += super().__str__(func_role=self._role) return out diff --git a/doc/sphinxext/numpy_ext/docscrape_sphinx.py b/doc/sphinxext/numpy_ext/docscrape_sphinx.py index ca28300e..2cd9625f 100644 --- a/doc/sphinxext/numpy_ext/docscrape_sphinx.py +++ b/doc/sphinxext/numpy_ext/docscrape_sphinx.py @@ -117,7 +117,7 @@ def _str_section(self, name): def _str_see_also(self, func_role): out = [] if self['See Also']: - see_also = super(SphinxDocString, self)._str_see_also(func_role) + see_also = super()._str_see_also(func_role) out = ['.. seealso::', ''] out += self._str_indent(see_also[2:]) return out diff --git a/doc/sphinxext/numpy_ext/numpydoc.py b/doc/sphinxext/numpy_ext/numpydoc.py index 6ff03e0d..b2d506a7 100644 --- a/doc/sphinxext/numpy_ext/numpydoc.py +++ b/doc/sphinxext/numpy_ext/numpydoc.py @@ -17,9 +17,6 @@ """ -from __future__ import unicode_literals - -import sys # Only needed to check Python version import os import re import pydoc @@ -41,10 +38,7 @@ def mangle_docstrings(app, what, name, obj, options, lines, lines[:] = title_re.sub('', "\n".join(lines)).split("\n") else: doc = get_doc_object(obj, what, "\n".join(lines), config=cfg) - if sys.version_info[0] < 3: - lines[:] = unicode(doc).splitlines() - else: - lines[:] = str(doc).splitlines() + lines[:] = str(doc).splitlines() if app.config.numpydoc_edit_link and hasattr(obj, '__name__') and \ obj.__name__: @@ -104,12 +98,8 @@ def setup(app, get_doc_object_=get_doc_object): global get_doc_object get_doc_object = get_doc_object_ - if sys.version_info[0] < 3: - app.connect(b'autodoc-process-docstring', mangle_docstrings) - app.connect(b'autodoc-process-signature', mangle_signature) - else: - app.connect('autodoc-process-docstring', mangle_docstrings) - app.connect('autodoc-process-signature', mangle_signature) + app.connect('autodoc-process-docstring', mangle_docstrings) + app.connect('autodoc-process-signature', mangle_signature) app.add_config_value('numpydoc_edit_link', None, False) app.add_config_value('numpydoc_use_plots', None, False) app.add_config_value('numpydoc_show_class_members', True, True) @@ -135,7 +125,7 @@ class ManglingDomainBase(object): directive_mangling_map = {} def __init__(self, *a, **kw): - super(ManglingDomainBase, self).__init__(*a, **kw) + super().__init__(*a, **kw) self.wrap_mangling_directives() def wrap_mangling_directives(self): diff --git a/examples/plot_sparse_non_linear.py b/examples/plot_sparse_non_linear.py index 9d564682..b0624cc4 100644 --- a/examples/plot_sparse_non_linear.py +++ b/examples/plot_sparse_non_linear.py @@ -33,20 +33,20 @@ class SparseNonlinearClassifier(CDClassifier): def __init__(self, gamma=1e-2, C=1, alpha=1): self.gamma = gamma - super(SparseNonlinearClassifier, self).__init__(C=C, - alpha=alpha, - loss="squared_hinge", - penalty="l1") + super().__init__(C=C, + alpha=alpha, + loss="squared_hinge", + penalty="l1") def fit(self, X, y): K = rbf_kernel(X, gamma=self.gamma) self.X_train_ = X - 
super(SparseNonlinearClassifier, self).fit(K, y) + super().fit(K, y) return self def decision_function(self, X): K = rbf_kernel(X, self.X_train_, gamma=self.gamma) - return super(SparseNonlinearClassifier, self).decision_function(K) + return super().decision_function(K) def gen_non_lin_separable_data(): diff --git a/lightning/impl/adagrad.py b/lightning/impl/adagrad.py index e467be9e..f39bbfe1 100644 --- a/lightning/impl/adagrad.py +++ b/lightning/impl/adagrad.py @@ -4,7 +4,6 @@ import numpy as np from sklearn.utils import check_random_state -from six.moves import xrange from .base import BaseClassifier, BaseRegressor from .dataset_fast import get_dataset @@ -38,7 +37,7 @@ def _fit(self, X, Y): loss = self._get_loss() n_calls = n_samples if self.n_calls is None else self.n_calls - for i in xrange(n_vectors): + for i in range(n_vectors): _adagrad_fit(self, ds, Y[:, i], self.coef_[i], self.g_sum_[i], self.g_norms_[i], loss, self.eta, delta, alpha1, alpha2, self.n_iter, self.shuffle, self.callback, diff --git a/lightning/impl/adagrad_fast.pyx b/lightning/impl/adagrad_fast.pyx index 08da2da6..bc812b7d 100644 --- a/lightning/impl/adagrad_fast.pyx +++ b/lightning/impl/adagrad_fast.pyx @@ -26,7 +26,7 @@ cdef double _pred(double* data, cdef int j, jj cdef double dot = 0 - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] dot += w[j] * data[jj] @@ -64,7 +64,7 @@ cpdef _proj_elastic_all(double eta, np.ndarray[double, ndim=1] w): cdef int n_features = w.shape[0] cdef int j - for j in xrange(n_features): + for j in range(n_features): if g_norms[j] != 0: w[j] = _proj_elastic(eta, t, g_sum[j], g_norms[j], alpha1, alpha2, delta) @@ -107,13 +107,13 @@ def _adagrad_fit(self, cdef double* w = coef.data t = 1 - for it in xrange(n_iter): + for it in range(n_iter): # Shuffle sample indices. if shuffle: rng.shuffle(sindices) - for ii in xrange(n_samples): + for ii in range(n_samples): i = sindices[ii] # Retrieve sample i. @@ -121,7 +121,7 @@ def _adagrad_fit(self, # Update w lazily. if t > 1: - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] if g_norms[j] != 0: w[j] = _proj_elastic(eta, t - 1, g_sum[j], g_norms[j], @@ -135,14 +135,14 @@ def _adagrad_fit(self, # Update g_sum and g_norms. if scale != 0: - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] tmp = scale * data[jj] g_sum[j] += tmp g_norms[j] += tmp * tmp # Update w by naive implementation: very slow. 
- # for j in xrange(n_features): + # for j in range(n_features): # w[j] = _proj_elastic(eta, t, g_sum[j], g_norms[j], alpha1, # alpha2, delta) diff --git a/lightning/impl/base.py b/lightning/impl/base.py index 5c6c308c..c08ee52e 100644 --- a/lightning/impl/base.py +++ b/lightning/impl/base.py @@ -30,9 +30,9 @@ def n_nonzero(self, percentage=False): if percentage: if hasattr(self, "support_vectors_") and \ self.support_vectors_ is not None: - n_nz /= float(self.n_samples_) + n_nz /= self.n_samples_ else: - n_nz /= float(coef.shape[1]) + n_nz /= coef.shape[1] return n_nz diff --git a/lightning/impl/dataset_fast.pyx b/lightning/impl/dataset_fast.pyx index 16b16572..75d776ec 100644 --- a/lightning/impl/dataset_fast.pyx +++ b/lightning/impl/dataset_fast.pyx @@ -84,7 +84,7 @@ cdef class ContiguousDataset(RowDataset): cdef int i cdef int n_features = X.shape[1] self.indices = stdlib.malloc(sizeof(int) * n_features) - for j in xrange(n_features): + for j in range(n_features): self.indices[j] = j def __dealloc__(self): @@ -116,7 +116,7 @@ cdef class FortranDataset(ColumnDataset): cdef int i cdef int n_samples = X.shape[0] self.indices = stdlib.malloc(sizeof(int) * n_samples) - for i in xrange(n_samples): + for i in range(n_samples): self.indices[i] = i def __dealloc__(self): diff --git a/lightning/impl/datasets/samples_generator.py b/lightning/impl/datasets/samples_generator.py index e3fee8d0..fc7b6ff4 100644 --- a/lightning/impl/datasets/samples_generator.py +++ b/lightning/impl/datasets/samples_generator.py @@ -2,7 +2,6 @@ import numpy as np import scipy.sparse as sp -from six.moves import xrange from sklearn.utils.extmath import safe_sparse_dot from sklearn.utils import check_random_state @@ -27,7 +26,7 @@ def _make_nn_regression(n_samples=100, n_features=100, n_informative=10, n = 0 ind = np.arange(n_features) - for i in xrange(n_samples): + for i in range(n_samples): generator.shuffle(ind) col[n:n+n_informative] = ind[:n_informative] n += n_informative diff --git a/lightning/impl/dual_cd.py b/lightning/impl/dual_cd.py index 98b85f15..ec61775e 100644 --- a/lightning/impl/dual_cd.py +++ b/lightning/impl/dual_cd.py @@ -13,7 +13,6 @@ import numpy as np from sklearn.preprocessing import add_dummy_feature -from six.moves import xrange from .base import BaseClassifier, BaseRegressor from .dataset_fast import get_dataset @@ -136,7 +135,7 @@ def fit(self, X, y): self.dual_coef_ = np.zeros((n_vectors, n_samples), dtype=np.float64) - for i in xrange(n_vectors): + for i in range(n_vectors): if self.criterion == "accuracy": _dual_cd(self, self.coef_[i], self.dual_coef_[i], ds, Y[:, i], self.permute, @@ -257,7 +256,7 @@ def fit(self, X, y): self.dual_coef_ = np.zeros((n_vectors, n_samples), dtype=np.float64) - for i in xrange(n_vectors): + for i in range(n_vectors): _dual_cd_svr(self, self.coef_[i], self.dual_coef_[i], ds, Y[:, i], self.permute, self.C, self.epsilon, self._get_loss(), diff --git a/lightning/impl/dual_cd_fast.pyx b/lightning/impl/dual_cd_fast.pyx index 2dd59dc5..c0cf5314 100644 --- a/lightning/impl/dual_cd_fast.pyx +++ b/lightning/impl/dual_cd_fast.pyx @@ -79,10 +79,10 @@ cdef _sqnorms(RowDataset X, cdef int* indices cdef int n_nz - for i in xrange(n_samples): + for i in range(n_samples): X.get_row_ptr(i, &indices, &data, &n_nz) dot = 0 - for jj in xrange(n_nz): + for jj in range(n_nz): dot += data[jj] * data[jj] sqnorms[i] = dot @@ -150,7 +150,7 @@ def _dual_cd(self, Q_bar_diag += D_ii - for t in xrange(max_iter): + for t in range(max_iter): if permute: rs.shuffle(A[:active_size]) @@ -170,7 
+170,7 @@ def _dual_cd(self, # Compute ith element of the gradient. # G = y_i * np.dot(w, X[i]) - 1 + D_ii * alpha_i G = 0 - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] G += w[j] * data[jj] G = y_i * G - 1 + D_ii * alpha_i @@ -207,7 +207,7 @@ def _dual_cd(self, # Update the primal coefficients. step = (alpha_i - alpha_old) * y_i - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] w[j] += step * data[jj] @@ -306,7 +306,7 @@ def _dual_cd_auc(self, cdef int n_pos = 0 cdef int n_neg = 0 - for i in xrange(n_samples): + for i in range(n_samples): if y[i] == 1: pos[n_pos] = i n_pos += 1 @@ -319,11 +319,11 @@ def _dual_cd_auc(self, alpha = np.zeros(n_pos * n_neg, dtype=np.float64) # Learning - for t in xrange(max_iter): + for t in range(max_iter): if verbose >= 1: print("\nIteration", t) - for tt in xrange(n_samples): + for tt in range(n_samples): r = rs.randint(n_pos - 1) s = rs.randint(n_neg - 1) p = pos[r] @@ -340,11 +340,11 @@ def _dual_cd_auc(self, # Gradient G = 0 - for jj in xrange(n_nz1): + for jj in range(n_nz1): j = indices1[jj] G += w[j] * data1[jj] - for jj in xrange(n_nz2): + for jj in range(n_nz2): j = indices2[jj] G -= w[j] * data2[jj] @@ -374,11 +374,11 @@ def _dual_cd_auc(self, # Update w. step = (alpha_k - alpha_old) - for jj in xrange(n_nz1): + for jj in range(n_nz1): j = indices1[jj] w[j] += step * data1[jj] - for jj in xrange(n_nz2): + for jj in range(n_nz2): j = indices2[jj] w[j] -= step * data2[jj] @@ -446,7 +446,7 @@ def _dual_cd_svr(self, # We store alphas in the form # alpha[i] = alpha_+[i] - alpha_-[i] # so we need to convert representation. - for i in xrange(n_samples): + for i in range(n_samples): ii = i * 2 if alpha[i] > 0: alpha_[ii] = alpha[i] @@ -454,7 +454,7 @@ def _dual_cd_svr(self, alpha_[ii + 1] = -alpha[i] # Learning... - for t in xrange(max_iter): + for t in range(max_iter): if verbose >= 1: print("\nIteration", t) @@ -463,7 +463,7 @@ def _dual_cd_svr(self, violation_sum = 0 - for s in xrange(n_samples * 2): + for s in range(n_samples * 2): i = A[s % n_samples] # Retrieve row. @@ -471,7 +471,7 @@ def _dual_cd_svr(self, # Compute prediction. pred = 0 - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] pred += w[j] * data[jj] @@ -511,7 +511,7 @@ def _dual_cd_svr(self, # Update the primal coefficients. 
if diff != 0: - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] w[j] += diff * data[jj] @@ -545,7 +545,7 @@ def _dual_cd_svr(self, if verbose >= 1: print() - for i in xrange(n_samples): + for i in range(n_samples): ii = i * 2 alpha[i] = alpha_[ii] - alpha_[ii + 1] diff --git a/lightning/impl/fista.py b/lightning/impl/fista.py index 957e7d60..1c61def9 100644 --- a/lightning/impl/fista.py +++ b/lightning/impl/fista.py @@ -4,7 +4,6 @@ import numpy as np from sklearn.utils.extmath import safe_sparse_dot -from six.moves import xrange from .base import BaseClassifier, BaseRegressor @@ -75,7 +74,7 @@ def _fit(self, X, y, n_vectors): L = 1.0 t = 1.0 - for it in xrange(self.max_iter): + for it in range(self.max_iter): if self.verbose >= 1: print("Iter", it + 1, obj) @@ -92,7 +91,7 @@ def _fit(self, X, y, n_vectors): if self.max_steps > 0: objb = self._get_objective(df, y, loss) - for tt in xrange(self.max_steps): + for tt in range(self.max_steps): # Solve coefx = coef - G / L coefx = penalty.projection(coefx, self.alpha, L) diff --git a/lightning/impl/loss_fast.pyx b/lightning/impl/loss_fast.pyx index 366c23ba..bef51495 100644 --- a/lightning/impl/loss_fast.pyx +++ b/lightning/impl/loss_fast.pyx @@ -27,11 +27,11 @@ cdef double _l2_norm_sums(RowDataset X, int squared): cdef int* indices cdef int n_nz - for i in xrange(n_samples): + for i in range(n_samples): X.get_row_ptr(i, &indices, &data, &n_nz) norm = 0 - for jj in xrange(n_nz): + for jj in range(n_nz): norm += data[jj] * data[jj] if squared: @@ -59,11 +59,11 @@ cdef class Squared: cdef int i, k, j, jj cdef double residual - for i in xrange(n_samples): - for k in xrange(n_vectors): + for i in range(n_samples): + for k in range(n_vectors): residual = y[i, k] - df[i, k] X.get_row_ptr(i, &indices, &data, &n_nz) - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] G[k, j] -= residual * data[jj] @@ -79,8 +79,8 @@ cdef class Squared: obj = 0 - for i in xrange(n_samples): - for k in xrange(n_vectors): + for i in range(n_samples): + for k in range(n_vectors): residual = y[i, k] - df[i, k] obj += residual * residual @@ -107,13 +107,13 @@ cdef class SquaredHinge: cdef int i, k, j, jj cdef double tmp - for i in xrange(n_samples): - for k in xrange(n_vectors): + for i in range(n_samples): + for k in range(n_vectors): tmp = 1 - y[i, k] * df[i, k] if tmp > 0: tmp *= 2 * y[i, k] X.get_row_ptr(i, &indices, &data, &n_nz) - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] G[k, j] -= tmp * data[jj] @@ -129,8 +129,8 @@ cdef class SquaredHinge: obj = 0 - for i in xrange(n_samples): - for k in xrange(n_vectors): + for i in range(n_samples): + for k in range(n_vectors): value = max(1 - y[i, k] * df[i, k], 0) obj += value * value @@ -157,17 +157,17 @@ cdef class MulticlassSquaredHinge: cdef int i, k, j, jj cdef double update, tmp - for i in xrange(n_samples): + for i in range(n_samples): X.get_row_ptr(i, &indices, &data, &n_nz) - for k in xrange(n_vectors): + for k in range(n_vectors): if y[i] == k: continue update = max(1 - df[i, y[i]] + df[i, k], 0) if update != 0: update *= 2 - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] tmp = update * data[jj] G[y[i], j] -= tmp @@ -185,8 +185,8 @@ cdef class MulticlassSquaredHinge: obj = 0 - for i in xrange(n_samples): - for k in xrange(n_vectors): + for i in range(n_samples): + for k in range(n_vectors): if y[i] == k: continue value = max(1 - df[i, y[i]] + df[i, k], 0) @@ -222,11 +222,11 @@ cdef class MulticlassLog: cdef np.ndarray[double, ndim=1, mode='c'] scores 
scores = np.zeros(n_vectors, dtype=np.float64) - for i in xrange(n_samples): + for i in range(n_samples): X.get_row_ptr(i, &indices, &data, &n_nz) Z = 0 - for k in xrange(n_vectors): + for k in range(n_vectors): tmp = df[i, k] - df[i, y[i]] if self.margin and k != y[i]: tmp += 1 @@ -234,12 +234,12 @@ cdef class MulticlassLog: scores[k] = tmp Z += tmp - for k in xrange(n_vectors): + for k in range(n_vectors): tmp = scores[k] / Z if k == y[i]: tmp -= 1 - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] G[k, j] += tmp * data[jj] @@ -255,9 +255,9 @@ cdef class MulticlassLog: obj = 0 - for i in xrange(n_samples): + for i in range(n_samples): s = 1 - for k in xrange(n_vectors): + for k in range(n_vectors): tmp = df[i, k] - df[i, y[i]] if self.margin and k != y[i]: tmp += 1 diff --git a/lightning/impl/penalty.py b/lightning/impl/penalty.py index 56614fdf..d8445598 100644 --- a/lightning/impl/penalty.py +++ b/lightning/impl/penalty.py @@ -62,7 +62,7 @@ def project_simplex(v, z=1): ind = np.arange(n_features) + 1 cond = u - cssv / ind > 0 rho = ind[cond][-1] - theta = cssv[cond][-1] / float(rho) + theta = cssv[cond][-1] / rho w = np.maximum(v - theta, 0) return w diff --git a/lightning/impl/prank_fast.pyx b/lightning/impl/prank_fast.pyx index 50bec596..598c43f6 100644 --- a/lightning/impl/prank_fast.pyx +++ b/lightning/impl/prank_fast.pyx @@ -20,7 +20,7 @@ cdef int _predict(double dot, cdef int r cdef int y_hat = 0 - for r in xrange(n_classes): + for r in range(n_classes): if dot - b[r] < 0: y_hat = r break @@ -35,7 +35,7 @@ cdef int _update_thresholds(double dot, cdef int tau = 0 cdef int r, yr - for r in xrange(n_classes - 1): + for r in range(n_classes - 1): if y <= r: yr = -1 else: @@ -72,11 +72,11 @@ def _prank_fit(np.ndarray[double, ndim=1, mode='c'] w, cdef np.ndarray[int, ndim=1] ind ind = np.arange(n_samples, dtype=np.int32) - for n in xrange(n_iter): + for n in range(n_iter): if shuffle: rs.shuffle(ind) - for ii in xrange(n_samples): + for ii in range(n_samples): i = ind[ii] # Retrieve row. @@ -84,7 +84,7 @@ def _prank_fit(np.ndarray[double, ndim=1, mode='c'] w, # Compute dot product. dot = 0 - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] dot += w[j] * data[jj] @@ -97,7 +97,7 @@ def _prank_fit(np.ndarray[double, ndim=1, mode='c'] w, tau = _update_thresholds(dot, b, y[i], n_classes) # Update w. - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] w[j] += tau * data[jj] @@ -120,16 +120,16 @@ def _prank_fit_kernel(np.ndarray[double, ndim=1, mode='c'] alpha, cdef np.ndarray[int, ndim=1] ind ind = np.arange(n_samples, dtype=np.int32) - for n in xrange(n_iter): + for n in range(n_iter): if shuffle: rs.shuffle(ind) - for ii in xrange(n_samples): + for ii in range(n_samples): i = ind[ii] # Compute dot product. 
dot = 0 - for j in xrange(n_samples): + for j in range(n_samples): dot += alpha[j] * K[i, j] y_hat = _predict(dot, b, n_classes) @@ -152,5 +152,5 @@ def _prank_predict(np.ndarray[double, ndim=1, mode='c'] dot, cdef int n_samples = dot.shape[0] cdef int i - for i in xrange(n_samples): + for i in range(n_samples): out[i] = _predict(dot[i], b, n_classes) diff --git a/lightning/impl/primal_cd.py b/lightning/impl/primal_cd.py index d17cdd8d..c3981f68 100644 --- a/lightning/impl/primal_cd.py +++ b/lightning/impl/primal_cd.py @@ -13,7 +13,6 @@ import numpy as np from joblib import Parallel, delayed -from six.moves import xrange from .base import BaseClassifier from .base import BaseRegressor @@ -299,7 +298,7 @@ def fit(self, X, y): n_pos = np.zeros(n_vectors) vinit = self.C / self.C_init * np.ones_like(n_pos) - for k in xrange(n_vectors): + for k in range(n_vectors): n_pos[k] = np.sum(Y[:, k] == 1) vinit[k] *= self.violation_init_.get(k, 0) n_neg = n_samples - n_pos @@ -315,7 +314,7 @@ def fit(self, X, y): self.shrinking, vinit[k], rs, tol[k], self.callback, self.n_calls, self.verbose) - for k in xrange(n_vectors)) + for k in range(n_vectors)) model = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(jobs) viol, coefs, errors = zip(*model) self.coef_ = np.asarray(coefs) @@ -345,7 +344,7 @@ def fit(self, X, y): rs, self.tol, self.callback, self.n_calls, self.verbose ) - for k in xrange(n_vectors)) + for k in range(n_vectors)) model = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(jobs) viol, coefs, errors = zip(*model) self.coef_ = np.asarray(coefs) @@ -472,7 +471,7 @@ def fit(self, X, y): else: penalty = self._get_penalty() vinit = np.asarray([self.violation_init_.get(k, 0) - for k in xrange(n_vectors)]) * self.C / self.C_init + for k in range(n_vectors)]) * self.C / self.C_init jobs = (delayed(_primal_cd)(self, self.coef_, self.errors_, ds, y, Y, k, False, @@ -484,7 +483,7 @@ def fit(self, X, y): self.shrinking, vinit[k], rs, self.tol, self.callback, self.n_calls, self.verbose) - for k in xrange(n_vectors)) + for k in range(n_vectors)) model = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(jobs) viol, self.coef_, self.error_ = zip(*model) diff --git a/lightning/impl/primal_cd_fast.pyx b/lightning/impl/primal_cd_fast.pyx index 8211018c..4a2731c1 100644 --- a/lightning/impl/primal_cd_fast.pyx +++ b/lightning/impl/primal_cd_fast.pyx @@ -159,10 +159,10 @@ cdef class LossFunction: cdef int* indices cdef int n_nz - for j in xrange(n_features): + for j in range(n_features): X.get_column_ptr(j, &indices, &data, &n_nz) - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] out[j] += scale * data[ii] * data[ii] @@ -343,7 +343,7 @@ cdef class LossFunction: L = 0 Lpp_max = -DBL_MAX - for k in xrange(n_vectors): + for k in range(n_vectors): self.derivatives(j, C, indices, data, n_nz, y_ptr, b_ptr, &g[k], &Lpp_tmp, &L_tmp) L += L_tmp @@ -364,7 +364,7 @@ cdef class LossFunction: g_norm = 0 R_j = 0 - for k in xrange(n_vectors): + for k in range(n_vectors): g_norm += g[k] * g[k] R_j += w[k, j] * w[k, j] @@ -387,7 +387,7 @@ cdef class LossFunction: # Compute vector to be projected and scaling factor. scaling = 0 - for k in xrange(n_vectors): + for k in range(n_vectors): d_old[k] = 0 d[k] = w[k, j] - g[k] / Lpp_max scaling += d[k] * d[k] @@ -400,7 +400,7 @@ cdef class LossFunction: # Project (proximity operator). delta = 0 dmax = -DBL_MAX - for k in xrange(n_vectors): + for k in range(n_vectors): # Difference between new and old solution. 
d[k] = scaling * d[k] - w[k, j] delta += d[k] * g[k] @@ -424,7 +424,7 @@ cdef class LossFunction: y_ptr = Y.data b_ptr = b.data - for k in xrange(n_vectors): + for k in range(n_vectors): z_diff = d_old[k] - d[k] self.update(j, z_diff, C, indices, data, n_nz, y_ptr, b_ptr, &L_tmp) @@ -441,7 +441,7 @@ cdef class LossFunction: # Compute regularization term. R_j_new = 0 - for k in xrange(n_vectors): + for k in range(n_vectors): tmp = w[k, j] + d[k] R_j_new += tmp * tmp R_j_new = sqrt(R_j_new) @@ -456,13 +456,13 @@ cdef class LossFunction: break delta *= self.beta - for k in xrange(n_vectors): + for k in range(n_vectors): d_old[k] = d[k] d[k] *= self.beta step += 1 # Update solution - for k in xrange(n_vectors): + for k in range(n_vectors): w[k, j] += d[k] # Recompute errors if necessary. @@ -474,7 +474,7 @@ cdef class LossFunction: b_ptr = b.data w_ptr = w.data - for k in xrange(n_vectors): + for k in range(n_vectors): self.recompute(X, y_ptr, w_ptr, b_ptr) y_ptr += n_samples b_ptr += n_samples @@ -567,7 +567,7 @@ cdef class Squared(LossFunction): # Objective value L[0] = 0 - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] tmp = data[ii] * C Lpp[0] += data[ii] * tmp @@ -592,7 +592,7 @@ cdef class Squared(LossFunction): # New objective value L_new[0] = 0 - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] # Update residuals. b[i] -= z_diff * data[ii] @@ -634,7 +634,7 @@ cdef class SquaredHinge(LossFunction): # Objective value L[0] = 0 - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] val = data[ii] * y[i] @@ -663,7 +663,7 @@ cdef class SquaredHinge(LossFunction): # New objective value L_new[0] = 0 - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] b_new = b[i] + z_diff * data[ii] * y[i] # b[i] = 1 - y[i] * np.dot(w, X[i]) @@ -701,15 +701,15 @@ cdef class SquaredHinge(LossFunction): # Objective value L[0] = 0 - for k in xrange(n_vectors): + for k in range(n_vectors): # First derivative with respect to w_jk g[k] = 0 # Second derivative with respect to w_jk^2 h[k] = 0 - for k in xrange(n_vectors): + for k in range(n_vectors): - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] if y[i] == k: @@ -731,7 +731,7 @@ cdef class SquaredHinge(LossFunction): b_ptr += n_samples Lpp_max[0] = -DBL_MAX - for k in xrange(n_vectors): + for k in range(n_vectors): g[k] *= 2 Lpp_max[0] = max(Lpp_max[0], h[k]) @@ -758,13 +758,13 @@ cdef class SquaredHinge(LossFunction): # New objective value L_new[0] = 0 - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] b_ptr = b + i tmp = d_old[y[i]] - d[y[i]] - for k in xrange(n_vectors): + for k in range(n_vectors): if k != y[i]: # b_ptr[0] = b[k, i] b_new = b_ptr[0] + (tmp - (d_old[k] - d[k])) * data[ii] @@ -827,7 +827,7 @@ cdef class SmoothHinge(LossFunction): # Objective value L[0] = 0 - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] val = data[ii] * y[i] @@ -859,7 +859,7 @@ cdef class SmoothHinge(LossFunction): # New objective value L_new[0] = 0 - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] b_new = b[i] + z_diff * data[ii] * y[i] # b[i] = 1 - y[i] * np.dot(w, X[i]) @@ -906,7 +906,7 @@ cdef class ModifiedHuber(LossFunction): # Objective value L[0] = 0 - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] val = data[ii] * y[i] @@ -938,7 +938,7 @@ cdef class ModifiedHuber(LossFunction): L_new[0] = 0 - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] b_new = b[i] + z_diff * data[ii] * y[i] b[i] = b_new @@ -984,7 +984,7 
@@ cdef class Log(LossFunction): # Objective value L[0] = 0 - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] val = data[ii] * y[i] @@ -1012,7 +1012,7 @@ cdef class Log(LossFunction): # New objective value L_new[0] = 0 - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] b[i] /= exp(z_diff * data[ii] * y[i]) exppred = 1 + 1 / b[i] @@ -1043,11 +1043,11 @@ cdef class Log(LossFunction): # Objective value L[0] = 0 - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] b_ptr = b + i Z[i] = 0 # Normalization term - for k in xrange(n_vectors): + for k in range(n_vectors): # b_ptr[0] = b[k, i] Z[i] += b_ptr[0] b_ptr += n_samples @@ -1057,13 +1057,13 @@ cdef class Log(LossFunction): Lpp_max[0] = -DBL_MAX b_ptr = b - for k in xrange(n_vectors): + for k in range(n_vectors): # First derivatives (k th element of the partial gradient) g[k] = 0 # Second derivative Lpp = 0 - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] if Z[i] == 0: @@ -1104,13 +1104,13 @@ cdef class Log(LossFunction): # New objective value L_new[0] = 0 - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] b_ptr = b + i tmp = d_old[y[i]] - d[y[i]] Z[i] = 0 - for k in xrange(n_vectors): + for k in range(n_vectors): # b_ptr[0] = b[k, i] if y[i] != k: b_ptr[0] *= exp((d[k] - d_old[k] + tmp) * data[ii]) @@ -1134,17 +1134,17 @@ cdef class Log(LossFunction): cdef int* indices cdef int n_nz - for i in xrange(n_samples): + for i in range(n_samples): b[i] = 0 - for j in xrange(n_features): + for j in range(n_features): X.get_column_ptr(j, &indices, &data, &n_nz) - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] b[i] += data[ii] * w[j] - for i in xrange(n_samples): + for i in range(n_samples): b[i] = exp(y[i] * b[i]) cdef void recompute_mc(self, @@ -1163,27 +1163,27 @@ cdef class Log(LossFunction): cdef int* indices cdef int n_nz - for i in xrange(n_samples): - for k in xrange(n_vectors): + for i in range(n_samples): + for k in range(n_vectors): b[k, i] = 0 - for j in xrange(n_features): + for j in range(n_features): X.get_column_ptr(j, &indices, &data, &n_nz) - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] - for k in xrange(n_vectors): + for k in range(n_vectors): tmp = w[k, j] * data[ii] if k == y[i]: - for k2 in xrange(n_vectors): + for k2 in range(n_vectors): if k2 != y[i]: b[k2, i] -= tmp else: b[k, i] += tmp - for i in xrange(n_samples): - for k in xrange(n_vectors): + for i in range(n_samples): + for k in range(n_vectors): if k != y[i]: b[k, i] = exp(b[k, i]) else: @@ -1301,7 +1301,7 @@ def _primal_cd(self, buf = np.zeros(n_samples, dtype=np.float64) buf_ptr = buf.data - for t in xrange(max_iter): + for t in range(max_iter): # Permute features (cyclic case only) if permute: rs.shuffle(active_set[:active_size]) diff --git a/lightning/impl/primal_newton.py b/lightning/impl/primal_newton.py index fbaa0b0d..6bad93cb 100644 --- a/lightning/impl/primal_newton.py +++ b/lightning/impl/primal_newton.py @@ -18,7 +18,6 @@ from sklearn.preprocessing import LabelBinarizer from sklearn.utils import check_random_state from sklearn.metrics.pairwise import pairwise_kernels -from six.moves import xrange from .base import BaseClassifier @@ -111,7 +110,7 @@ def _fit_binary(self, K, y, rs): sv[:1000] = True rs.shuffle(sv) - for t in xrange(1, self.max_iter + 1): + for t in range(1, self.max_iter + 1): if self.verbose: print("Iteration", t, "#SV=", np.sum(sv)) @@ -180,7 +179,7 @@ def fit(self, X, y): K = pairwise_kernels(X, filter_params=True, 
n_jobs=self.n_jobs, metric=self.kernel, **self._kernel_params()) - coef = [self._fit_binary(K, Y[:, i], rs) for i in xrange(n_vectors)] + coef = [self._fit_binary(K, Y[:, i], rs) for i in range(n_vectors)] self.coef_ = np.array(coef) self.intercept_ = np.zeros(n_vectors, dtype=np.float64) diff --git a/lightning/impl/randomkit/tests/test_random.py b/lightning/impl/randomkit/tests/test_random.py index fa2ed98e..f1f70c1f 100644 --- a/lightning/impl/randomkit/tests/test_random.py +++ b/lightning/impl/randomkit/tests/test_random.py @@ -2,12 +2,11 @@ import numpy as np from lightning.impl.randomkit import RandomState -from six.moves import xrange def test_randint(): rs = RandomState(seed=0) - vals = [rs.randint(10) for t in xrange(10000)] + vals = [rs.randint(10) for t in range(10000)] np.testing.assert_almost_equal(np.mean(vals), 5.018) diff --git a/lightning/impl/sag.py b/lightning/impl/sag.py index dc14226a..1c8c1256 100644 --- a/lightning/impl/sag.py +++ b/lightning/impl/sag.py @@ -5,8 +5,6 @@ import numpy as np -from six.moves import xrange - from .base import BaseClassifier, BaseRegressor from .dataset_fast import get_dataset from .sag_fast import _sag_fit, get_auto_step_size @@ -77,7 +75,7 @@ def _fit(self, X, Y, sample_weight): self.coef_scale_ = np.ones(n_vectors, dtype=np.float64) grad = np.zeros((n_vectors, n_samples), dtype=np.float64) - for i in xrange(n_vectors): + for i in range(n_vectors): y = Y[:, i] _sag_fit(self, ds, y, self.coef_[i], self.coef_scale_[i:], grad[i], @@ -211,7 +209,7 @@ class SAGAClassifier(SAGClassifier): def __init__(self, eta='auto', alpha=1.0, beta=0.0, loss="smooth_hinge", penalty=None, gamma=1.0, max_iter=10, n_inner=1.0, tol=1e-3, verbose=0, callback=None, random_state=None): - super(SAGAClassifier, self).__init__( + super().__init__( eta=eta, alpha=alpha, beta=beta, loss=loss, penalty=penalty, gamma=gamma, max_iter=max_iter, n_inner=n_inner, tol=tol, verbose=verbose, callback=callback, random_state=random_state) @@ -335,7 +333,7 @@ class SAGARegressor(SAGRegressor): def __init__(self, eta='auto', alpha=1.0, beta=0.0, loss="smooth_hinge", penalty="l1", max_iter=10, n_inner=1.0, tol=1e-3, verbose=0, callback=None, random_state=None): - super(SAGARegressor, self).__init__( + super().__init__( eta=eta, alpha=alpha, beta=beta, loss=loss, penalty=penalty, gamma=1.0, max_iter=max_iter, n_inner=n_inner, tol=tol, verbose=verbose, callback=callback, random_state=random_state) diff --git a/lightning/impl/sag_fast.pyx b/lightning/impl/sag_fast.pyx index 6f3b464d..1e252e20 100644 --- a/lightning/impl/sag_fast.pyx +++ b/lightning/impl/sag_fast.pyx @@ -185,7 +185,7 @@ cdef double _pred(double* data, cdef int j, jj cdef double dot = 0 - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] dot += w[j] * data[jj] @@ -198,7 +198,7 @@ cdef void _add(double* data, double* w): cdef int jj, j - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] w[j] += scale * data[jj] diff --git a/lightning/impl/sdca.py b/lightning/impl/sdca.py index 38802fcc..060856ca 100644 --- a/lightning/impl/sdca.py +++ b/lightning/impl/sdca.py @@ -4,7 +4,6 @@ import numpy as np from sklearn.utils import check_random_state -from six.moves import xrange from .base import BaseClassifier, BaseRegressor from .dataset_fast import get_dataset @@ -55,7 +54,7 @@ def _fit(self, X, Y): rng = check_random_state(self.random_state) loss = self._get_loss() - for i in xrange(n_vectors): + for i in range(n_vectors): y = Y[:, i] if self.l1_ratio == 1.0: diff --git a/lightning/impl/sdca_fast.pyx 
b/lightning/impl/sdca_fast.pyx index 6aead728..025e36a6 100644 --- a/lightning/impl/sdca_fast.pyx +++ b/lightning/impl/sdca_fast.pyx @@ -27,7 +27,7 @@ cdef _add_l2(double* data, cdef int j, jj cdef double delta, w_old - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] delta = update * data[jj] w_old = w[j] @@ -57,7 +57,7 @@ cdef _add_elastic(double* data, cdef int j, jj cdef double delta, w_old, v_old - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] delta = update * data[jj] v_old = v[j] @@ -80,10 +80,10 @@ cdef _sqnorms(RowDataset X, cdef int* indices cdef int n_nz - for i in xrange(n_samples): + for i in range(n_samples): X.get_row_ptr(i, &indices, &data, &n_nz) dot = 0 - for jj in xrange(n_nz): + for jj in range(n_nz): dot += data[jj] * data[jj] sqnorms[i] = dot @@ -96,7 +96,7 @@ cdef double _pred(double* data, cdef int j, jj cdef double dot = 0 - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] dot += w[j] * data[jj] @@ -249,12 +249,12 @@ def _prox_sdca_fit(self, regul = 0 t = 0 - for it in xrange(max_iter): + for it in range(max_iter): primal = 0 rng.shuffle(sindices) - for ii in xrange(n_samples): + for ii in range(n_samples): i = sindices[ii] @@ -275,7 +275,7 @@ def _prox_sdca_fit(self, t += 1 - # end for ii in xrange(n_samples) + # end for ii in range(n_samples) gap = (primal - dual) / n_samples + alpha2 * regul gap = fabs(gap) @@ -288,7 +288,7 @@ def _prox_sdca_fit(self, print("Converged") break - # for it in xrange(max_iter) + # for it in range(max_iter) - for i in xrange(n_samples): + for i in range(n_samples): dcoef[i] *= scale diff --git a/lightning/impl/sgd.py b/lightning/impl/sgd.py index 023bf7d4..3457348d 100644 --- a/lightning/impl/sgd.py +++ b/lightning/impl/sgd.py @@ -16,7 +16,6 @@ from sklearn.utils import check_random_state from sklearn.utils.extmath import safe_sparse_dot from sklearn.utils.validation import assert_all_finite -from six.moves import xrange from .base import BaseClassifier from .base import BaseRegressor @@ -207,7 +206,7 @@ def fit(self, X, y): if n_vectors == 1 or not self.multiclass: Y = np.asfortranarray(self.label_binarizer_.fit_transform(y), dtype=np.float64) - for i in xrange(n_vectors): + for i in range(n_vectors): _binary_sgd(self, self.coef_, self.intercept_, i, ds, Y[:, i], loss, penalty, @@ -362,7 +361,7 @@ def fit(self, X, y): loss = self._get_loss() penalty = self._get_penalty() - for k in xrange(n_vectors): + for k in range(n_vectors): _binary_sgd(self, self.coef_, self.intercept_, k, ds, Y[:, k], loss, penalty, self.alpha, diff --git a/lightning/impl/svrg.py b/lightning/impl/svrg.py index 1c7e3a14..e3a627bf 100644 --- a/lightning/impl/svrg.py +++ b/lightning/impl/svrg.py @@ -3,8 +3,6 @@ import numpy as np -from six.moves import xrange - from .base import BaseClassifier, BaseRegressor from .dataset_fast import get_dataset from .svrg_fast import _svrg_fit @@ -35,7 +33,7 @@ def _fit(self, X, Y): grad = np.zeros((n_vectors, n_samples), dtype=np.float64) self.coef_scale_ = np.ones(n_vectors, dtype=np.float64) - for i in xrange(n_vectors): + for i in range(n_vectors): y = Y[:, i] _svrg_fit(self, ds, y, self.coef_[i], self.coef_scale_[i:], diff --git a/lightning/impl/svrg_fast.pyx b/lightning/impl/svrg_fast.pyx index 99219696..20ba3962 100644 --- a/lightning/impl/svrg_fast.pyx +++ b/lightning/impl/svrg_fast.pyx @@ -27,7 +27,7 @@ cdef double _pred(double* data, cdef int j, jj cdef double dot = 0 - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] dot += w[j] * data[jj] @@ -41,7 
+41,7 @@ cdef void _add(double* data, double* w): cdef int jj, j - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] w[j] += scale * data[jj] @@ -86,14 +86,14 @@ def _svrg_fit(self, cdef double* fg = full_grad.data cdef double* g = grad.data - for it in xrange(max_iter): + for it in range(max_iter): # Reset full gradient - for j in xrange(n_features): + for j in range(n_features): fg[j] = 0 # Compute full gradient. - for i in xrange(n_samples): + for i in range(n_samples): # Retrieve sample i. X.get_row_ptr(i, &indices, &data, &n_nz) @@ -109,7 +109,7 @@ def _svrg_fit(self, # Compute optimality violation. violation = 0 alpha_scaled = alpha * w_scale[0] - for j in xrange(n_features): + for j in range(n_features): tmp = fg[j] / n_samples + alpha_scaled * w[j] violation += tmp * tmp violation = sqrt(violation) @@ -129,7 +129,7 @@ def _svrg_fit(self, break # Inner loop. - for t in xrange(n_inner): + for t in range(n_inner): i = rng.randint(n_samples - 1) # Retrieve sample i. @@ -137,7 +137,7 @@ def _svrg_fit(self, # Add deterministic part, just in time. if t > 0: - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] w[j] -= eta_avg / w_scale[0] * (t - last[j]) * fg[j] last[j] = t @@ -151,7 +151,7 @@ def _svrg_fit(self, w_scale[0] *= (1 - eta_alpha) # Add deterministic part. - #for j in xrange(n_features): + #for j in range(n_features): #w[j] -= eta_avg / w_scale * fg[j] # Add stochastic part. @@ -159,12 +159,12 @@ def _svrg_fit(self, # Take care of possible underflows. if w_scale[0] < 1e-9: - for j in xrange(n_features): + for j in range(n_features): w[j] *= w_scale[0] w_scale[0] = 1.0 # Finalize. - for j in xrange(n_features): + for j in range(n_features): w[j] -= eta_avg / w_scale[0] * (n_inner - last[j]) * fg[j] last[j] = 0 @@ -175,6 +175,6 @@ def _svrg_fit(self, break # Rescale coefficients. 
- for j in xrange(n_features): + for j in range(n_features): w[j] *= w_scale[0] w_scale[0] = 1.0 diff --git a/lightning/impl/tests/test_dataset.py b/lightning/impl/tests/test_dataset.py index fd837543..69cc29ea 100644 --- a/lightning/impl/tests/test_dataset.py +++ b/lightning/impl/tests/test_dataset.py @@ -2,8 +2,6 @@ import numpy as np import scipy.sparse as sp -from six.moves import xrange - from sklearn.datasets import make_classification from sklearn.utils import check_random_state @@ -33,7 +31,7 @@ def test_contiguous_get_row(): ind = np.arange(X.shape[1]) - for i in xrange(X.shape[0]): + for i in range(X.shape[0]): indices, data, n_nz = cds.get_row(i) np.testing.assert_array_equal(indices, ind) np.testing.assert_array_equal(data, X[i]) @@ -41,16 +39,16 @@ def test_contiguous_get_row(): def test_csr_get_row(): - for i in xrange(X.shape[0]): + for i in range(X.shape[0]): indices, data, n_nz = csr_ds.get_row(i) - for jj in xrange(n_nz): + for jj in range(n_nz): j = indices[jj] assert X[i, j] == data[jj] def test_fortran_get_column(): ind = np.arange(X.shape[0]) - for j in xrange(X.shape[1]): + for j in range(X.shape[1]): indices, data, n_nz = fds.get_column(j) np.testing.assert_array_equal(indices, ind) np.testing.assert_array_equal(data, X[:, j]) @@ -58,9 +56,9 @@ def test_fortran_get_column(): def test_csc_get_column(): - for j in xrange(X.shape[1]): + for j in range(X.shape[1]): indices, data, n_nz = csc_ds.get_column(j) - for ii in xrange(n_nz): + for ii in range(n_nz): i = indices[ii] assert X[i, j] == data[ii] diff --git a/lightning/impl/tests/test_dual_cd.py b/lightning/impl/tests/test_dual_cd.py index e873d029..20f6a70e 100644 --- a/lightning/impl/tests/test_dual_cd.py +++ b/lightning/impl/tests/test_dual_cd.py @@ -3,7 +3,6 @@ from sklearn.metrics.pairwise import linear_kernel from sklearn.datasets import make_regression -from six.moves import xrange from lightning.impl.datasets.samples_generator import make_classification from lightning.impl.dual_cd import LinearSVC @@ -31,8 +30,8 @@ def test_sparse_dot(): K2 = np.zeros_like(K) ds = get_dataset(data) - for i in xrange(data.shape[0]): - for j in xrange(i, data.shape[0]): + for i in range(data.shape[0]): + for j in range(i, data.shape[0]): K2[i, j] = sparse_dot(ds, i, j) K2[j, i] = K[i, j] diff --git a/lightning/impl/tests/test_penalty.py b/lightning/impl/tests/test_penalty.py index 5ce66eb2..592ae956 100644 --- a/lightning/impl/tests/test_penalty.py +++ b/lightning/impl/tests/test_penalty.py @@ -1,7 +1,5 @@ import numpy as np -from six.moves import xrange - from lightning.impl.penalty import project_l1_ball, project_simplex @@ -10,7 +8,7 @@ def project_simplex_bisection(v, z=1, tau=0.0001, max_iter=1000): upper = np.max(v) current = np.inf - for it in xrange(max_iter): + for it in range(max_iter): if np.abs(current) / z < tau and current < 0: break diff --git a/lightning/impl/tests/test_primal_cd.py b/lightning/impl/tests/test_primal_cd.py index 51037106..bc317bc7 100644 --- a/lightning/impl/tests/test_primal_cd.py +++ b/lightning/impl/tests/test_primal_cd.py @@ -3,7 +3,6 @@ from sklearn.datasets import load_digits from sklearn.preprocessing import LabelBinarizer -from six.moves import xrange from lightning.impl.datasets.samples_generator import make_classification from lightning.impl.primal_cd import CDClassifier, CDRegressor @@ -30,7 +29,7 @@ def test_fit_linear_binary_l1r(): np.testing.assert_almost_equal(acc, 1.0) n_nz = clf.n_nonzero() perc = clf.n_nonzero(percentage=True) - assert perc == float(n_nz) / bin_dense.shape[1] + 
assert perc == n_nz / bin_dense.shape[1] clf = CDClassifier(C=0.1, random_state=0, penalty="l1") clf.fit(bin_dense, bin_target) @@ -38,7 +37,7 @@ def test_fit_linear_binary_l1r(): np.testing.assert_almost_equal(acc, 0.97) n_nz2 = clf.n_nonzero() perc2 = clf.n_nonzero(percentage=True) - assert perc2 == float(n_nz2) / bin_dense.shape[1] + assert perc2 == n_nz2 / bin_dense.shape[1] assert n_nz > n_nz2 @@ -236,11 +235,11 @@ def test_l1l2_multiclass_log_loss(): clf.fit(data, mult_target) np.testing.assert_almost_equal(clf.score(data, mult_target), 0.8766, 3) df = clf.decision_function(data) - sel = np.array([df[i, int(mult_target[i])] for i in xrange(df.shape[0])]) + sel = np.array([df[i, int(mult_target[i])] for i in range(df.shape[0])]) df -= sel[:, np.newaxis] df = np.exp(df) np.testing.assert_array_almost_equal(clf.errors_, df.T) - for i in xrange(data.shape[0]): + for i in range(data.shape[0]): np.testing.assert_almost_equal(clf.errors_[mult_target[i], i], 1.0) nz = np.sum(clf.coef_ != 0) assert nz == 297 @@ -275,8 +274,8 @@ def test_l1l2_multiclass_squared_hinge_loss(): df = clf.decision_function(data) n_samples, n_vectors = df.shape diff = np.zeros_like(clf.errors_) - for i in xrange(n_samples): - for k in xrange(n_vectors): + for i in range(n_samples): + for k in range(n_vectors): diff[k, i] = 1 - (df[i, mult_target[i]] - df[i, k]) np.testing.assert_array_almost_equal(clf.errors_, diff) assert np.sum(clf.coef_ != 0) == 300 @@ -301,8 +300,8 @@ def test_l1l2_multiclass_squared_hinge_loss_no_linesearch(): df = clf.decision_function(data) n_samples, n_vectors = df.shape diff = np.zeros_like(clf.errors_) - for i in xrange(n_samples): - for k in xrange(n_vectors): + for i in range(n_samples): + for k in range(n_vectors): diff[k, i] = 1 - (df[i, mult_target[i]] - df[i, k]) np.testing.assert_array_almost_equal(clf.errors_, diff) assert np.sum(clf.coef_ != 0) == 300 diff --git a/lightning/impl/tests/test_sag.py b/lightning/impl/tests/test_sag.py index 7e8e39cc..9bf9daf9 100644 --- a/lightning/impl/tests/test_sag.py +++ b/lightning/impl/tests/test_sag.py @@ -181,7 +181,7 @@ class PySAGAClassifier(PySAGClassifier): def __init__(self, eta, alpha=1.0, beta=0.0, loss="smooth_hinge", penalty='l2', gamma=1.0, max_iter=100, random_state=None, callback=None): - super(PySAGAClassifier, self).__init__( + super().__init__( eta=eta, alpha=alpha, beta=beta, loss=loss, penalty=penalty, gamma=gamma, max_iter=max_iter, random_state=random_state, callback=callback) diff --git a/setup.py b/setup.py index 9969d528..afc6198f 100644 --- a/setup.py +++ b/setup.py @@ -9,15 +9,17 @@ DISTNAME = 'sklearn-contrib-lightning' -DESCRIPTION = "Large-scale sparse linear classification, " + \ - "regression and ranking in Python" -LONG_DESCRIPTION = open('README.rst').read() +DESCRIPTION = ("Large-scale sparse linear classification, " + "regression and ranking in Python") +with open('README.rst', encoding='utf-8') as f: + LONG_DESCRIPTION = f.read() MAINTAINER = 'Mathieu Blondel' MAINTAINER_EMAIL = 'mathieu@mblondel.org' URL = 'https://github.com/scikit-learn-contrib/lightning' LICENSE = 'new BSD' DOWNLOAD_URL = 'https://github.com/scikit-learn-contrib/lightning' VERSION = '0.6.1dev' +MIN_PYTHON_VERSION = '3.6' def configuration(parent_package='', top_path=None): @@ -42,8 +44,8 @@ def configuration(parent_package='', top_path=None): setup(configuration=configuration, name=DISTNAME, maintainer=MAINTAINER, + python_requires='>={}'.format(MIN_PYTHON_VERSION), install_requires=[ - 'six', 'scikit-learn' ], 
include_package_data=True,
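
---
Reviewer notes (illustrative sketches, not part of the patch):

1. The same handful of Python 2/3 compatibility idioms is removed
   throughout: six.moves.xrange, the Python 2 spelling of super(), the
   StringIO/cPickle/urllib2 import fallbacks, the execfile shim, and
   float() coercions for division. A minimal sketch of the replacement
   patterns the patch standardizes on; the names Base, Child, and
   read_mean are made up purely for illustration:

    import pickle                        # was: import cPickle as pickle
    from io import StringIO             # was: from StringIO import StringIO
    from urllib.request import urlopen  # was: urllib2.urlopen

    class Base:
        def name(self):
            return "base"

    class Child(Base):
        def name(self):
            # was: super(Child, self).name() -- Python 3 allows the bare form
            return super().name() + "/child"

    def execfile(filename, global_vars=None, local_vars=None):
        # Python 3 replacement for the removed builtin, as defined in
        # gen_rst.py above: read with an explicit encoding, compile, exec.
        with open(filename, encoding="utf-8") as f:
            code = compile(f.read(), filename, "exec")
        exec(code, global_vars, local_vars)

    def read_mean(values):
        # range() is a lazy sequence on Python 3 (no xrange indirection)
        # and "/" is true division (no float() coercion needed).
        total = 0.0
        for i in range(len(values)):
            total += values[i]
        return total / len(values)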
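2. setup.py now declares the new interpreter floor via python_requires,
   so pip (9.0+) on Python 2 refuses the install up front instead of the
   package failing at import time. A standalone toy example; the
   metadata values are placeholders, not lightning's real configuration:

    from setuptools import setup

    setup(
        name="example-package",             # placeholder
        version="0.1",                      # placeholder
        python_requires=">=3.6",            # pip enforces this at install time
        install_requires=["scikit-learn"],  # 'six' is no longer needed
    )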
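3. One caution on build_tools/move-conda-package.py: the patch adds an
   explicit encoding to open() but keeps bare yaml.load(), which PyYAML
   5.1+ deprecates without an explicit Loader. If that script is touched
   again, a safer spelling would be (a suggestion, not part of this
   patch):

    import yaml

    with open("meta.yaml", encoding="utf-8") as f:
        meta = yaml.safe_load(f)  # same as yaml.load(f, Loader=yaml.SafeLoader)
    name = meta["package"]["name"]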