From 3010e757fe2773bd70b61a6eafff98f981f742ba Mon Sep 17 00:00:00 2001 From: dheim Date: Tue, 17 Dec 2019 16:05:56 +0900 Subject: [PATCH 1/7] did first fix in pca.py, did not run on cluster --- dask_ml/decomposition/pca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dask_ml/decomposition/pca.py b/dask_ml/decomposition/pca.py index 19346b454..d959ead36 100644 --- a/dask_ml/decomposition/pca.py +++ b/dask_ml/decomposition/pca.py @@ -277,7 +277,8 @@ def _fit(self, X): U, S, V = da.linalg.svd_compressed( X, n_components, n_power_iter=n_power_iter, seed=seed ) - U, V = svd_flip(U, V) + V_copy = np.copy(V) + U, V = svd_flip(U, V_copy) explained_variance = (S ** 2) / (n_samples - 1) components, singular_values = V, S From 08955fbdbcd012825231d8bb09728f8ec04a0f3d Mon Sep 17 00:00:00 2001 From: dheim Date: Thu, 19 Dec 2019 11:33:30 +0900 Subject: [PATCH 2/7] did another pca fix, ran successfully on cluster --- dask_ml/decomposition/pca.py | 5 ++--- dask_ml/utils.py | 37 ++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/dask_ml/decomposition/pca.py b/dask_ml/decomposition/pca.py index d959ead36..abab6fe0c 100644 --- a/dask_ml/decomposition/pca.py +++ b/dask_ml/decomposition/pca.py @@ -13,7 +13,7 @@ from .._compat import check_is_fitted from .._utils import draw_seed -from ..utils import svd_flip +from ..utils import svd_flip_new _TYPE_MSG = ( "Got an unsupported type ({}). Dask-ML's PCA only support Dask Arrays or " @@ -277,8 +277,7 @@ def _fit(self, X): U, S, V = da.linalg.svd_compressed( X, n_components, n_power_iter=n_power_iter, seed=seed ) - V_copy = np.copy(V) - U, V = svd_flip(U, V_copy) + U, V = svd_flip_new(U, V) explained_variance = (S ** 2) / (n_samples - 1) components, singular_values = V, S diff --git a/dask_ml/utils.py b/dask_ml/utils.py index 2a5ef9d6e..b59e95533 100644 --- a/dask_ml/utils.py +++ b/dask_ml/utils.py @@ -23,6 +23,43 @@ logger = logging.getLogger() +def svd_flip_fixed(u, v, u_based_decision=True): + """ + This is a replicate of svd_flip() from + miniconda3/lib/python3.7/site-packages/sklearn/utils/extmath.py + To avoid error "ValueError: output array is read-only", it is changed + as described by comments in code + """ + if u_based_decision: + max_abs_cols = np.argmax(np.abs(u), axis=0) + signs = np.sign(u[max_abs_cols, range(u.shape[1])]) + u *= signs + # DEBUG: replaced: + # v *= signs[:, np.newaxis] + # by : + v_copy = np.copy(v) + v_copy *= signs[:, np.newaxis] + return u, v_copy + # DEBUG end + else: + max_abs_rows = np.argmax(np.abs(v), axis=1) + signs = np.sign(v[range(v.shape[0]), max_abs_rows]) + u *= signs + v *= signs[:, np.newaxis] + return u, v + + +def svd_flip_new(u, v): + """ + This is a replicate of svd_flip() which calls svd_flip_fixed() + instead of skm.svd_flip() + """ + u2, v2 = delayed(svd_flip_fixed, nout=2)(u, v) + u = da.from_delayed(u2, shape=u.shape, dtype=u.dtype) + v = da.from_delayed(v2, shape=v.shape, dtype=v.dtype) + return u, v + + def svd_flip(u, v): u2, v2 = delayed(skm.svd_flip, nout=2)(u, v) u = da.from_delayed(u2, shape=u.shape, dtype=u.dtype) From 5ff97789525f43c4b76975951b8170a8d8c92968 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Tue, 24 Dec 2019 07:49:58 +0900 Subject: [PATCH 3/7] Using try/catch statement --- dask_ml/decomposition/pca.py | 4 ++-- dask_ml/utils.py | 34 +++++++++++++--------------------- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/dask_ml/decomposition/pca.py b/dask_ml/decomposition/pca.py index abab6fe0c..19346b454 100644 --- a/dask_ml/decomposition/pca.py +++ b/dask_ml/decomposition/pca.py @@ -13,7 +13,7 @@ from .._compat import check_is_fitted from .._utils import draw_seed -from ..utils import svd_flip_new +from ..utils import svd_flip _TYPE_MSG = ( "Got an unsupported type ({}). Dask-ML's PCA only support Dask Arrays or " @@ -277,7 +277,7 @@ def _fit(self, X): U, S, V = da.linalg.svd_compressed( X, n_components, n_power_iter=n_power_iter, seed=seed ) - U, V = svd_flip_new(U, V) + U, V = svd_flip(U, V) explained_variance = (S ** 2) / (n_samples - 1) components, singular_values = V, S diff --git a/dask_ml/utils.py b/dask_ml/utils.py index b59e95533..b0ea72e12 100644 --- a/dask_ml/utils.py +++ b/dask_ml/utils.py @@ -23,10 +23,10 @@ logger = logging.getLogger() -def svd_flip_fixed(u, v, u_based_decision=True): +def svd_flip_with_copy(u, v, u_based_decision=True): """ This is a replicate of svd_flip() from - miniconda3/lib/python3.7/site-packages/sklearn/utils/extmath.py + sklearn/utils/extmath.py To avoid error "ValueError: output array is read-only", it is changed as described by comments in code """ @@ -34,13 +34,9 @@ def svd_flip_fixed(u, v, u_based_decision=True): max_abs_cols = np.argmax(np.abs(u), axis=0) signs = np.sign(u[max_abs_cols, range(u.shape[1])]) u *= signs - # DEBUG: replaced: - # v *= signs[:, np.newaxis] - # by : - v_copy = np.copy(v) - v_copy *= signs[:, np.newaxis] - return u, v_copy - # DEBUG end + # replaced: + # v *= signs[:, np.newaxis] by : + v = signs[:, np.newaxis] * v else: max_abs_rows = np.argmax(np.abs(v), axis=1) signs = np.sign(v[range(v.shape[0]), max_abs_rows]) @@ -49,19 +45,15 @@ def svd_flip_fixed(u, v, u_based_decision=True): return u, v -def svd_flip_new(u, v): - """ - This is a replicate of svd_flip() which calls svd_flip_fixed() - instead of skm.svd_flip() - """ - u2, v2 = delayed(svd_flip_fixed, nout=2)(u, v) - u = da.from_delayed(u2, shape=u.shape, dtype=u.dtype) - v = da.from_delayed(v2, shape=v.shape, dtype=v.dtype) - return u, v - - def svd_flip(u, v): - u2, v2 = delayed(skm.svd_flip, nout=2)(u, v) + try: + # If the array is locked, copy the array and transpose it + # This happens with a very large array > 1TB + # GH: issue 592 + u2, v2 = delayed(skm.svd_flip, nout=2)(u, v) + except ValueError: + u2, v2 = delayed(svd_flip_with_copy, nout=2)(u, v) + u = da.from_delayed(u2, shape=u.shape, dtype=u.dtype) v = da.from_delayed(v2, shape=v.shape, dtype=v.dtype) return u, v From 57749ecd538047a52bb200b5daf9deb7f3248a65 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Tue, 31 Dec 2019 01:00:53 +0900 Subject: [PATCH 4/7] Adding a test but does not fail... --- dask_ml/utils.py | 25 ++++--------------------- tests/test_pca.py | 24 +++++++++++++++++++++++- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/dask_ml/utils.py b/dask_ml/utils.py index b0ea72e12..c0983d76c 100644 --- a/dask_ml/utils.py +++ b/dask_ml/utils.py @@ -23,26 +23,8 @@ logger = logging.getLogger() -def svd_flip_with_copy(u, v, u_based_decision=True): - """ - This is a replicate of svd_flip() from - sklearn/utils/extmath.py - To avoid error "ValueError: output array is read-only", it is changed - as described by comments in code - """ - if u_based_decision: - max_abs_cols = np.argmax(np.abs(u), axis=0) - signs = np.sign(u[max_abs_cols, range(u.shape[1])]) - u *= signs - # replaced: - # v *= signs[:, np.newaxis] by : - v = signs[:, np.newaxis] * v - else: - max_abs_rows = np.argmax(np.abs(v), axis=1) - signs = np.sign(v[range(v.shape[0]), max_abs_rows]) - u *= signs - v *= signs[:, np.newaxis] - return u, v +def _svd_flip_copy(x, y): + return skm.svd_flip(x.copy(), y.copy()) def svd_flip(u, v): @@ -52,7 +34,8 @@ def svd_flip(u, v): # GH: issue 592 u2, v2 = delayed(skm.svd_flip, nout=2)(u, v) except ValueError: - u2, v2 = delayed(svd_flip_with_copy, nout=2)(u, v) + raise ValueError + u2, v2 = delayed(_svd_flip_copy, nout=2)(u, v) u = da.from_delayed(u2, shape=u.shape, dtype=u.dtype) v = da.from_delayed(v2, shape=v.shape, dtype=v.dtype) diff --git a/tests/test_pca.py b/tests/test_pca.py index c55497e45..94c326550 100644 --- a/tests/test_pca.py +++ b/tests/test_pca.py @@ -11,10 +11,11 @@ from dask.array.utils import assert_eq from sklearn import datasets from sklearn.utils import check_random_state as sk_check_random_state +import sklearn.utils.extmath as skm import dask_ml.decomposition as dd from dask_ml.decomposition._compat import _assess_dimension_, _infer_dimension_ -from dask_ml.utils import assert_estimator_equal +from dask_ml.utils import assert_estimator_equal, svd_flip with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) @@ -782,3 +783,24 @@ def test_pca_sklearn_inputs(input_type, solver): a.fit(Y) with pytest.raises(TypeError, match="unsupported type"): a.fit_transform(Y) + + +@pytest.mark.parametrize("seed", [0, 1]) +def test_svd_flip(seed): + rng = np.random.RandomState(seed) + u = rng.randn(8, 3) + v = rng.randn(3, 10) + u = da.from_array(u, chunks=(2, 1)) + v = da.from_array(v, chunks=(3, 1)) + + def set_readonly(x): + x.setflags(write=False) + return x + + u = u.map_blocks(set_readonly) + v = v.map_blocks(set_readonly) + # should no error + u, v = svd_flip(u, v) + u2, v2 = skm.svd_flip(u.compute(), v.compute()) + assert_eq(u, u2) + assert_eq(v, v2) \ No newline at end of file From 905a0b389ba368e8b6674ab4def3c657f389da74 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Tue, 31 Dec 2019 01:15:38 +0900 Subject: [PATCH 5/7] Added a test --- dask_ml/utils.py | 15 +++++---------- tests/test_pca.py | 12 +++++------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/dask_ml/utils.py b/dask_ml/utils.py index c0983d76c..cdbd68564 100644 --- a/dask_ml/utils.py +++ b/dask_ml/utils.py @@ -24,19 +24,14 @@ def _svd_flip_copy(x, y): - return skm.svd_flip(x.copy(), y.copy()) + try: + return skm.svd_flip(x, y) + except ValueError: + return skm.svd_flip(x.copy(), y.copy()) def svd_flip(u, v): - try: - # If the array is locked, copy the array and transpose it - # This happens with a very large array > 1TB - # GH: issue 592 - u2, v2 = delayed(skm.svd_flip, nout=2)(u, v) - except ValueError: - raise ValueError - u2, v2 = delayed(_svd_flip_copy, nout=2)(u, v) - + u2, v2 = delayed(_svd_flip_copy, nout=2)(u, v) u = da.from_delayed(u2, shape=u.shape, dtype=u.dtype) v = da.from_delayed(v2, shape=v.shape, dtype=v.dtype) return u, v diff --git a/tests/test_pca.py b/tests/test_pca.py index 94c326550..286c9f904 100644 --- a/tests/test_pca.py +++ b/tests/test_pca.py @@ -785,13 +785,13 @@ def test_pca_sklearn_inputs(input_type, solver): a.fit_transform(Y) -@pytest.mark.parametrize("seed", [0, 1]) -def test_svd_flip(seed): - rng = np.random.RandomState(seed) +def test_svd_flip(): + rng = np.random.RandomState(0) u = rng.randn(8, 3) v = rng.randn(3, 10) - u = da.from_array(u, chunks=(2, 1)) - v = da.from_array(v, chunks=(3, 1)) + u = da.from_array(u, chunks=(-1, -1)) + v = da.from_array(v, chunks=(-1, -1)) + u2, v2 = svd_flip(u, v) def set_readonly(x): x.setflags(write=False) @@ -799,8 +799,6 @@ def set_readonly(x): u = u.map_blocks(set_readonly) v = v.map_blocks(set_readonly) - # should no error u, v = svd_flip(u, v) - u2, v2 = skm.svd_flip(u.compute(), v.compute()) assert_eq(u, u2) assert_eq(v, v2) \ No newline at end of file From 4cbffacc0a861edd41e47b0b308075d2421c45f6 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Tue, 31 Dec 2019 01:17:50 +0900 Subject: [PATCH 6/7] Added a change log --- dask_ml/utils.py | 3 +++ docs/source/changelog.rst | 1 + 2 files changed, 4 insertions(+) diff --git a/dask_ml/utils.py b/dask_ml/utils.py index cdbd68564..825f7a17a 100644 --- a/dask_ml/utils.py +++ b/dask_ml/utils.py @@ -24,6 +24,9 @@ def _svd_flip_copy(x, y): + # If the array is locked, copy the array and transpose it + # This happens with a very large array > 1TB + # GH: issue 592 try: return skm.svd_flip(x, y) except ValueError: diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 6ec71e274..2ce056551 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -5,6 +5,7 @@ Version 1.1.1 ~~~~~~~~~~~~~ - Fixed an issue with the 1.1.0 wheel (:issue:`575`) +- Make svd_flip work even when arrays are read only (:issue:`592`) Version 1.1.0 ~~~~~~~~~~~~~ From 6d471e1b2ed064b250d5557f75a9fed2db310ca5 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Tue, 31 Dec 2019 01:19:55 +0900 Subject: [PATCH 7/7] a few cleanup --- tests/test_pca.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_pca.py b/tests/test_pca.py index 286c9f904..444d13dad 100644 --- a/tests/test_pca.py +++ b/tests/test_pca.py @@ -11,7 +11,6 @@ from dask.array.utils import assert_eq from sklearn import datasets from sklearn.utils import check_random_state as sk_check_random_state -import sklearn.utils.extmath as skm import dask_ml.decomposition as dd from dask_ml.decomposition._compat import _assess_dimension_, _infer_dimension_ @@ -801,4 +800,4 @@ def set_readonly(x): v = v.map_blocks(set_readonly) u, v = svd_flip(u, v) assert_eq(u, u2) - assert_eq(v, v2) \ No newline at end of file + assert_eq(v, v2)