From 49438711cb840c2719c6f5d41e738117b5152152 Mon Sep 17 00:00:00 2001 From: Sam Prokopchuk Date: Tue, 26 Jul 2022 21:01:40 -0400 Subject: [PATCH 1/6] Add new test case --- skillsnetwork/core.py | 40 +++++++++++++++++++++++-------------- tests/test_skillsnetwork.py | 18 +++++++++++++++++ 2 files changed, 43 insertions(+), 15 deletions(-) diff --git a/skillsnetwork/core.py b/skillsnetwork/core.py index b513e18..59ff0bf 100644 --- a/skillsnetwork/core.py +++ b/skillsnetwork/core.py @@ -114,25 +114,33 @@ async def _get_chunks(url: str, chunk_size: int) -> Generator[bytes, None, None] raise Exception(f"Failed to read dataset at {url}") from None +def _rmrf(path: Path) -> None: + if path.is_dir(): + shutil.rmtree(path) + else: + path.unlink() + + def _verify_files_dont_exist( - paths: Iterable[Union[str, Path]], remove_if_exist: bool = False + paths: Iterable[Path], remove_if_exist: bool = False ) -> None: """ Verifies all paths in 'paths' don't exist. :param paths: A iterable of strs or pathlib.Paths. :param remove_if_exist=False: Removes file at path if they already exist. :returns: None - :raises FileExistsError: On the first path found that already exists. + :raises FileExistsError: On the first path found that already exists if remove_if_exist is False. """ for path in paths: - path = Path(path) - if path.exists(): + # Could be a broken symlink => path.exists() is False + if path.exists() or path.is_symlink(): if remove_if_exist: - if path.is_symlink(): - realpath = path.resolve() - path.unlink(realpath) - else: - shutil.rmtree(path) + while path.is_symlink(): + temp = path.readlink() + path.unlink(missing_ok=True) + path = temp + if path.exists(): + _rmrf(path) else: raise FileExistsError(f"Error: File '{path}' already exists.") @@ -254,9 +262,9 @@ async def prepare( path / child.name for child in map(Path, tf.getnames()) if len(child.parents) == 1 and _is_file_to_symlink(child) - ], - overwrite, - ) # Only check if top-level fileobject + ], # Only check if top-level fileobject + remove_if_exist=overwrite, + ) pbar = tqdm(iterable=tf.getmembers(), total=len(tf.getmembers())) pbar.set_description(f"Extracting {filename}") for member in pbar: @@ -269,8 +277,8 @@ async def prepare( path / child.name for child in map(Path, zf.namelist()) if len(child.parents) == 1 and _is_file_to_symlink(child) - ], - overwrite, + ], # Only check if top-level fileobject + remove_if_exist=overwrite, ) pbar = tqdm(iterable=zf.infolist(), total=len(zf.infolist())) pbar.set_description(f"Extracting {filename}") @@ -278,13 +286,15 @@ async def prepare( zf.extract(member=member, path=extract_dir) tmp_download_file.unlink() else: - _verify_files_dont_exist([path / filename], overwrite) + _verify_files_dont_exist([path / filename], remove_if_exist=overwrite) shutil.move(tmp_download_file, extract_dir / filename) # If in jupyterlite environment, the extract_dir = path, so the files are already there. if not _is_jupyterlite(): # If not in jupyterlite environment, symlink top-level file objects in extract_dir for child in filter(_is_file_to_symlink, extract_dir.iterdir()): + if (path / child.name).is_symlink() and overwrite: + (path / child.name).unlink() (path / child.name).symlink_to(child, target_is_directory=child.is_dir()) if verbose: diff --git a/tests/test_skillsnetwork.py b/tests/test_skillsnetwork.py index 90312c4..e9dcd2d 100644 --- a/tests/test_skillsnetwork.py +++ b/tests/test_skillsnetwork.py @@ -134,3 +134,21 @@ async def test_prepare_non_compressed_dataset_with_path(httpserver): await skillsnetwork.prepare_dataset(httpserver.url_for(url), path=path) assert expected_path.exists() expected_path.unlink() + + +@pytest.mark.asyncio +async def test_prepare_non_compressed_dataset_with_overwrite(httpserver): + url = "/test.csv" + expected_path = Path("./test.csv") + with open("tests/test.csv", "rb") as expected_data: + httpserver.expect_request(url).respond_with_data(expected_data) + await skillsnetwork.prepare_dataset(httpserver.url_for(url), overwrite=True) + assert expected_path.exists() + httpserver.clear() + print(expected_path.absolute(), expected_path.absolute().exists()) + with open("tests/test.csv", "rb") as expected_data: + httpserver.expect_request(url).respond_with_data(expected_data) + await skillsnetwork.prepare_dataset(httpserver.url_for(url), overwrite=True) + assert expected_path.exists() + assert Path(expected_path).stat().st_size == 540 + expected_path.unlink() From c0b0062457930239dc91d20cbec11bc36dd8072e Mon Sep 17 00:00:00 2001 From: Sam Prokopchuk Date: Tue, 26 Jul 2022 21:03:29 -0400 Subject: [PATCH 2/6] Docstring typo fix --- skillsnetwork/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skillsnetwork/core.py b/skillsnetwork/core.py index 59ff0bf..3f21190 100644 --- a/skillsnetwork/core.py +++ b/skillsnetwork/core.py @@ -126,7 +126,7 @@ def _verify_files_dont_exist( ) -> None: """ Verifies all paths in 'paths' don't exist. - :param paths: A iterable of strs or pathlib.Paths. + :param paths: A iterable of pathlib.Paths. :param remove_if_exist=False: Removes file at path if they already exist. :returns: None :raises FileExistsError: On the first path found that already exists if remove_if_exist is False. From a14a4df794c7b65f42b5cfefb79c51148c3b4b6b Mon Sep 17 00:00:00 2001 From: Sam Prokopchuk Date: Wed, 27 Jul 2022 09:50:24 -0400 Subject: [PATCH 3/6] Add new test case --- tests/test_skillsnetwork.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/tests/test_skillsnetwork.py b/tests/test_skillsnetwork.py index e9dcd2d..4fb064c 100644 --- a/tests/test_skillsnetwork.py +++ b/tests/test_skillsnetwork.py @@ -137,18 +137,45 @@ async def test_prepare_non_compressed_dataset_with_path(httpserver): @pytest.mark.asyncio -async def test_prepare_non_compressed_dataset_with_overwrite(httpserver): +async def test_prepare_non_compressed_dataset_no_path_with_overwrite(httpserver): url = "/test.csv" expected_path = Path("./test.csv") with open("tests/test.csv", "rb") as expected_data: httpserver.expect_request(url).respond_with_data(expected_data) - await skillsnetwork.prepare_dataset(httpserver.url_for(url), overwrite=True) + await skillsnetwork.prepare_dataset(httpserver.url_for(url)) assert expected_path.exists() httpserver.clear() - print(expected_path.absolute(), expected_path.absolute().exists()) with open("tests/test.csv", "rb") as expected_data: httpserver.expect_request(url).respond_with_data(expected_data) await skillsnetwork.prepare_dataset(httpserver.url_for(url), overwrite=True) assert expected_path.exists() assert Path(expected_path).stat().st_size == 540 expected_path.unlink() + + +@pytest.mark.asyncio +async def test_prepare_dataset_tar_no_path_with_overwrite(httpserver): + url = "/test.tar.gz" + expected_directory = Path("test") + try: + shutil.rmtree(expected_directory) # clean up any previous test + except FileNotFoundError as e: + print(e) + pass + + with open("tests/test.tar.gz", "rb") as expected_data: + httpserver.expect_request(url).respond_with_data(expected_data) + await skillsnetwork.prepare_dataset(httpserver.url_for(url)) + + assert os.path.isdir(expected_directory) + with open(expected_directory / "1.txt") as f: + assert "I am the first test file" in f.read() + httpserver.clear() + + with open("tests/test.tar.gz", "rb") as expected_data: + httpserver.expect_request(url).respond_with_data(expected_data) + await skillsnetwork.prepare_dataset(httpserver.url_for(url), overwrite=True) + assert os.path.isdir(expected_directory) + with open(expected_directory / "1.txt") as f: + assert "I am the first test file" in f.read() + expected_directory.unlink() From 73a5b232b1f860ac2ec674a18c5de17395d50e67 Mon Sep 17 00:00:00 2001 From: Sam Prokopchuk Date: Wed, 27 Jul 2022 09:59:06 -0400 Subject: [PATCH 4/6] Add new test case --- tests/test_skillsnetwork.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_skillsnetwork.py b/tests/test_skillsnetwork.py index 4fb064c..40ec2ed 100644 --- a/tests/test_skillsnetwork.py +++ b/tests/test_skillsnetwork.py @@ -179,3 +179,31 @@ async def test_prepare_dataset_tar_no_path_with_overwrite(httpserver): with open(expected_directory / "1.txt") as f: assert "I am the first test file" in f.read() expected_directory.unlink() + + +@pytest.mark.asyncio +async def test_prepare_dataset_zip_no_path_with_overwrite(httpserver): + url = "/test.zip" + expected_directory = Path("test") + try: + shutil.rmtree(expected_directory) # clean up any previous test + except FileNotFoundError as e: + print(e) + pass + + with open("tests/test.zip", "rb") as expected_data: + httpserver.expect_request(url).respond_with_data(expected_data) + await skillsnetwork.prepare_dataset(httpserver.url_for(url)) + + assert os.path.isdir(expected_directory) + with open(expected_directory / "1.txt") as f: + assert "I am the first test file" in f.read() + httpserver.clear() + + with open("tests/test.zip", "rb") as expected_data: + httpserver.expect_request(url).respond_with_data(expected_data) + await skillsnetwork.prepare_dataset(httpserver.url_for(url), overwrite=True) + assert os.path.isdir(expected_directory) + with open(expected_directory / "1.txt") as f: + assert "I am the first test file" in f.read() + expected_directory.unlink() From 35da522c542d3b4b77f1ef153e3872ecbdf1eff0 Mon Sep 17 00:00:00 2001 From: Sam Prokopchuk Date: Wed, 27 Jul 2022 11:25:18 -0400 Subject: [PATCH 5/6] Minor typo changes --- skillsnetwork/core.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/skillsnetwork/core.py b/skillsnetwork/core.py index 3f21190..fe42671 100644 --- a/skillsnetwork/core.py +++ b/skillsnetwork/core.py @@ -127,7 +127,7 @@ def _verify_files_dont_exist( """ Verifies all paths in 'paths' don't exist. :param paths: A iterable of pathlib.Paths. - :param remove_if_exist=False: Removes file at path if they already exist. + :param remove_if_exist=False: Remove each file at each path in paths if they already exist. :returns: None :raises FileExistsError: On the first path found that already exists if remove_if_exist is False. """ @@ -239,7 +239,6 @@ async def prepare( path.mkdir(exist_ok=True) # For avoiding collisions with any other files the user may have downloaded to /tmp/ - dname = f"skills-network-{hash(url)}" # The file to extract data to. If not jupyterlite, to be symlinked to as well extract_dir = path if _is_jupyterlite() else Path(f"/tmp/{dname}") From 6f6289cc8d177a26f9bbedafa01c868b0486d323 Mon Sep 17 00:00:00 2001 From: Sam Prokopchuk Date: Wed, 27 Jul 2022 14:10:27 -0400 Subject: [PATCH 6/6] Don't remove all tracebacks, shorten exception --- skillsnetwork/core.py | 107 ++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 62 deletions(-) diff --git a/skillsnetwork/core.py b/skillsnetwork/core.py index fe42671..4b02748 100644 --- a/skillsnetwork/core.py +++ b/skillsnetwork/core.py @@ -89,7 +89,7 @@ async def _get_chunks(url: str, chunk_size: int) -> Generator[bytes, None, None] pbar.update(len(value)) pbar.close() except JsException: - raise Exception(f"Failed to read dataset at {url}") from None + raise Exception(f"Failed to read dataset at '{url}'.") from None else: import requests # pyright: ignore from requests.exceptions import ConnectionError # pyright: ignore @@ -99,7 +99,7 @@ async def _get_chunks(url: str, chunk_size: int) -> Generator[bytes, None, None] # If requests.get fails, it will return readable error if response.status_code >= 400: raise Exception( - f"received status code {response.status_code} from {url}" + f"received status code {response.status_code} from '{url}'." ) pbar = tqdm( miniters=1, @@ -111,7 +111,7 @@ async def _get_chunks(url: str, chunk_size: int) -> Generator[bytes, None, None] pbar.update(len(chunk)) pbar.close() except ConnectionError: - raise Exception(f"Failed to read dataset at {url}") from None + raise Exception(f"Failed to read dataset at '{url}'.") from None def _rmrf(path: Path) -> None: @@ -126,7 +126,7 @@ def _verify_files_dont_exist( ) -> None: """ Verifies all paths in 'paths' don't exist. - :param paths: A iterable of pathlib.Paths. + :param paths: A iterable of pathlib.Path s. :param remove_if_exist=False: Remove each file at each path in paths if they already exist. :returns: None :raises FileExistsError: On the first path found that already exists if remove_if_exist is False. @@ -232,9 +232,9 @@ async def prepare( path = Path.cwd() if path is None else Path(path) # Check if path contains /tmp if Path("/tmp") in path.parents: - raise ValueError("path must not be in /tmp") + raise ValueError("path must not be in /tmp.") elif path.is_file(): - raise ValueError("Datasets must be prepared to directories, not files") + raise ValueError("Datasets must be prepared to directories, not files.") # Create the target path if it doesn't exist yet path.mkdir(exist_ok=True) @@ -254,39 +254,45 @@ async def prepare( shutil.rmtree(extract_dir) extract_dir.mkdir() - if tarfile.is_tarfile(tmp_download_file): - with tarfile.open(tmp_download_file) as tf: - _verify_files_dont_exist( - [ - path / child.name - for child in map(Path, tf.getnames()) - if len(child.parents) == 1 and _is_file_to_symlink(child) - ], # Only check if top-level fileobject - remove_if_exist=overwrite, - ) - pbar = tqdm(iterable=tf.getmembers(), total=len(tf.getmembers())) - pbar.set_description(f"Extracting {filename}") - for member in pbar: - tf.extract(member=member, path=extract_dir) - tmp_download_file.unlink() - elif zipfile.is_zipfile(tmp_download_file): - with zipfile.ZipFile(tmp_download_file) as zf: - _verify_files_dont_exist( - [ - path / child.name - for child in map(Path, zf.namelist()) - if len(child.parents) == 1 and _is_file_to_symlink(child) - ], # Only check if top-level fileobject - remove_if_exist=overwrite, - ) - pbar = tqdm(iterable=zf.infolist(), total=len(zf.infolist())) - pbar.set_description(f"Extracting {filename}") - for member in pbar: - zf.extract(member=member, path=extract_dir) - tmp_download_file.unlink() - else: - _verify_files_dont_exist([path / filename], remove_if_exist=overwrite) - shutil.move(tmp_download_file, extract_dir / filename) + try: + if tarfile.is_tarfile(tmp_download_file): + with tarfile.open(tmp_download_file) as tf: + _verify_files_dont_exist( + [ + path / child.name + for child in map(Path, tf.getnames()) + if len(child.parents) == 1 and _is_file_to_symlink(child) + ], # Only check if top-level fileobject + remove_if_exist=overwrite, + ) + pbar = tqdm(iterable=tf.getmembers(), total=len(tf.getmembers())) + pbar.set_description(f"Extracting {filename}") + for member in pbar: + tf.extract(member=member, path=extract_dir) + tmp_download_file.unlink() + elif zipfile.is_zipfile(tmp_download_file): + with zipfile.ZipFile(tmp_download_file) as zf: + _verify_files_dont_exist( + [ + path / child.name + for child in map(Path, zf.namelist()) + if len(child.parents) == 1 and _is_file_to_symlink(child) + ], # Only check if top-level fileobject + remove_if_exist=overwrite, + ) + pbar = tqdm(iterable=zf.infolist(), total=len(zf.infolist())) + pbar.set_description(f"Extracting {filename}") + for member in pbar: + zf.extract(member=member, path=extract_dir) + tmp_download_file.unlink() + else: + _verify_files_dont_exist([path / filename], remove_if_exist=overwrite) + shutil.move(tmp_download_file, extract_dir / filename) + except FileExistsError as e: + raise FileExistsError( + str(e) + + "\nIf you want to overwrite any existing files, use prepare(..., overwrite=True)." + ) from None # If in jupyterlite environment, the extract_dir = path, so the files are already there. if not _is_jupyterlite(): @@ -304,29 +310,6 @@ def setup() -> None: if _is_jupyterlite(): tqdm.monitor_interval = 0 - try: - import sys # pyright: ignore - - ipython = get_ipython() - - def hide_traceback( - exc_tuple=None, - filename=None, - tb_offset=None, - exception_only=False, - running_compiled_code=False, - ): - etype, value, tb = sys.exc_info() - value.__cause__ = None # suppress chained exceptions - return ipython._showtraceback( - etype, value, ipython.InteractiveTB.get_exception_only(etype, value) - ) - - ipython.showtraceback = hide_traceback - - except NameError: - pass - setup()