From 47a2ad591185b920da783bad6fe1f5bfaa905f18 Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Thu, 19 Dec 2024 12:26:26 +0100 Subject: [PATCH 1/5] Fixed handling paths wit whitespace --- .../v2/processes/connectors/fsspec/fsspec.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py b/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py index 4005ba41b..3de902cb4 100644 --- a/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +++ b/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py @@ -4,6 +4,7 @@ import random import shutil import tempfile +from urllib.parse import unquote from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar @@ -60,6 +61,7 @@ class FileConfig(BaseModel): def __init__(self, **data): protocol, path_without_protocol = data["remote_url"].split("://") + path_without_protocol = unquote(path_without_protocol) data["protocol"] = protocol data["path_without_protocol"] = path_without_protocol super().__init__(**data) @@ -105,8 +107,12 @@ def precheck(self) -> None: **self.connection_config.get_access_config(), ) files = fs.ls(path=self.index_config.path_without_protocol, detail=True) + if files is None: + logger.error(f"[{CONNECTOR_TYPE}]fs.ls returned None for path: {self.index_config.path_without_protocol}") + raise SourceConnectionError("No files returned. Check if path is correct.") valid_files = [x.get("name") for x in files if x.get("type") == "file"] if not valid_files: + logger.warning(f"[{CONNECTOR_TYPE}]There was no files found at path: {self.index_config.path_without_protocol}") return file_to_sample = valid_files[0] logger.debug(f"attempting to make HEAD request for file: {file_to_sample}") From 73860024802b73c6520fd91ad1d0d70f0af0822f Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Thu, 19 Dec 2024 12:43:46 +0100 Subject: [PATCH 2/5] Linter fix Version bump --- CHANGELOG.md | 7 +++++++ unstructured_ingest/__version__.py | 2 +- .../v2/processes/connectors/fsspec/fsspec.py | 12 +++++++++--- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4797b7275..b8564b2b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## 0.3.11-dev1 + +### Enhancements + +* **fix for URL style character in fsspec url** + + ## 0.3.11-dev0 ### Enhancements diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index 312f3ed3a..99e11453f 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "0.3.11-dev0" # pragma: no cover +__version__ = "0.3.11-dev1" # pragma: no cover diff --git a/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py b/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py index 7de9c9e5e..2cd95fd40 100644 --- a/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +++ b/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py @@ -5,10 +5,10 @@ import shutil import tempfile from contextlib import contextmanager -from urllib.parse import unquote from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar +from urllib.parse import unquote from uuid import NAMESPACE_DNS, uuid5 from pydantic import BaseModel, Field, Secret @@ -110,11 +110,17 @@ def precheck(self) -> None: ) files = fs.ls(path=self.index_config.path_without_protocol, detail=True) if files is None: - logger.error(f"[{CONNECTOR_TYPE}]fs.ls returned None for path: {self.index_config.path_without_protocol}") + logger.error( + f"[{CONNECTOR_TYPE}]fs.ls returned None for path: \ + {self.index_config.path_without_protocol}" + ) raise SourceConnectionError("No files returned. Check if path is correct.") valid_files = [x.get("name") for x in files if x.get("type") == "file"] if not valid_files: - logger.warning(f"[{CONNECTOR_TYPE}]There was no files found at path: {self.index_config.path_without_protocol}") + logger.warning( + f"[{CONNECTOR_TYPE}]There was no files found at path: \ + {self.index_config.path_without_protocol}" + ) return file_to_sample = valid_files[0] logger.debug(f"attempting to make HEAD request for file: {file_to_sample}") From c76796bcd027b4c0e3cf27b1c61ae7471bb2c0bb Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Thu, 19 Dec 2024 13:31:00 +0100 Subject: [PATCH 3/5] Added unit test for url and space path styles in fsspec --- test/unit/v2/connectors/test_fsspec_paths.py | 96 ++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 test/unit/v2/connectors/test_fsspec_paths.py diff --git a/test/unit/v2/connectors/test_fsspec_paths.py b/test/unit/v2/connectors/test_fsspec_paths.py new file mode 100644 index 000000000..dda060bfe --- /dev/null +++ b/test/unit/v2/connectors/test_fsspec_paths.py @@ -0,0 +1,96 @@ +from unittest.mock import MagicMock, patch +from urllib.parse import unquote + +import pytest + +from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import ( + FsspecAccessConfig, + FsspecConnectionConfig, + FsspecIndexer, + FsspecIndexerConfig, + Secret, +) + + +@pytest.mark.parametrize( + ("remote_url", "expected_path"), + [ + ("dropbox://da ta", "da ta"), + ("dropbox://da%20ta", "da ta"), + ] +) +def test_fsspec_indexer_path_decoding(remote_url, expected_path): + # Initialize index config with given remote_url + index_config = FsspecIndexerConfig(remote_url=remote_url) + # We assume an empty access config just for test + connection_config = FsspecConnectionConfig(access_config=Secret(FsspecAccessConfig())) + + # After initialization, ensure path_without_protocol matches expected result + assert ( + index_config.path_without_protocol == expected_path + ), f"Expected {expected_path}, got {index_config.path_without_protocol}" + + # Create the indexer + indexer = FsspecIndexer(connection_config=connection_config, index_config=index_config) + + # Now index_config should have our expected path + full_path = expected_path + assert ( + indexer.index_config.path_without_protocol == full_path + ), f"Expected path to be {full_path}, got {indexer.index_config.path_without_protocol}" + + # Mock fsspec filesystem class to verify it's called with the correct path + with patch("fsspec.get_filesystem_class") as mock_fs_class: + mock_fs_instance = MagicMock() + mock_fs_class.return_value = lambda **kwargs: mock_fs_instance + + # Mock fs.ls to return a dummy file + mock_fs_instance.ls.return_value = [ + {"name": full_path + "/file.txt", "type": "file", "size": 123} + ] + + files = indexer.get_file_data() + + # Verify that fs.ls was called with the correct decoded path + mock_fs_instance.ls.assert_called_once_with(full_path, detail=True) + + # Assert that we got the expected file and it has the correct name + assert len(files) == 1 + assert ( + files[0]["name"] == full_path + "/file.txt" + ), "File name did not match expected output" + + +@pytest.mark.parametrize( + "remote_url", + [ + "dropbox://da ta", + "dropbox://da%20ta", + ], +) +def test_fsspec_indexer_precheck(remote_url): + # This test ensures that precheck doesn't raise errors and calls head appropriately + index_config = FsspecIndexerConfig(remote_url=remote_url) + connection_config = FsspecConnectionConfig(access_config=Secret(FsspecAccessConfig())) + indexer = FsspecIndexer(connection_config=connection_config, index_config=index_config) + + with patch("fsspec.get_filesystem_class") as mock_fs_class: + mock_fs_instance = MagicMock() + mock_fs_class.return_value = lambda **kwargs: mock_fs_instance + + mock_fs_instance.ls.return_value = [ + { + "name": "/" + unquote(remote_url.split("://", 1)[1]) + "/file.txt", + "type": "file", + "size": 123, + } + ] + + # head should be called on that file + mock_fs_instance.head.return_value = {"Content-Length": "123"} + + # If precheck does not raise SourceConnectionError, we consider it passed + indexer.precheck() + + # Check that fs.head was called with the correct file path + mock_fs_instance.head.assert_called_once() From a6adbc5758b42c16c5a2a8c076d172c7d71e237a Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Thu, 19 Dec 2024 13:34:50 +0100 Subject: [PATCH 4/5] Removed some excessive comments I made for myself --- test/unit/v2/connectors/test_fsspec_paths.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/unit/v2/connectors/test_fsspec_paths.py b/test/unit/v2/connectors/test_fsspec_paths.py index dda060bfe..8dbc339d5 100644 --- a/test/unit/v2/connectors/test_fsspec_paths.py +++ b/test/unit/v2/connectors/test_fsspec_paths.py @@ -20,9 +20,7 @@ ] ) def test_fsspec_indexer_path_decoding(remote_url, expected_path): - # Initialize index config with given remote_url index_config = FsspecIndexerConfig(remote_url=remote_url) - # We assume an empty access config just for test connection_config = FsspecConnectionConfig(access_config=Secret(FsspecAccessConfig())) # After initialization, ensure path_without_protocol matches expected result @@ -30,7 +28,6 @@ def test_fsspec_indexer_path_decoding(remote_url, expected_path): index_config.path_without_protocol == expected_path ), f"Expected {expected_path}, got {index_config.path_without_protocol}" - # Create the indexer indexer = FsspecIndexer(connection_config=connection_config, index_config=index_config) # Now index_config should have our expected path From ac578c5187a6a9cd996ea3b1abdf13a8e129d007 Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Thu, 19 Dec 2024 13:40:14 +0100 Subject: [PATCH 5/5] Lint fix --- test/unit/v2/connectors/test_fsspec_paths.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/v2/connectors/test_fsspec_paths.py b/test/unit/v2/connectors/test_fsspec_paths.py index 8dbc339d5..abba02ede 100644 --- a/test/unit/v2/connectors/test_fsspec_paths.py +++ b/test/unit/v2/connectors/test_fsspec_paths.py @@ -17,7 +17,7 @@ [ ("dropbox://da ta", "da ta"), ("dropbox://da%20ta", "da ta"), - ] + ], ) def test_fsspec_indexer_path_decoding(remote_url, expected_path): index_config = FsspecIndexerConfig(remote_url=remote_url)