-
Notifications
You must be signed in to change notification settings - Fork 6.3k
Closed as not planned
Closed as not planned
Copy link
Labels
bug (Something isn't working), triage (Issue needs to be triaged/prioritized)
Description
Bug Description
My file is stored in MinIO. When I use this module to read data, I always get an error saying that the current file is in use.
I am running the program on Windows.
Version
0.12.25
Steps to Reproduce
class MinioReader(BaseReader):
    """General reader for any Minio file or directory.

    Objects are downloaded into a temporary directory and then parsed by
    ``SimpleDirectoryReader``; the temporary directory is removed once
    loading has finished.
    """

    def __init__(
        self,
        *args: Any,
        bucket: str,
        key: Optional[str] = None,
        prefix: Optional[str] = "",
        file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None,
        required_exts: Optional[List[str]] = None,
        filename_as_id: bool = False,
        num_files_limit: Optional[int] = None,
        file_metadata: Optional[Callable[[str], Dict]] = None,
        minio_endpoint: Optional[str] = None,
        minio_secure: bool = False,
        minio_cert_check: bool = True,
        minio_access_key: Optional[str] = None,
        minio_secret_key: Optional[str] = None,
        minio_session_token: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize Minio bucket and key, along with credentials if needed.

        If key is not set, the entire bucket (filtered by prefix) is parsed.

        Args:
            bucket (str): the name of your Minio bucket
            key (Optional[str]): the name of the specific file. If none is
                provided, this loader will iterate through the entire bucket.
            prefix (Optional[str]): the prefix to filter by in the case that
                the loader iterates through the entire bucket. Defaults to
                empty string.
            file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file
                extension to a BaseReader class that specifies how to convert
                that file to text. See `SimpleDirectoryReader` for more
                details.
            required_exts (Optional[List[str]]): List of required extensions.
                Default is None.
            filename_as_id (bool): Forwarded unchanged to
                `SimpleDirectoryReader`. Default is False.
            num_files_limit (Optional[int]): Maximum number of files to read.
                Default is None.
            file_metadata (Optional[Callable[str, Dict]]): A function that
                takes in a filename and returns a Dict of metadata for the
                Document. Default is None.
            minio_endpoint (Optional[str]): The Minio endpoint. Default is
                None.
            minio_secure (bool): MinIO server runs in TLS mode. Default is
                False.
            minio_cert_check (bool): Forwarded to the Minio client as
                `cert_check`; disabling it allows the usage of a self-signed
                cert for the MinIO server. Default is True.
            minio_access_key (Optional[str]): The Minio access key. Default is
                None.
            minio_secret_key (Optional[str]): The Minio secret key. Default is
                None.
            minio_session_token (Optional[str]): The Minio session token.
                Default is None.
        """
        super().__init__(*args, **kwargs)

        self.bucket = bucket
        self.key = key
        self.prefix = prefix

        self.file_extractor = file_extractor
        self.required_exts = required_exts
        self.filename_as_id = filename_as_id
        self.num_files_limit = num_files_limit
        self.file_metadata = file_metadata

        self.minio_endpoint = minio_endpoint
        self.minio_secure = minio_secure
        self.minio_cert_check = minio_cert_check
        self.minio_access_key = minio_access_key
        self.minio_secret_key = minio_secret_key
        self.minio_session_token = minio_session_token

    def load_data(self) -> List[Document]:
        """Load file(s) from Minio.

        Downloads either the single object named by ``self.key`` or every
        matching object under ``self.prefix`` into a temporary directory, then
        parses that directory with ``SimpleDirectoryReader``.

        Returns:
            List[Document]: the documents produced by ``SimpleDirectoryReader``
            for the downloaded files.
        """
        import os

        from minio import Minio

        minio_client = Minio(
            self.minio_endpoint,
            secure=self.minio_secure,
            cert_check=self.minio_cert_check,
            access_key=self.minio_access_key,
            secret_key=self.minio_secret_key,
            session_token=self.minio_session_token,
        )

        with tempfile.TemporaryDirectory() as temp_dir:
            if self.key:
                suffix = Path(self.key).suffix
                fd, filepath = tempfile.mkstemp(dir=temp_dir, suffix=suffix)
                # mkstemp hands back an already-open OS-level descriptor.
                # Close it before downloading: on Windows a file with an open
                # handle cannot be written by the download ("[WinError 32]
                # file is in use"), and on every platform the descriptor
                # would otherwise leak.
                os.close(fd)
                minio_client.fget_object(
                    bucket_name=self.bucket, object_name=self.key, file_path=filepath
                )
            else:
                objects = minio_client.list_objects(
                    bucket_name=self.bucket, prefix=self.prefix, recursive=True
                )
                for i, obj in enumerate(objects):
                    # NOTE(review): only the base name is kept, so objects
                    # with the same name under different prefixes would
                    # overwrite each other in temp_dir - confirm intended.
                    file_name = obj.object_name.split("/")[-1]

                    # NOTE(review): `i` also counts skipped objects and the
                    # comparison is `>`, so this is only a rough cap on
                    # downloads; SimpleDirectoryReader below receives the
                    # same num_files_limit.
                    if self.num_files_limit is not None and i > self.num_files_limit:
                        break

                    suffix = Path(obj.object_name).suffix

                    is_dir = obj.object_name.endswith("/")  # skip folders
                    is_bad_ext = (
                        self.required_exts is not None
                        and suffix not in self.required_exts  # skip other extensions
                    )

                    if is_dir or is_bad_ext:
                        continue

                    filepath = f"{temp_dir}/{file_name}"
                    minio_client.fget_object(self.bucket, obj.object_name, filepath)

            # Parse while temp_dir still exists; it is deleted when the
            # `with` block exits.
            loader = SimpleDirectoryReader(
                temp_dir,
                file_extractor=self.file_extractor,
                required_exts=self.required_exts,
                filename_as_id=self.filename_as_id,
                num_files_limit=self.num_files_limit,
                file_metadata=self.file_metadata,
            )

            return loader.load_data()
On Windows, create a MinioReader and call its load_data() method to fetch the data. I located the problem in the code and resolved it locally by modifying it.
Just add os.close(_)  # Close the file descriptor
in the branch that handles reading a single specified file.
Relevant Logs/Tracebacks
"[WinError 32] 另一个程序正在使用此文件,进程无法访问。: 'C:\\\\Users\\\\xxxx\\\\AppData\\\\Local\\\\Temp\\\\tmpevke254r\\\\tmpotwb0nf0.pdf'"
Metadata
Metadata
Assignees
Labels
bug (Something isn't working), triage (Issue needs to be triaged/prioritized)