Skip to content

Commit b453748

Browse files
new: put the filetype detection code in a separate function
Signed-off-by: thiswillbeyourgithub <[email protected]>
1 parent ed7a9c7 commit b453748

File tree

1 file changed

+42
-44
lines changed

1 file changed

+42
-44
lines changed

wdoc/utils/batch_file_loader.py

Lines changed: 42 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -91,6 +91,44 @@
9191
inference_rules[k][i] = re.compile(vv)
9292

9393

94+
@optional_typecheck
def infer_filetype(path: str) -> str:
    """
    Heuristics to infer the 'filetype' argument of a --path given to wdoc.

    The path is first searched with the compiled regexes stored in
    `inference_rules`; the key of the first rule that matches is returned.
    If no regex matches, the path has to exist on disk and the description
    given by python-magic is searched for known keywords.

    Args:
        path: the 'path' value whose filetype has to be guessed.

    Returns:
        A key of `inference_rules` if a regex matched, otherwise one of
        "pdf", "local_audio" or "epub" depending on python-magic's output.

    Raises:
        Exception: if no regex matched and the path does not exist, if
            python-magic can't be imported or fails on the file, or if
            its description contains none of the known keywords.
    """
    # First heuristic: regexes on the path itself. Iterate over the
    # patterns directly instead of looking them up again by key.
    for filetype, patterns in inference_rules.items():
        if any(pattern.search(path) for pattern in patterns):
            return filetype

    # Second heuristic: python-magic, which requires an existing path.
    fp = Path(path)
    if not fp.exists():
        raise Exception(
            f"Failed to detect 'auto' filetype for '{fp}' with regex, and it's not a file (does not exist)"
        )

    try:
        # Imported lazily so that python-magic is only needed when the
        # regexes were not enough.
        import magic

        # NOTE: an alternative that was tried is to read only the first
        # 1024 bytes of the file and give them to magic.from_buffer,
        # because from_file seems to have issues with some files.
        info = magic.from_file(fp).lower()
    except Exception as err:
        raise Exception(
            f"Failed to detect 'auto' filetype for '{fp}' with regex and even python-magic. Error: '{err}'"
        ) from err

    if "pdf" in info:
        return "pdf"
    if "mpeg" in info or "mp3" in info:
        return "local_audio"
    if "epub" in info:
        return "epub"
    raise Exception(f"No more python magic heuristics to try for path '{path}'")
130+
131+
94132
@optional_typecheck
95133
def batch_load_doc(
96134
llm_name: ModelName,
@@ -129,54 +167,14 @@ def batch_load_doc(
129167
continue
130168
load_filetype = load_kwargs["filetype"]
131169

132-
# auto parse filetype if infer
170+
# guess the appropriate 'filetype' argument based on the path because
171+
# the user gave us filetype='auto'
133172
if load_filetype == "auto":
134-
for k, v in inference_rules.items():
135-
for vv in inference_rules[k]:
136-
if vv.search(load_kwargs["path"]):
137-
load_filetype = k
138-
break
139-
if load_filetype != "auto":
140-
break
141-
if load_filetype == "auto":
142-
try:
143-
fp = Path(load_kwargs["path"])
144-
if fp.exists():
145-
try:
146-
import magic
147-
148-
info = magic.from_file(fp).lower()
149-
# # instead of passing the file, pass only the
150-
# # headers of the file because otherwise
151-
# # it seems to have issues with some files
152-
# with open(fp, "rb") as temp:
153-
# start = temp.read(1024)
154-
# info = magic.from_buffer(start).lower()
155-
except Exception as err:
156-
raise Exception(
157-
f"Failed to run python-magic as a last resort heuristic: '{err}'"
158-
) from err
159-
if "pdf" in info:
160-
load_filetype = "pdf"
161-
break
162-
elif "mpeg" in info:
163-
load_filetype = "local_audio"
164-
break
165-
elif "epub" in info:
166-
load_filetype = "epub"
167-
break
168-
else:
169-
raise Exception(
170-
"No more python magic heuristics to try"
171-
)
172-
except Exception as err:
173-
logger.warning(
174-
f"Failed to detect 'auto' filetype for '{fp}' with regex and even python-magic. Error: '{err}'"
175-
)
173+
load_filetype = infer_filetype(load_kwargs["path"])
176174

177175
assert (
178176
load_filetype != "auto"
179-
), f"Could not infer filetype of {load_kwargs['path']}. Use the 'filetype' argument."
177+
), f"Could not infer the filetype of '{load_kwargs['path']}', please specify a value for the 'filetype' argument."
180178
if load_filetype not in recursive_types:
181179
to_load[ild]["filetype"] = load_filetype
182180

0 commit comments

Comments
 (0)