|
91 | 91 | inference_rules[k][i] = re.compile(vv) |
92 | 92 |
|
93 | 93 |
|
@optional_typecheck
def infer_filetype(path: str) -> str:
    """
    Heuristics to infer the 'filetype' argument of a --path given to wdoc.

    The compiled regexes stored in `inference_rules` are tried first, in the
    iteration order of that dict: the first filetype owning a pattern that
    matches `path` is returned. If no regex matches, `path` has to exist on
    disk and the description returned by python-magic is searched for known
    keywords.

    Args:
        path: the path given by the user, matched as-is against the regexes.

    Returns:
        The inferred filetype: either a key of `inference_rules`, or one of
        "pdf", "local_audio" or "epub" when python-magic had to be used.

    Raises:
        Exception: if no regex matches and `path` does not exist, if
            python-magic cannot be imported or fails on the file, or if its
            description contains none of the known keywords.
    """
    # First heuristic: regexes on the path string itself.
    for filetype, patterns in inference_rules.items():
        if any(pattern.search(path) for pattern in patterns):
            return filetype

    # Last resort heuristic: inspect the file content, so it has to exist.
    fp = Path(path)
    if not fp.exists():
        raise Exception(
            f"Failed to detect 'auto' filetype for '{fp}' with regex, and it's not a file (does not exist)"
        )
    try:
        # Imported lazily so that a missing python-magic / libmagic only
        # matters when the regexes were not enough, and so that the import
        # failure is reported with the same message as a detection failure.
        import magic

        # Pass a plain string: presumably older python-magic releases do not
        # accept os.PathLike objects here - TODO confirm the minimum version.
        # NOTE: feeding only the first bytes through magic.from_buffer was
        # considered as a workaround for files that from_file struggles
        # with, but it is not enabled.
        info = magic.from_file(str(fp)).lower()
    except Exception as err:
        raise Exception(
            f"Failed to detect 'auto' filetype for '{fp}' with regex and even python-magic. Error: '{err}'"
        ) from err

    if "pdf" in info:
        return "pdf"
    if "mpeg" in info or "mp3" in info:
        return "local_audio"
    if "epub" in info:
        return "epub"
    raise Exception(f"No more python magic heuristics to try for path '{path}'")
| 130 | + |
| 131 | + |
94 | 132 | @optional_typecheck |
95 | 133 | def batch_load_doc( |
96 | 134 | llm_name: ModelName, |
@@ -129,54 +167,14 @@ def batch_load_doc( |
129 | 167 | continue |
130 | 168 | load_filetype = load_kwargs["filetype"] |
131 | 169 |
|
132 | | - # auto parse filetype if infer |
| 170 | + # guess the appropriate 'filetype' argument based on the path because |
| 171 | + # the user gave us filetype='auto' |
133 | 172 | if load_filetype == "auto": |
134 | | - for k, v in inference_rules.items(): |
135 | | - for vv in inference_rules[k]: |
136 | | - if vv.search(load_kwargs["path"]): |
137 | | - load_filetype = k |
138 | | - break |
139 | | - if load_filetype != "auto": |
140 | | - break |
141 | | - if load_filetype == "auto": |
142 | | - try: |
143 | | - fp = Path(load_kwargs["path"]) |
144 | | - if fp.exists(): |
145 | | - try: |
146 | | - import magic |
147 | | - |
148 | | - info = magic.from_file(fp).lower() |
149 | | - # # instead of passing the file, pass only the |
150 | | - # # headers of the file because otherwise |
151 | | - # # it seems to have issues with some files |
152 | | - # with open(fp, "rb") as temp: |
153 | | - # start = temp.read(1024) |
154 | | - # info = magic.from_buffer(start).lower() |
155 | | - except Exception as err: |
156 | | - raise Exception( |
157 | | - f"Failed to run python-magic as a last resort heuristic: '{err}'" |
158 | | - ) from err |
159 | | - if "pdf" in info: |
160 | | - load_filetype = "pdf" |
161 | | - break |
162 | | - elif "mpeg" in info: |
163 | | - load_filetype = "local_audio" |
164 | | - break |
165 | | - elif "epub" in info: |
166 | | - load_filetype = "epub" |
167 | | - break |
168 | | - else: |
169 | | - raise Exception( |
170 | | - "No more python magic heuristics to try" |
171 | | - ) |
172 | | - except Exception as err: |
173 | | - logger.warning( |
174 | | - f"Failed to detect 'auto' filetype for '{fp}' with regex and even python-magic. Error: '{err}'" |
175 | | - ) |
| 173 | + load_filetype = infer_filetype(load_kwargs["path"]) |
176 | 174 |
|
177 | 175 | assert ( |
178 | 176 | load_filetype != "auto" |
179 | | - ), f"Could not infer filetype of {load_kwargs['path']}. Use the 'filetype' argument." |
| 177 | + ), f"Could not infer the filetype of '{load_kwargs['path']}', please specify a value for the 'filetype' argument." |
180 | 178 | if load_filetype not in recursive_types: |
181 | 179 | to_load[ild]["filetype"] = load_filetype |
182 | 180 |
|
|
0 commit comments