Skip to content

Commit 261bbfc

Browse files
committed
Don't convert Markdown or JSON to plaintext for RAG
1 parent a34f64e commit 261bbfc

File tree

1 file changed

+13
-5
lines changed

1 file changed

+13
-5
lines changed

daras_ai_v2/vector_search.py

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import datetime
44
import hashlib
55
import io
6+
import json
67
import mimetypes
78
import multiprocessing
89
import re
@@ -1004,11 +1005,18 @@ def any_bytes_to_text_pages_or_df(
10041005
except UnsupportedDocumentError:
10051006
pass
10061007

1007-
if mime_type == "text/plain":
1008-
text = f_bytes.decode()
1009-
else:
1010-
ext = mimetypes.guess_extension(mime_type) or ""
1011-
text = pandoc_to_text(f_name + ext, f_bytes)
1008+
match mime_type:
1009+
case "text/plain" | "text/markdown":
1010+
text = f_bytes.decode()
1011+
case "application/json":
1012+
try:
1013+
text = json.dumps(json.loads(f_bytes.decode()), indent=2)
1014+
except json.JSONDecodeError as e:
1015+
raise UserError(f"Invalid JSON file: {e}") from e
1016+
case _:
1017+
ext = mimetypes.guess_extension(mime_type) or ""
1018+
text = pandoc_to_text(f_name + ext, f_bytes)
1019+
10121020
return [text]
10131021

10141022

0 commit comments

Comments
 (0)