From af27cbecec698bc4e3f24c04f3e648e0f1082cc7 Mon Sep 17 00:00:00 2001 From: joe-at-openai <117690718+joe-at-openai@users.noreply.github.com> Date: Mon, 10 Apr 2023 08:34:09 -0700 Subject: [PATCH] add case to handle indented .json parsing (#175) * add case to handle indented .json parsing * Fix formatting --------- Co-authored-by: Chris Hallacy --- openai/validators.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/openai/validators.py b/openai/validators.py index 97083659df..078179a44b 100644 --- a/openai/validators.py +++ b/openai/validators.py @@ -526,14 +526,21 @@ def read_any_format(fname, fields=["prompt", "completion"]): else: pass # this is what we expect for a .jsonl file elif fname.lower().endswith(".json"): - df = pd.read_json(fname, lines=True, dtype=str).fillna("") - if len(df) == 1: - # this is what we expect for a .json file + try: + # to handle case where .json file is actually a .jsonl file + df = pd.read_json(fname, lines=True, dtype=str).fillna("") + if len(df) == 1: + # this code path corresponds to a .json file that has one line + df = pd.read_json(fname, dtype=str).fillna("") + else: + # this is NOT what we expect for a .json file + immediate_msg = "\n- Your JSON file appears to be in a JSONL format. Your file will be converted to JSONL format" + necessary_msg = ( + "Your format `JSON` will be converted to `JSONL`" + ) + except ValueError: + # this code path corresponds to a .json file that has multiple lines (i.e. it is indented) df = pd.read_json(fname, dtype=str).fillna("") - else: - # this is NOT what we expect for a .json file - immediate_msg = "\n- Your JSON file appears to be in a JSONL format. Your file will be converted to JSONL format" - necessary_msg = "Your format `JSON` will be converted to `JSONL`" else: error_msg = "Your file must have one of the following extensions: .CSV, .TSV, .XLSX, .TXT, .JSON or .JSONL" if "." in fname: