Skip to content

Commit 1b90633

Browse files
Improve error message in case of a misformatted file (#158)
* add more descriptive error handling regarding poorly formatted files * update version * add dot prefix to json file extensions and ensure list of allowable file types is complete * clean up error messages and add comments to explain jsonl/json loading logic * clean up csv/tsv reading allowing use of elif for other file extensions, add comments, and remove unnecessary re-attempt to parse as json * run fillna immediately upon DataFrame creation so that an additional switch is not needed * use only 1 try-except block to catch parsing errors + clean up error message * separate the json and jsonl cases while still maintaining the same functionality, also include a message to user if jsonl appears to be json or vice versa * fix bug in csv path * use index -1 to get extension from split * black formatting apply * fix black Co-authored-by: joe-at-openai <[email protected]>
1 parent 48b6929 commit 1b90633

File tree

2 files changed

+58
-44
lines changed

2 files changed

+58
-44
lines changed

openai/validators.py

+57-43
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def necessary_column_validator(df, necessary_column):
3737
"""
3838

3939
def lower_case_column(df, column):
40-
cols = [c for c in df.columns if c.lower() == column]
40+
cols = [c for c in df.columns if str(c).lower() == column]
4141
df.rename(columns={cols[0]: column.lower()}, inplace=True)
4242
return df
4343

@@ -47,7 +47,7 @@ def lower_case_column(df, column):
4747
error_msg = None
4848

4949
if necessary_column not in df.columns:
50-
if necessary_column in [c.lower() for c in df.columns]:
50+
if necessary_column in [str(c).lower() for c in df.columns]:
5151

5252
def lower_case_column_creator(df):
5353
return lower_case_column(df, necessary_column)
@@ -482,51 +482,65 @@ def read_any_format(fname, fields=["prompt", "completion"]):
482482
df = None
483483

484484
if os.path.isfile(fname):
485-
for ending, separator in [(".csv", ","), (".tsv", "\t")]:
486-
if fname.lower().endswith(ending):
487-
immediate_msg = f"\n- Based on your file extension, your file is formatted as a {ending[1:].upper()} file"
488-
necessary_msg = (
489-
f"Your format `{ending[1:].upper()}` will be converted to `JSONL`"
485+
try:
486+
if fname.lower().endswith(".csv") or fname.lower().endswith(".tsv"):
487+
file_extension_str, separator = (
488+
("CSV", ",") if fname.lower().endswith(".csv") else ("TSV", "\t")
490489
)
491-
df = pd.read_csv(fname, sep=separator, dtype=str)
492-
if fname.lower().endswith(".xlsx"):
493-
immediate_msg = "\n- Based on your file extension, your file is formatted as an Excel file"
494-
necessary_msg = "Your format `XLSX` will be converted to `JSONL`"
495-
xls = pd.ExcelFile(fname)
496-
sheets = xls.sheet_names
497-
if len(sheets) > 1:
498-
immediate_msg += "\n- Your Excel file contains more than one sheet. Please either save as csv or ensure all data is present in the first sheet. WARNING: Reading only the first sheet..."
499-
df = pd.read_excel(fname, dtype=str)
500-
if fname.lower().endswith(".txt"):
501-
immediate_msg = "\n- Based on your file extension, you provided a text file"
502-
necessary_msg = "Your format `TXT` will be converted to `JSONL`"
503-
with open(fname, "r") as f:
504-
content = f.read()
505-
df = pd.DataFrame(
506-
[["", line] for line in content.split("\n")],
507-
columns=fields,
508-
dtype=str,
490+
immediate_msg = f"\n- Based on your file extension, your file is formatted as a {file_extension_str} file"
491+
necessary_msg = (
492+
f"Your format `{file_extension_str}` will be converted to `JSONL`"
509493
)
510-
if fname.lower().endswith("jsonl") or fname.lower().endswith("json"):
511-
try:
512-
df = pd.read_json(fname, lines=True, dtype=str)
513-
except (ValueError, TypeError):
514-
df = pd.read_json(fname, dtype=str)
515-
immediate_msg = "\n- Your file appears to be in a .JSON format. Your file will be converted to JSONL format"
516-
necessary_msg = "Your format `JSON` will be converted to `JSONL`"
517-
518-
if df is None:
519-
error_msg = (
520-
"Your file is not saved as a .CSV, .TSV, .XLSX, .TXT or .JSONL file."
521-
)
522-
if "." in fname:
523-
error_msg += (
524-
f" Your file `{fname}` appears to end with `.{fname.split('.')[1]}`"
494+
df = pd.read_csv(fname, sep=separator, dtype=str).fillna("")
495+
elif fname.lower().endswith(".xlsx"):
496+
immediate_msg = "\n- Based on your file extension, your file is formatted as an Excel file"
497+
necessary_msg = "Your format `XLSX` will be converted to `JSONL`"
498+
xls = pd.ExcelFile(fname)
499+
sheets = xls.sheet_names
500+
if len(sheets) > 1:
501+
immediate_msg += "\n- Your Excel file contains more than one sheet. Please either save as csv or ensure all data is present in the first sheet. WARNING: Reading only the first sheet..."
502+
df = pd.read_excel(fname, dtype=str).fillna("")
503+
elif fname.lower().endswith(".txt"):
504+
immediate_msg = (
505+
"\n- Based on your file extension, you provided a text file"
525506
)
507+
necessary_msg = "Your format `TXT` will be converted to `JSONL`"
508+
with open(fname, "r") as f:
509+
content = f.read()
510+
df = pd.DataFrame(
511+
[["", line] for line in content.split("\n")],
512+
columns=fields,
513+
dtype=str,
514+
).fillna("")
515+
elif fname.lower().endswith(".jsonl"):
516+
df = pd.read_json(fname, lines=True, dtype=str).fillna("")
517+
if len(df) == 1:
518+
# this is NOT what we expect for a .jsonl file
519+
immediate_msg = "\n- Your JSONL file appears to be in a JSON format. Your file will be converted to JSONL format"
520+
necessary_msg = "Your format `JSON` will be converted to `JSONL`"
521+
df = pd.read_json(fname, dtype=str).fillna("")
522+
else:
523+
pass # this is what we expect for a .jsonl file
524+
elif fname.lower().endswith(".json"):
525+
df = pd.read_json(fname, lines=True, dtype=str).fillna("")
526+
if len(df) == 1:
527+
# this is what we expect for a .json file
528+
df = pd.read_json(fname, dtype=str).fillna("")
529+
else:
530+
# this is NOT what we expect for a .json file
531+
immediate_msg = "\n- Your JSON file appears to be in a JSONL format. Your file will be converted to JSONL format"
532+
necessary_msg = "Your format `JSON` will be converted to `JSONL`"
526533
else:
527-
error_msg += f" Your file `{fname}` does not appear to have a file ending. Please ensure your filename ends with one of the supported file endings."
528-
else:
529-
df.fillna("", inplace=True)
534+
error_msg = "Your file must have one of the following extensions: .CSV, .TSV, .XLSX, .TXT, .JSON or .JSONL"
535+
if "." in fname:
536+
error_msg += f" Your file `{fname}` ends with the extension `.{fname.split('.')[-1]}` which is not supported."
537+
else:
538+
error_msg += f" Your file `{fname}` is missing a file extension."
539+
540+
except (ValueError, TypeError):
541+
file_extension_str = fname.split(".")[-1].upper()
542+
error_msg = f"Your file `{fname}` does not appear to be in valid {file_extension_str} format. Please ensure your file is formatted as a valid {file_extension_str} file."
543+
530544
else:
531545
error_msg = f"File {fname} does not exist."
532546

openai/version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
VERSION = "0.26.1"
1+
VERSION = "0.26.2"

0 commit comments

Comments
 (0)