From 1b906335c4d7358b0808237ee4fcf6cccd6ac7e9 Mon Sep 17 00:00:00 2001
From: Boris Power <81998504+BorisPower@users.noreply.github.com>
Date: Tue, 17 Jan 2023 13:47:09 -0800
Subject: [PATCH] Improve error message in case of a misformatted file (#158)

* add more descriptive error handling regarding poorly formatted files

* update version

* add dot prefix to json file extentions and ensure list of allowable file types is complete

* cleanup error messages and add comments to explain jsonl/json loading logic

* cleanup csv/tsv reading allowing use of elif for other file extensions, add comments, and remove unnecessary re-attempt to parse as json

* run fillna immediately upon DataFrame creation so that an additional switch is not needed

* use only 1 try-except block to catch parsing errors + cleanup error message

* separate the json and jsonl cases while still maintaining the same functionality, also include a message to user if jsonl appears to be json or vice versa

* fix bug in csv path

* use index -1 to get extension from split

* black formatting apply

* fix black

Co-authored-by: joe-at-openai <joe@openai.com>
---
 openai/validators.py | 100 ++++++++++++++++++++++++-------------------
 openai/version.py    |   2 +-
 2 files changed, 58 insertions(+), 44 deletions(-)

diff --git a/openai/validators.py b/openai/validators.py
index 146e97d9fb..c5a3dd7890 100644
--- a/openai/validators.py
+++ b/openai/validators.py
@@ -37,7 +37,7 @@ def necessary_column_validator(df, necessary_column):
     """
 
     def lower_case_column(df, column):
-        cols = [c for c in df.columns if c.lower() == column]
+        cols = [c for c in df.columns if str(c).lower() == column]
         df.rename(columns={cols[0]: column.lower()}, inplace=True)
         return df
 
@@ -47,7 +47,7 @@ def lower_case_column(df, column):
     error_msg = None
 
     if necessary_column not in df.columns:
-        if necessary_column in [c.lower() for c in df.columns]:
+        if necessary_column in [str(c).lower() for c in df.columns]:
 
             def lower_case_column_creator(df):
                 return lower_case_column(df, necessary_column)
@@ -482,51 +482,65 @@ def read_any_format(fname, fields=["prompt", "completion"]):
     df = None
 
     if os.path.isfile(fname):
-        for ending, separator in [(".csv", ","), (".tsv", "\t")]:
-            if fname.lower().endswith(ending):
-                immediate_msg = f"\n- Based on your file extension, your file is formatted as a {ending[1:].upper()} file"
-                necessary_msg = (
-                    f"Your format `{ending[1:].upper()}` will be converted to `JSONL`"
+        try:
+            if fname.lower().endswith(".csv") or fname.lower().endswith(".tsv"):
+                file_extension_str, separator = (
+                    ("CSV", ",") if fname.lower().endswith(".csv") else ("TSV", "\t")
                 )
-                df = pd.read_csv(fname, sep=separator, dtype=str)
-        if fname.lower().endswith(".xlsx"):
-            immediate_msg = "\n- Based on your file extension, your file is formatted as an Excel file"
-            necessary_msg = "Your format `XLSX` will be converted to `JSONL`"
-            xls = pd.ExcelFile(fname)
-            sheets = xls.sheet_names
-            if len(sheets) > 1:
-                immediate_msg += "\n- Your Excel file contains more than one sheet. Please either save as csv or ensure all data is present in the first sheet. WARNING: Reading only the first sheet..."
-            df = pd.read_excel(fname, dtype=str)
-        if fname.lower().endswith(".txt"):
-            immediate_msg = "\n- Based on your file extension, you provided a text file"
-            necessary_msg = "Your format `TXT` will be converted to `JSONL`"
-            with open(fname, "r") as f:
-                content = f.read()
-                df = pd.DataFrame(
-                    [["", line] for line in content.split("\n")],
-                    columns=fields,
-                    dtype=str,
+                immediate_msg = f"\n- Based on your file extension, your file is formatted as a {file_extension_str} file"
+                necessary_msg = (
+                    f"Your format `{file_extension_str}` will be converted to `JSONL`"
                 )
-        if fname.lower().endswith("jsonl") or fname.lower().endswith("json"):
-            try:
-                df = pd.read_json(fname, lines=True, dtype=str)
-            except (ValueError, TypeError):
-                df = pd.read_json(fname, dtype=str)
-                immediate_msg = "\n- Your file appears to be in a .JSON format. Your file will be converted to JSONL format"
-                necessary_msg = "Your format `JSON` will be converted to `JSONL`"
-
-        if df is None:
-            error_msg = (
-                "Your file is not saved as a .CSV, .TSV, .XLSX, .TXT or .JSONL file."
-            )
-            if "." in fname:
-                error_msg += (
-                    f" Your file `{fname}` appears to end with `.{fname.split('.')[1]}`"
+                df = pd.read_csv(fname, sep=separator, dtype=str).fillna("")
+            elif fname.lower().endswith(".xlsx"):
+                immediate_msg = "\n- Based on your file extension, your file is formatted as an Excel file"
+                necessary_msg = "Your format `XLSX` will be converted to `JSONL`"
+                xls = pd.ExcelFile(fname)
+                sheets = xls.sheet_names
+                if len(sheets) > 1:
+                    immediate_msg += "\n- Your Excel file contains more than one sheet. Please either save as csv or ensure all data is present in the first sheet. WARNING: Reading only the first sheet..."
+                df = pd.read_excel(fname, dtype=str).fillna("")
+            elif fname.lower().endswith(".txt"):
+                immediate_msg = (
+                    "\n- Based on your file extension, you provided a text file"
                 )
+                necessary_msg = "Your format `TXT` will be converted to `JSONL`"
+                with open(fname, "r") as f:
+                    content = f.read()
+                    df = pd.DataFrame(
+                        [["", line] for line in content.split("\n")],
+                        columns=fields,
+                        dtype=str,
+                    ).fillna("")
+            elif fname.lower().endswith(".jsonl"):
+                df = pd.read_json(fname, lines=True, dtype=str).fillna("")
+                if len(df) == 1:
+                    # this is NOT what we expect for a .jsonl file
+                    immediate_msg = "\n- Your JSONL file appears to be in a JSON format. Your file will be converted to JSONL format"
+                    necessary_msg = "Your format `JSON` will be converted to `JSONL`"
+                    df = pd.read_json(fname, dtype=str).fillna("")
+                else:
+                    pass  # this is what we expect for a .jsonl file
+            elif fname.lower().endswith(".json"):
+                df = pd.read_json(fname, lines=True, dtype=str).fillna("")
+                if len(df) == 1:
+                    # this is what we expect for a .json file
+                    df = pd.read_json(fname, dtype=str).fillna("")
+                else:
+                    # this is NOT what we expect for a .json file
+                    immediate_msg = "\n- Your JSON file appears to be in a JSONL format. Your file will be converted to JSONL format"
+                    necessary_msg = "Your format `JSON` will be converted to `JSONL`"
             else:
-                error_msg += f" Your file `{fname}` does not appear to have a file ending. Please ensure your filename ends with one of the supported file endings."
-        else:
-            df.fillna("", inplace=True)
+                error_msg = "Your file must have one of the following extensions: .CSV, .TSV, .XLSX, .TXT, .JSON or .JSONL"
+                if "." in fname:
+                    error_msg += f" Your file `{fname}` ends with the extension `.{fname.split('.')[-1]}` which is not supported."
+                else:
+                    error_msg += f" Your file `{fname}` is missing a file extension."
+
+        except (ValueError, TypeError):
+            file_extension_str = fname.split(".")[-1].upper()
+            error_msg = f"Your file `{fname}` does not appear to be in valid {file_extension_str} format. Please ensure your file is formatted as a valid {file_extension_str} file."
+
     else:
         error_msg = f"File {fname} does not exist."
 
diff --git a/openai/version.py b/openai/version.py
index 10ea7670f6..46ef7466b0 100644
--- a/openai/version.py
+++ b/openai/version.py
@@ -1 +1 @@
-VERSION = "0.26.1"
+VERSION = "0.26.2"
\ No newline at end of file