pandas-dev · njriasan · Mar 25, 2021 · Apr 6, 2021 · Apr 7, 2021 · Apr 7, 2021
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -783,6 +783,8 @@ I/O
 - Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`)
 - Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`)
 - Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`)
+- Bug in :func:`read_csv` failing to raise ``ParserError`` when the first row had too many columns and ``index_col=False`` (:issue:`40333`)
+- Bug in :func:`read_csv` failing to raise ``ParserError`` when ``names is not None`` and ``header=None`` (:issue:`22144`)
 
 Period
 ^^^^^^

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -215,6 +215,13 @@ cdef extern from "parser/tokenizer.h":
         int64_t header_start        # header row start
         uint64_t header_end         # header row end
 
+        bint allow_leading_cols     # Boolean: 1: can infer index col, 0: no index col
+
+        # Boolean: 1: Header=None, 0: Header is not None. This is used because
+        # header_end is uint64_t so there is no valid NULL value
+        # (i.e. header_end == -1).
+        bint skip_header_end
+
         void *skipset
         PyObject *skipfunc
         int64_t skip_first_N_rows
@@ -378,6 +385,7 @@ cdef class TextReader:
         self.encoding_errors = PyBytes_AsString(encoding_errors)
 
         self.parser = parser_new()
+        self.parser.allow_leading_cols = allow_leading_cols
         self.parser.chunksize = tokenize_chunksize
 
         self.mangle_dupe_cols = mangle_dupe_cols
@@ -517,11 +525,13 @@ cdef class TextReader:
         if header is None:
             # sentinel value
             self.parser.header_start = -1
-            self.parser.header_end = -1
+            self.parser.skip_header_end = True
+            self.parser.header_end = 0
             self.parser.header = -1
             self.parser_start = 0
             prelim_header = []
         else:
+            self.parser.skip_header_end = False
             if isinstance(header, list):
                 if len(header) > 1:
                     # need to artificially skip the final line

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -421,10 +421,10 @@ static int end_line(parser_t *self) {
 
     TRACE(("end_line: lines: %d\n", self->lines));
     if (self->lines > 0) {
-        if (self->expected_fields >= 0) {
-            ex_fields = self->expected_fields;
+        if (self->expected_fields > self->line_fields[self->lines - 1]) {
+             ex_fields = self->expected_fields;
         } else {
-            ex_fields = self->line_fields[self->lines - 1];
+             ex_fields = self->line_fields[self->lines - 1];
         }
     }
     TRACE(("end_line: ex_fields: %d\n", ex_fields));
@@ -444,9 +444,26 @@ static int end_line(parser_t *self) {
         self->line_fields[self->lines] = 0;
         return 0;
     }
-
-    if (!(self->lines <= self->header_end + 1) &&
-        (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) {
+    if (
+        // Allow extra fields if there is no header, but there may be
+        // index columns in the first line or we are within the header
+        // and we may have index columns.
+        !((self->skip_header_end &&
+            (self->lines < (uint64_t) self->allow_leading_cols))
+            || (!self->skip_header_end
+                && (self->lines <=
+                (self->header_end + self->allow_leading_cols))))
+        // We only throw an error if we know how many fields
+        // to expect and have encountered too many fields.
+        && (ex_fields > 0 && fields > ex_fields)
+        // Ignore field parsing errors if we will use a subset of the columns.
+        && !(self->usecols)
+        // Ignore a trailing delimiter (see gh-2442) by checking if
+        // the last field is empty. We determine this if the next
+        // to last character is null (last character must be null).
+        && !(((fields - 1) == ex_fields) &&
+        !self->stream[self->stream_len - 2])
+    ) {
         // increment file line count
         self->file_lines++;
 

diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h
@@ -150,6 +150,13 @@ typedef struct parser_t {
     int64_t header_start;  // header row start
     uint64_t header_end;   // header row end
 
+    int allow_leading_cols;  // Boolean: 1: can infer index col, 0: no index col
+
+    // Boolean: 1: Header=None, 0: Header is not None. This is used because
+    // header_end is uint64_t so there is no valid NULL value
+    // (i.e. header_end == -1).
+    int skip_header_end;
+
     void *skipset;
     PyObject *skipfunc;
     int64_t skip_first_N_rows;

diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -897,13 +897,10 @@ def _rows_to_cols(self, content):
         # Check that there are no rows with too many
         # elements in their row (rows with too few
         # elements are padded with NaN).
-        # error: Non-overlapping identity check (left operand type: "List[int]",
+        # error: Non-overlapping identity check
+        # (left operand type: "List[int]",
         # right operand type: "Literal[False]")
-        if (
-            max_len > col_len
-            and self.index_col is not False  # type: ignore[comparison-overlap]
-            and self.usecols is None
-        ):
+        if max_len > col_len and self.usecols is None:
 
             footers = self.skipfooter if self.skipfooter else 0
             bad_lines = []
@@ -914,6 +911,10 @@ def _rows_to_cols(self, content):
 
             for (i, l) in iter_content:
                 actual_len = len(l)
+                # Check for and remove trailing delimiters (see gh-2442)
+                if actual_len == (col_len + 1) and l[-1] == "":
+                    l.pop()
+                    actual_len -= 1
 
                 if actual_len > col_len:
                     if self.error_bad_lines or self.warn_bad_lines:

diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
@@ -667,11 +667,10 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
 def test_no_header_two_extra_columns(all_parsers):
     # GH 26218
     column_names = ["one", "two", "three"]
-    ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
     stream = StringIO("foo,bar,baz,bam,blah")
     parser = all_parsers
-    df = parser.read_csv(stream, header=None, names=column_names, index_col=False)
-    tm.assert_frame_equal(df, ref)
+    with pytest.raises(ParserError, match="Expected 3 fields in line 1, saw 5"):
+        parser.read_csv(stream, header=None, names=column_names, index_col=False)
 
 
 def test_read_csv_names_not_accepting_sets(all_parsers):

diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pytest
 
+from pandas.errors import ParserError
+
 from pandas import (
     DataFrame,
     Index,
@@ -283,3 +285,20 @@ def test_multiindex_columns_index_col_with_data(all_parsers):
         index=Index(["data"]),
     )
     tm.assert_frame_equal(result, expected)
+
+
+def test_index_col_false_error(all_parsers):
+    # GH#40333
+    parser = all_parsers
+    with pytest.raises(ParserError, match="Expected 3 fields in line 2, saw 4"):
+        parser.read_csv(StringIO("a,b,c\n0,1,2,3\n1,2,3"), index_col=False)
+
+
+def test_index_col_false_error_ignore(all_parsers):
+    # GH#40333
+    parser = all_parsers
+    result = parser.read_csv(
+        StringIO("a,b,c\n0,1,2,3\n1,2,3"), index_col=False, error_bad_lines=False
+    )
+    expected = DataFrame({"a": [1], "b": [2], "c": [3]})
+    tm.assert_frame_equal(result, expected)