BUG: Fix multi-index colname references in read_csv c engine. (#42519)

jmcomie · web-flow · commit 2ec98626c3e7 · 2021-07-15T00:20:23.000-04:00
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -233,7 +233,7 @@ MultiIndex
 I/O
 ^^^
 - Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`)
--
+- Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`)
 -
 
 Period
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1280,6 +1280,8 @@ cdef class TextReader:
                 # generate extra (bogus) headers if there are more columns than headers
                 if j >= len(self.header[0]):
                     return j
+                elif self.has_mi_columns:
+                    return tuple(header_row[j] for header_row in self.header)
                 else:
                     return self.header[0][j]
             else:
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -257,3 +257,29 @@ def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
     result = parser.read_csv(StringIO(data), dtype=str)
     expected = DataFrame({"a": ["1"], "a.1": ["1"]})
     tm.assert_frame_equal(result, expected)
+
+
+def test_dtype_multi_index(all_parsers):
+    # GH 42446
+    parser = all_parsers
+    data = "A,B,B\nX,Y,Z\n1,2,3"
+
+    result = parser.read_csv(
+        StringIO(data),
+        header=list(range(2)),
+        dtype={
+            ("A", "X"): np.int32,
+            ("B", "Y"): np.int32,
+            ("B", "Z"): np.float32,
+        },
+    )
+
+    expected = DataFrame(
+        {
+            ("A", "X"): np.int32([1]),
+            ("B", "Y"): np.int32([2]),
+            ("B", "Z"): np.float32([3]),
+        }
+    )
+
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py
@@ -161,3 +161,29 @@ def test_converter_index_col_bug(all_parsers):
 
     xp = DataFrame({"B": [2, 4]}, index=Index([1, 3], name="A"))
     tm.assert_frame_equal(rs, xp)
+
+
+def test_converter_multi_index(all_parsers):
+    # GH 42446
+    parser = all_parsers
+    data = "A,B,B\nX,Y,Z\n1,2,3"
+
+    result = parser.read_csv(
+        StringIO(data),
+        header=list(range(2)),
+        converters={
+            ("A", "X"): np.int32,
+            ("B", "Y"): np.int32,
+            ("B", "Z"): np.float32,
+        },
+    )
+
+    expected = DataFrame(
+        {
+            ("A", "X"): np.int32([1]),
+            ("B", "Y"): np.int32([2]),
+            ("B", "Z"): np.float32([3]),
+        }
+    )
+
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
@@ -570,3 +570,23 @@ def test_str_nan_dropped(all_parsers):
     )
 
     tm.assert_frame_equal(result, expected)
+
+
+def test_nan_multi_index(all_parsers):
+    # GH 42446
+    parser = all_parsers
+    data = "A,B,B\nX,Y,Z\n1,2,inf"
+
+    result = parser.read_csv(
+        StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"}
+    )
+
+    expected = DataFrame(
+        {
+            ("A", "X"): [1],
+            ("B", "Y"): [2],
+            ("B", "Z"): [np.nan],
+        }
+    )
+
+    tm.assert_frame_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -233,7 +233,7 @@ MultiIndex`
`233`	`233`	`I/O`
`234`	`234`	`^^^`
`235`	`235`	- Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`)
`236`		`--`
	`236`	+- Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`)
`237`	`237`	`-`
`238`	`238`
`239`	`239`	`Period`