pandas-dev · WillAyd · Nov 26, 2018 · Oct 18, 2018 · Oct 18, 2018 · Oct 19, 2018
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -236,6 +236,7 @@ Other Enhancements
 - Compatibility with Matplotlib 3.0 (:issue:`22790`).
 - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`)
 - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. See :ref:`timeseries.timezone_nonexsistent` (:issue:`8917`)
+- :func:`read_fwf` now accepts keyword ``infer_nrows`` (:issue:`15138`).
 
 .. _whatsnew_0240.api_breaking:
 

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -342,15 +342,22 @@
                          _engine_doc))
 
 _fwf_widths = """\
-colspecs : list of pairs (int, int) or 'infer'. optional
+colspecs : list of pairs (int, int) or 'infer', default 'infer'
     A list of pairs (tuples) giving the extents of the fixed-width
-    fields of each line as half-open intervals (i.e.,  [from, to[ ).
+    fields of each line as half-open intervals (i.e.,  [from, to) ).
     String value 'infer' can be used to instruct the parser to try
     detecting the column specifications from the first 100 rows of
-    the data which are not being skipped via skiprows (default='infer').
-widths : list of ints. optional
-    A list of field widths which can be used instead of 'colspecs' if
+    the data which are not being skipped via skiprows (default='infer').
+    Use the `infer_nrows` parameter to change the number of rows inspected.
+widths : list of ints, optional
+    A list of field widths which can be used instead of `colspecs` if
     the intervals are contiguous.
+infer_nrows : int, default 100
+    The number of rows to consider when letting the parser determine the
+    `colspecs`.
+
+    .. versionadded:: 0.24.0
+
 delimiter : str, default ``'\t' + ' '``
     Characters to consider as filler characters in the fixed-width file.
     Can be used to specify the filler character of the fields
@@ -527,6 +534,7 @@ def _read(filepath_or_buffer, kwds):
 
 _fwf_defaults = {
     'colspecs': 'infer',
+    'infer_nrows': 100,
     'widths': None,
 }
 
@@ -716,7 +724,8 @@ def parser_f(filepath_or_buffer,
 
 
 @Appender(_read_fwf_doc)
-def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, **kwds):
+def read_fwf(filepath_or_buffer, colspecs='infer', widths=None,
+             infer_nrows=100, **kwds):
     # Check input arguments.
     if colspecs is None and widths is None:
         raise ValueError("Must specify either colspecs or widths")
@@ -732,6 +741,7 @@ def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, **kwds):
             col += w
 
     kwds['colspecs'] = colspecs
+    kwds['infer_nrows'] = infer_nrows
     kwds['engine'] = 'python-fwf'
     return _read(filepath_or_buffer, kwds)
 
@@ -3362,13 +3372,15 @@ class FixedWidthReader(BaseIterator):
     A reader of fixed-width lines.
     """
 
-    def __init__(self, f, colspecs, delimiter, comment, skiprows=None):
+    def __init__(self, f, colspecs, delimiter, comment, skiprows=None,
+                 infer_nrows=100):
         self.f = f
         self.buffer = None
         self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t '
         self.comment = comment
         if colspecs == 'infer':
-            self.colspecs = self.detect_colspecs(skiprows=skiprows)
+            self.colspecs = self.detect_colspecs(skiprows=skiprows,
+                                                 infer_nrows=infer_nrows)
         else:
             self.colspecs = colspecs
 
@@ -3421,11 +3433,14 @@ def get_rows(self, n, skiprows=None):
         self.buffer = iter(buffer_rows)
         return detect_rows
 
-    def detect_colspecs(self, n=100, skiprows=None):
+    def detect_colspecs(self, n=None, skiprows=None, infer_nrows=100):
+        # legacy `n` overrides `infer_nrows` when truthy, see GH15138
         # Regex escape the delimiters
         delimiters = ''.join(r'\%s' % x for x in self.delimiter)
         pattern = re.compile('([^%s]+)' % delimiters)
-        rows = self.get_rows(n, skiprows)
+        if n:
+            infer_nrows = n
+        rows = self.get_rows(infer_nrows, skiprows)
         if not rows:
             raise EmptyDataError("No rows from which to infer column width")
         max_len = max(map(len, rows))
@@ -3464,8 +3479,10 @@ class FixedWidthFieldParser(PythonParser):
     def __init__(self, f, **kwds):
         # Support iterators, convert to a list.
         self.colspecs = kwds.pop('colspecs')
+        self.infer_nrows = kwds.pop('infer_nrows')
         PythonParser.__init__(self, f, **kwds)
 
     def _make_reader(self, f):
         self.data = FixedWidthReader(f, self.colspecs, self.delimiter,
-                                     self.comment, self.skiprows)
+                                     self.comment, self.skiprows,
+                                     self.infer_nrows)
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
@@ -143,6 +143,22 @@ def test_fwf_colspecs_None(self):
         expected = DataFrame([[123456, 456], [456789, 789]])
         tm.assert_frame_equal(result, expected)
 
+    def test_fwf_colspecs_infer_nrows(self):
+        # GH 15138
+        data = """\
+  1  2
+123 98
+"""
+        # infer_nrows == 1 should have colspec == [(2, 3), (5, 6)]
+        df = read_fwf(StringIO(data), header=None, infer_nrows=1)
+        expected = pd.DataFrame([[1, 2], [3, 8]])
+        tm.assert_frame_equal(df, expected)
+
+        # test for infer_nrows > number of rows
+        df = read_fwf(StringIO(data), header=None, infer_nrows=10)
+        expected = pd.DataFrame([[1, 2], [123, 98]])
+        tm.assert_frame_equal(df, expected)
+
     def test_fwf_regression(self):
         # GH 3594
         # turns out 'T060' is parsable as a datetime slice!