Skip to content

Commit 6d95e5a

Browse files
committed
encoding/csv: add FieldPos method
This enables a consumer of a CSV to find out the position of a CSV field without implementing an intermediate buffer. This is useful to produce good higher level error messages when the CSV syntax is OK but the field values don't match expectations.

This also changes the existing semantics of the `ParseError.Column` field to bring it in line with precedent elsewhere in the Go standard library (notably go/token.Position) - the column is now 1-based and indicates a byte count rather than a rune count, and the error position reporting at the end of a last line without a newline is now fixed.

This change has some impact on performance:

```
name                                     old time/op  new time/op  delta
Read-8                                   2.14µs ± 0%  2.15µs ± 0%    ~     (p=0.056 n=5+5)
ReadWithFieldsPerRecord-8                2.15µs ± 2%  2.15µs ± 1%    ~     (p=0.151 n=5+5)
ReadWithoutFieldsPerRecord-8             2.15µs ± 0%  2.15µs ± 0%  +0.37%  (p=0.024 n=5+5)
ReadLargeFields-8                        3.55µs ± 2%  3.59µs ± 0%    ~     (p=0.206 n=5+5)
ReadReuseRecord-8                        1.18µs ± 1%  1.22µs ± 1%  +2.93%  (p=0.008 n=5+5)
ReadReuseRecordWithFieldsPerRecord-8     1.18µs ± 0%  1.21µs ± 0%  +2.54%  (p=0.008 n=5+5)
ReadReuseRecordWithoutFieldsPerRecord-8  1.18µs ± 0%  1.22µs ± 1%  +3.66%  (p=0.008 n=5+5)
ReadReuseRecordLargeFields-8             2.53µs ± 1%  2.57µs ± 1%  +1.70%  (p=0.008 n=5+5)
Write-8                                  1.02µs ± 1%  1.01µs ± 0%  -1.18%  (p=0.016 n=5+4)
```

Fixes #44221.

Change-Id: Id37c50fc396024eef406c5bad45380ecd414f5ea
Reviewed-on: https://go-review.googlesource.com/c/go/+/291290
Run-TryBot: Ian Lance Taylor <[email protected]>
TryBot-Result: Go Bot <[email protected]>
Reviewed-by: Ian Lance Taylor <[email protected]>
Trust: Paul Jolly <[email protected]>
1 parent 2c05ba4 commit 6d95e5a

File tree

2 files changed

+585
-395
lines changed

2 files changed

+585
-395
lines changed

src/encoding/csv/reader.go

Lines changed: 60 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ import (
6666
type ParseError struct {
6767
StartLine int // Line where the record starts
6868
Line int // Line where the error occurred
69-
Column int // Column (rune index) where the error occurred
69+
Column int // Column (1-based byte index) where the error occurred
7070
Err error // The actual error
7171
}
7272

@@ -162,6 +162,10 @@ type Reader struct {
162162
// The i'th field ends at offset fieldIndexes[i] in recordBuffer.
163163
fieldIndexes []int
164164

165+
// fieldPositions is an index of field positions for the
166+
// last record returned by Read.
167+
fieldPositions []position
168+
165169
// lastRecord is a record cache and only used when ReuseRecord == true.
166170
lastRecord []string
167171
}
@@ -192,6 +196,25 @@ func (r *Reader) Read() (record []string, err error) {
192196
return record, err
193197
}
194198

199+
// FieldPos returns the line and column corresponding to
200+
// the start of the field with the given index in the slice most recently
201+
// returned by Read. Numbering of lines and columns starts at 1;
202+
// columns are counted in bytes, not runes.
203+
//
204+
// If this is called with an out-of-bounds index, it panics.
205+
func (r *Reader) FieldPos(field int) (line, column int) {
206+
if field < 0 || field >= len(r.fieldPositions) {
207+
panic("out of range index passed to FieldPos")
208+
}
209+
p := &r.fieldPositions[field]
210+
return p.line, p.col
211+
}
212+
213+
// position holds the position of a field in the current line.
214+
type position struct {
215+
line, col int
216+
}
217+
195218
// ReadAll reads all the remaining records from r.
196219
// Each record is a slice of fields.
197220
// A successful call returns err == nil, not err == io.EOF. Because ReadAll is
@@ -260,7 +283,7 @@ func (r *Reader) readRecord(dst []string) ([]string, error) {
260283
}
261284

262285
// Read line (automatically skipping past empty lines and any comments).
263-
var line, fullLine []byte
286+
var line []byte
264287
var errRead error
265288
for errRead == nil {
266289
line, errRead = r.readLine()
@@ -272,7 +295,6 @@ func (r *Reader) readRecord(dst []string) ([]string, error) {
272295
line = nil
273296
continue // Skip empty lines
274297
}
275-
fullLine = line
276298
break
277299
}
278300
if errRead == io.EOF {
@@ -286,10 +308,20 @@ func (r *Reader) readRecord(dst []string) ([]string, error) {
286308
recLine := r.numLine // Starting line for record
287309
r.recordBuffer = r.recordBuffer[:0]
288310
r.fieldIndexes = r.fieldIndexes[:0]
311+
r.fieldPositions = r.fieldPositions[:0]
312+
pos := position{line: r.numLine, col: 1}
289313
parseField:
290314
for {
291315
if r.TrimLeadingSpace {
292-
line = bytes.TrimLeftFunc(line, unicode.IsSpace)
316+
i := bytes.IndexFunc(line, func(r rune) bool {
317+
return !unicode.IsSpace(r)
318+
})
319+
if i < 0 {
320+
i = len(line)
321+
pos.col -= lengthNL(line)
322+
}
323+
line = line[i:]
324+
pos.col += i
293325
}
294326
if len(line) == 0 || line[0] != '"' {
295327
// Non-quoted string field
@@ -303,48 +335,56 @@ parseField:
303335
// Check to make sure a quote does not appear in field.
304336
if !r.LazyQuotes {
305337
if j := bytes.IndexByte(field, '"'); j >= 0 {
306-
col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])])
338+
col := pos.col + j
307339
err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrBareQuote}
308340
break parseField
309341
}
310342
}
311343
r.recordBuffer = append(r.recordBuffer, field...)
312344
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
345+
r.fieldPositions = append(r.fieldPositions, pos)
313346
if i >= 0 {
314347
line = line[i+commaLen:]
348+
pos.col += i + commaLen
315349
continue parseField
316350
}
317351
break parseField
318352
} else {
319353
// Quoted string field
354+
fieldPos := pos
320355
line = line[quoteLen:]
356+
pos.col += quoteLen
321357
for {
322358
i := bytes.IndexByte(line, '"')
323359
if i >= 0 {
324360
// Hit next quote.
325361
r.recordBuffer = append(r.recordBuffer, line[:i]...)
326362
line = line[i+quoteLen:]
363+
pos.col += i + quoteLen
327364
switch rn := nextRune(line); {
328365
case rn == '"':
329366
// `""` sequence (append quote).
330367
r.recordBuffer = append(r.recordBuffer, '"')
331368
line = line[quoteLen:]
369+
pos.col += quoteLen
332370
case rn == r.Comma:
333371
// `",` sequence (end of field).
334372
line = line[commaLen:]
373+
pos.col += commaLen
335374
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
375+
r.fieldPositions = append(r.fieldPositions, fieldPos)
336376
continue parseField
337377
case lengthNL(line) == len(line):
338378
// `"\n` sequence (end of line).
339379
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
380+
r.fieldPositions = append(r.fieldPositions, fieldPos)
340381
break parseField
341382
case r.LazyQuotes:
342383
// `"` sequence (bare quote).
343384
r.recordBuffer = append(r.recordBuffer, '"')
344385
default:
345386
// `"*` sequence (invalid non-escaped quote).
346-
col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen])
347-
err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
387+
err = &ParseError{StartLine: recLine, Line: r.numLine, Column: pos.col - quoteLen, Err: ErrQuote}
348388
break parseField
349389
}
350390
} else if len(line) > 0 {
@@ -353,19 +393,23 @@ parseField:
353393
if errRead != nil {
354394
break parseField
355395
}
396+
pos.col += len(line)
356397
line, errRead = r.readLine()
398+
if len(line) > 0 {
399+
pos.line++
400+
pos.col = 1
401+
}
357402
if errRead == io.EOF {
358403
errRead = nil
359404
}
360-
fullLine = line
361405
} else {
362406
// Abrupt end of file (EOF or error).
363407
if !r.LazyQuotes && errRead == nil {
364-
col := utf8.RuneCount(fullLine)
365-
err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
408+
err = &ParseError{StartLine: recLine, Line: pos.line, Column: pos.col, Err: ErrQuote}
366409
break parseField
367410
}
368411
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
412+
r.fieldPositions = append(r.fieldPositions, fieldPos)
369413
break parseField
370414
}
371415
}
@@ -392,7 +436,12 @@ parseField:
392436
// Check or update the expected fields per record.
393437
if r.FieldsPerRecord > 0 {
394438
if len(dst) != r.FieldsPerRecord && err == nil {
395-
err = &ParseError{StartLine: recLine, Line: recLine, Err: ErrFieldCount}
439+
err = &ParseError{
440+
StartLine: recLine,
441+
Line: recLine,
442+
Column: 1,
443+
Err: ErrFieldCount,
444+
}
396445
}
397446
} else if r.FieldsPerRecord == 0 {
398447
r.FieldsPerRecord = len(dst)

0 commit comments

Comments
 (0)