diff --git a/src/bufio/bufio.go b/src/bufio/bufio.go index bbdfad4ba8f987..573c8eb2fce143 100644 --- a/src/bufio/bufio.go +++ b/src/bufio/bufio.go @@ -179,6 +179,31 @@ func (b *Reader) Discard(n int) (discarded int, err error) { } } +// DiscardRune skips the next n UTF-8 encoded runes, returning the number of bytes discarded. +// +// If Discard skips fewer than n runes, it also returns an error. +// If 0 <= n <= b.Buffered(), Discard is guaranteed to succeed without +// reading from the underlying io.Reader. +func (b *Reader) DiscardRunes(n int) (discardedBytes int, err error) { + if n < 0 { + return 0, ErrNegativeCount + } + if n == 0 { + return + } + for i := 0; i < n; i++ { + for b.r+utf8.UTFMax > b.w && !utf8.FullRune(b.buf[b.r:b.w]) && b.err == nil && b.w-b.r < len(b.buf) { + b.fill() // b.w-b.r < len(buf) => buffer is not full + } + + _, bytes := utf8.DecodeRune(b.buf[b.r:b.w], true) + discardedBytes += bytes + b.r += bytes + } + + return discardedBytes, nil +} + // Read reads data into p. // It returns the number of bytes read into p. // The bytes are taken from at most one Read on the underlying Reader, diff --git a/src/bufio/bufio_test.go b/src/bufio/bufio_test.go index ef0f6c834e8623..e7b01ae480020a 100644 --- a/src/bufio/bufio_test.go +++ b/src/bufio/bufio_test.go @@ -1591,3 +1591,81 @@ func BenchmarkWriterFlush(b *testing.B) { bw.Flush() } } + +func TestReader_DiscardRunes(t *testing.T) { + type args struct { + runeLen int + } + tests := []struct { + name string + fields string + args args + wantByteLen int + wantErr bool + }{ + { + name: "中", + fields: "中", + args: struct{ runeLen int }{runeLen: 1}, + wantByteLen: 3, + wantErr: false, + }, + { + name: "a中", + fields: "a中", + args: struct{ runeLen int }{runeLen: 2}, + wantByteLen: 4, + wantErr: false, + }, + { + name: "a中b", + fields: "a中b", + args: struct{ runeLen int }{runeLen: 3}, + wantByteLen: 5, + wantErr: false, + }, + { + name: "multi_chunk_4097", + fields: strings.Repeat("中", 4097), + args: struct{ runeLen int }{runeLen: 4097}, + wantByteLen: 4097 * 3, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + buf := bytes.NewBufferString(tt.fields) + b := NewReader(buf) + gotByteLen, err := b.DiscardRunes(tt.args.runeLen) + if (err != nil) != tt.wantErr { + t.Errorf("ByteLenOfRune() error = %v, wantErr %v", err, tt.wantErr) + return + } + if gotByteLen != tt.wantByteLen { + t.Errorf("ByteLenOfRune() gotByteLen = %v, want %v", gotByteLen, tt.wantByteLen) + } + }) + } +} + +func BenchmarkDiscardVsRead(b *testing.B) { + b.Run("DiscardRunes", func(b *testing.B) { + data := strings.Repeat("中", 4097) + for i := 0; i < b.N; i++ { + buf := bytes.NewBufferString(data) + b := NewReader(buf) + b.DiscardRunes(4097) + } + }) + + b.Run("readRuneForDiscard", func(b *testing.B) { + data := strings.Repeat("中", 4097) + for i := 0; i < b.N; i++ { + buf := bytes.NewBufferString(data) + b := NewReader(buf) + for i := 0; i < 4097; i++ { + b.ReadRune() + } + } + }) +} diff --git a/src/unicode/utf8/utf8.go b/src/unicode/utf8/utf8.go index 6ccd46437382bf..90479e449306fa 100644 --- a/src/unicode/utf8/utf8.go +++ b/src/unicode/utf8/utf8.go @@ -146,23 +146,36 @@ func FullRuneInString(s string) bool { // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if // the encoding is invalid, it returns (RuneError, 1). Both are impossible // results for correct, non-empty UTF-8. +// opts provides extended features, The first parameter indicates that only +// the byte size of rune is returned // // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is // out of range, or is not the shortest possible UTF-8 encoding for the // value. No other validation is performed. -func DecodeRune(p []byte) (r rune, size int) { +func DecodeRune(p []byte, opts ...interface{}) (r rune, size int) { n := len(p) if n < 1 { return RuneError, 0 } + var runeByteFlag bool + { + if len(opts) > 0 { + runeByteFlag = opts[0].(bool) + } + } p0 := p[0] x := first[p0] if x >= as { + size = 1 + if runeByteFlag { + return + } // The following code simulates an additional check for x == xx and // handling the ASCII and invalid cases accordingly. This mask-and-or // approach prevents an additional branch. mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF. - return rune(p[0])&^mask | RuneError&mask, 1 + r = rune(p[0])&^mask | RuneError&mask + return } sz := x & 7 accept := acceptRanges[x>>4] @@ -174,20 +187,36 @@ func DecodeRune(p []byte) (r rune, size int) { return RuneError, 1 } if sz == 2 { - return rune(p0&mask2)<<6 | rune(b1&maskx), 2 + size = 2 + if runeByteFlag { + return + } + r = rune(p0&mask2)<<6 | rune(b1&maskx) + return } b2 := p[2] if b2 < locb || hicb < b2 { return RuneError, 1 } if sz == 3 { - return rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3 + size = 3 + if runeByteFlag { + return + } + r = rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx) + return } b3 := p[3] if b3 < locb || hicb < b3 { return RuneError, 1 } - return rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx), 4 + + size = 4 + if runeByteFlag { + return + } + r = rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx) + return } // DecodeRuneInString is like DecodeRune but its input is a string. If s is