Skip to content

bufio: add Reader.DiscardRune(n int) #47640

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions src/bufio/bufio.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,31 @@ func (b *Reader) Discard(n int) (discarded int, err error) {
}
}

// DiscardRunes skips the next n UTF-8 encoded runes, returning the number of
// bytes discarded.
//
// An invalid encoding is skipped as a single byte, which is the width
// utf8.DecodeRune reports for it.
//
// If n is negative, DiscardRunes returns 0 and ErrNegativeCount.
// If DiscardRunes skips fewer than n runes, it also returns the error that
// stopped it from buffering more input.
func (b *Reader) DiscardRunes(n int) (discardedBytes int, err error) {
	if n < 0 {
		return 0, ErrNegativeCount
	}
	for i := 0; i < n; i++ {
		// Keep filling until a complete rune is buffered, an error has been
		// recorded in b.err, or the buffer is full.
		for b.r+utf8.UTFMax > b.w && !utf8.FullRune(b.buf[b.r:b.w]) && b.err == nil && b.w-b.r < len(b.buf) {
			b.fill() // b.w-b.r < len(buf) => buffer is not full
		}

		if b.r == b.w {
			// Nothing is buffered, so the loop above stopped because b.err
			// was set (assuming b.buf is non-empty). Report that error
			// instead of silently counting zero-width "runes".
			// NOTE(review): clearing b.err here is meant to hand the pending
			// error to the caller exactly once — confirm it matches how the
			// other Reader methods consume b.err.
			err = b.err
			b.err = nil
			return discardedBytes, err
		}

		// Only the encoded width is needed; the rune value is ignored.
		_, size := utf8.DecodeRune(b.buf[b.r:b.w])
		discardedBytes += size
		b.r += size
	}

	return discardedBytes, nil
}

// Read reads data into p.
// It returns the number of bytes read into p.
// The bytes are taken from at most one Read on the underlying Reader,
Expand Down
78 changes: 78 additions & 0 deletions src/bufio/bufio_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1591,3 +1591,81 @@ func BenchmarkWriterFlush(b *testing.B) {
bw.Flush()
}
}

// TestReader_DiscardRunes checks that Reader.DiscardRunes reports the number
// of bytes occupied by the discarded runes, including input that is larger
// than a single buffer fill, and that invalid counts are rejected.
func TestReader_DiscardRunes(t *testing.T) {
	tests := []struct {
		name        string
		input       string // data served by the underlying reader
		runeLen     int    // number of runes to discard
		wantByteLen int    // expected number of bytes discarded
		wantErr     bool
	}{
		{name: "中", input: "中", runeLen: 1, wantByteLen: 3},
		{name: "a中", input: "a中", runeLen: 2, wantByteLen: 4},
		{name: "a中b", input: "a中b", runeLen: 3, wantByteLen: 5},
		{
			// 4097 three-byte runes cannot be buffered all at once, so the
			// buffer has to be filled more than once.
			name:        "multi_chunk_4097",
			input:       strings.Repeat("中", 4097),
			runeLen:     4097,
			wantByteLen: 4097 * 3,
		},
		{name: "zero_count", input: "中", runeLen: 0, wantByteLen: 0},
		{name: "negative_count", input: "中", runeLen: -1, wantByteLen: 0, wantErr: true},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			r := NewReader(bytes.NewBufferString(tt.input))
			gotByteLen, err := r.DiscardRunes(tt.runeLen)
			if (err != nil) != tt.wantErr {
				t.Errorf("DiscardRunes(%d) error = %v, wantErr %v", tt.runeLen, err, tt.wantErr)
				return
			}
			if gotByteLen != tt.wantByteLen {
				t.Errorf("DiscardRunes(%d) = %d bytes, want %d", tt.runeLen, gotByteLen, tt.wantByteLen)
			}
		})
	}
}

// BenchmarkDiscardVsRead compares skipping a run of multi-byte runes with
// Reader.DiscardRunes against calling Reader.ReadRune once per rune.
func BenchmarkDiscardVsRead(b *testing.B) {
	const runeCount = 4097
	data := strings.Repeat("中", runeCount)

	b.Run("DiscardRunes", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			r := NewReader(bytes.NewBufferString(data))
			// Fail loudly so a broken call cannot masquerade as a fast one.
			if _, err := r.DiscardRunes(runeCount); err != nil {
				b.Fatal(err)
			}
		}
	})

	b.Run("readRuneForDiscard", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			r := NewReader(bytes.NewBufferString(data))
			for j := 0; j < runeCount; j++ {
				if _, _, err := r.ReadRune(); err != nil {
					b.Fatal(err)
				}
			}
		}
	})
}
39 changes: 34 additions & 5 deletions src/unicode/utf8/utf8.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,23 +146,36 @@ func FullRuneInString(s string) bool {
// its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
// the encoding is invalid, it returns (RuneError, 1). Both are impossible
// results for correct, non-empty UTF-8.
// opts enables extended behavior. If the first option is the bool value true,
// only the size is computed and the rune value is not assembled; a first
// option of any other type causes a panic.
//
// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
// out of range, or is not the shortest possible UTF-8 encoding for the
// value. No other validation is performed.
func DecodeRune(p []byte) (r rune, size int) {
func DecodeRune(p []byte, opts ...interface{}) (r rune, size int) {
n := len(p)
if n < 1 {
return RuneError, 0
}
var runeByteFlag bool
{
if len(opts) > 0 {
runeByteFlag = opts[0].(bool)
}
}
p0 := p[0]
x := first[p0]
if x >= as {
size = 1
if runeByteFlag {
return
}
// The following code simulates an additional check for x == xx and
// handling the ASCII and invalid cases accordingly. This mask-and-or
// approach prevents an additional branch.
mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
return rune(p[0])&^mask | RuneError&mask, 1
r = rune(p[0])&^mask | RuneError&mask
return
}
sz := x & 7
accept := acceptRanges[x>>4]
Expand All @@ -174,20 +187,36 @@ func DecodeRune(p []byte) (r rune, size int) {
return RuneError, 1
}
if sz == 2 {
return rune(p0&mask2)<<6 | rune(b1&maskx), 2
size = 2
if runeByteFlag {
return
}
r = rune(p0&mask2)<<6 | rune(b1&maskx)
return
}
b2 := p[2]
if b2 < locb || hicb < b2 {
return RuneError, 1
}
if sz == 3 {
return rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3
size = 3
if runeByteFlag {
return
}
r = rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx)
return
}
b3 := p[3]
if b3 < locb || hicb < b3 {
return RuneError, 1
}
return rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx), 4

size = 4
if runeByteFlag {
return
}
r = rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx)
return
}

// DecodeRuneInString is like DecodeRune but its input is a string. If s is
Expand Down