Skip to content

Commit 1e8c723

Browse files
committed
feat:bufio add DiscardRunes and utf8 add RuneBytes
1 parent 760636d commit 1e8c723

File tree

3 files changed

+160
-0
lines changed

3 files changed

+160
-0
lines changed

src/bufio/bufio.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,31 @@ func (b *Reader) Discard(n int) (discarded int, err error) {
179179
}
180180
}
181181

182+
// DiscardRune skips the next n UTF-8 encoded runes, returning the number of bytes discarded.
183+
//
184+
// If Discard skips fewer than n runes, it also returns an error.
185+
// If 0 <= n <= b.Buffered(), Discard is guaranteed to succeed without
186+
// reading from the underlying io.Reader.
187+
func (b *Reader) DiscardRunes(n int) (discardedBytes int, err error) {
188+
if n < 0 {
189+
return 0, ErrNegativeCount
190+
}
191+
if n == 0 {
192+
return
193+
}
194+
for i := 0; i < n; i++ {
195+
for b.r+utf8.UTFMax > b.w && !utf8.FullRune(b.buf[b.r:b.w]) && b.err == nil && b.w-b.r < len(b.buf) {
196+
b.fill() // b.w-b.r < len(buf) => buffer is not full
197+
}
198+
199+
bytes := utf8.RuneBytes(b.buf[b.r:b.w])
200+
discardedBytes += bytes
201+
b.r += bytes
202+
}
203+
204+
return discardedBytes, nil
205+
}
206+
182207
// Read reads data into p.
183208
// It returns the number of bytes read into p.
184209
// The bytes are taken from at most one Read on the underlying Reader,

src/bufio/bufio_test.go

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1591,3 +1591,81 @@ func BenchmarkWriterFlush(b *testing.B) {
15911591
bw.Flush()
15921592
}
15931593
}
1594+
1595+
func TestReader_DiscardRunes(t *testing.T) {
1596+
type args struct {
1597+
runeLen int
1598+
}
1599+
tests := []struct {
1600+
name string
1601+
fields string
1602+
args args
1603+
wantByteLen int
1604+
wantErr bool
1605+
}{
1606+
{
1607+
name: "中",
1608+
fields: "中",
1609+
args: struct{ runeLen int }{runeLen: 1},
1610+
wantByteLen: 3,
1611+
wantErr: false,
1612+
},
1613+
{
1614+
name: "a中",
1615+
fields: "a中",
1616+
args: struct{ runeLen int }{runeLen: 2},
1617+
wantByteLen: 4,
1618+
wantErr: false,
1619+
},
1620+
{
1621+
name: "a中b",
1622+
fields: "a中b",
1623+
args: struct{ runeLen int }{runeLen: 3},
1624+
wantByteLen: 5,
1625+
wantErr: false,
1626+
},
1627+
{
1628+
name: "multi_chunk_4097",
1629+
fields: strings.Repeat("中", 4097),
1630+
args: struct{ runeLen int }{runeLen: 4097},
1631+
wantByteLen: 4097 * 3,
1632+
wantErr: false,
1633+
},
1634+
}
1635+
for _, tt := range tests {
1636+
t.Run(tt.name, func(t *testing.T) {
1637+
buf := bytes.NewBufferString(tt.fields)
1638+
b := NewReader(buf)
1639+
gotByteLen, err := b.DiscardRunes(tt.args.runeLen)
1640+
if (err != nil) != tt.wantErr {
1641+
t.Errorf("ByteLenOfRune() error = %v, wantErr %v", err, tt.wantErr)
1642+
return
1643+
}
1644+
if gotByteLen != tt.wantByteLen {
1645+
t.Errorf("ByteLenOfRune() gotByteLen = %v, want %v", gotByteLen, tt.wantByteLen)
1646+
}
1647+
})
1648+
}
1649+
}
1650+
1651+
func BenchmarkDiscardVsRead(b *testing.B) {
1652+
b.Run("DiscardRunes", func(b *testing.B) {
1653+
data := strings.Repeat("中", 4097)
1654+
for i := 0; i < b.N; i++ {
1655+
buf := bytes.NewBufferString(data)
1656+
b := NewReader(buf)
1657+
b.DiscardRunes(4097)
1658+
}
1659+
})
1660+
1661+
b.Run("readRuneForDiscard", func(b *testing.B) {
1662+
data := strings.Repeat("中", 4097)
1663+
for i := 0; i < b.N; i++ {
1664+
buf := bytes.NewBufferString(data)
1665+
b := NewReader(buf)
1666+
for i := 0; i < 4097; i++ {
1667+
b.ReadRune()
1668+
}
1669+
}
1670+
})
1671+
}

src/unicode/utf8/utf8.go

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,63 @@ func DecodeRune(p []byte) (r rune, size int) {
190190
return rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx), 4
191191
}
192192

193+
// RuneBytes unpacks the first UTF-8 encoding in p and returns its width in bytes.
194+
// If p is empty it returns 0. Otherwise, if
195+
// the encoding is invalid, it returns 1. Both are impossible
196+
// results for correct, non-empty UTF-8.
197+
//
198+
// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
199+
// out of range, or is not the shortest possible UTF-8 encoding for the
200+
// value. No other validation is performed.
201+
func RuneBytes(p []byte) (size int) {
202+
if p == nil {
203+
return 0
204+
}
205+
if len(p) == 0 {
206+
return 0
207+
}
208+
r := p[0]
209+
if r < RuneSelf {
210+
return 1
211+
}
212+
n := len(p)
213+
if n < 1 {
214+
return 0
215+
}
216+
p0 := p[0]
217+
x := first[p0]
218+
if x >= as {
219+
// The following code simulates an additional check for x == xx and
220+
// handling the ASCII and invalid cases accordingly. This mask-and-or
221+
// approach prevents an additional branch.
222+
return 1
223+
}
224+
sz := int(x & 7)
225+
accept := acceptRanges[x>>4]
226+
if n < sz {
227+
return 1
228+
}
229+
b1 := p[1]
230+
if b1 < accept.lo || accept.hi < b1 {
231+
return 1
232+
}
233+
if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
234+
return 2
235+
}
236+
b2 := p[2]
237+
if b2 < locb || hicb < b2 {
238+
return 1
239+
}
240+
if sz <= 3 {
241+
return 3
242+
}
243+
b3 := p[3]
244+
if b3 < locb || hicb < b3 {
245+
return 1
246+
}
247+
return 4
248+
}
249+
193250
// DecodeRuneInString is like DecodeRune but its input is a string. If s is
194251
// empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it
195252
// returns (RuneError, 1). Both are impossible results for correct, non-empty

0 commit comments

Comments
 (0)