Skip to content

Commit f1daf0a

Browse files
committed
collate: fix Compare[String] funcs to use key comparisons
and add unit tests for the UCA Variable Weighting examples
1 parent d42948e commit f1daf0a

File tree

3 files changed

+133
-78
lines changed

3 files changed

+133
-78
lines changed

collate/collate.go

Lines changed: 16 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -102,90 +102,28 @@ func (b *Buffer) Reset() {
102102

103103
// Compare returns an integer comparing the two byte slices.
104104
// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
105+
// Note that this is slower than sorting with c.Sort(), because
106+
// a new Buffer is allocated on every call.
105107
func (c *Collator) Compare(a, b []byte) int {
106-
// TODO: skip identical prefixes once we have a fast way to detect if a rune is
107-
// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
108-
c.iter(0).SetInput(a)
109-
c.iter(1).SetInput(b)
110-
if res := c.compare(); res != 0 {
111-
return res
112-
}
113-
if !c.ignore[colltab.Identity] {
114-
return bytes.Compare(a, b)
115-
}
116-
return 0
108+
var (
109+
buf Buffer
110+
kA = c.Key(&buf, a)
111+
kB = c.Key(&buf, b)
112+
)
113+
return bytes.Compare(kA, kB)
117114
}
118115

119116
// CompareString returns an integer comparing the two strings.
120117
// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
118+
// Note that this is slower than sorting with c.Sort(), because
119+
// a new Buffer is allocated on every call.
121120
func (c *Collator) CompareString(a, b string) int {
122-
// TODO: skip identical prefixes once we have a fast way to detect if a rune is
123-
// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
124-
c.iter(0).SetInputString(a)
125-
c.iter(1).SetInputString(b)
126-
if res := c.compare(); res != 0 {
127-
return res
128-
}
129-
if !c.ignore[colltab.Identity] {
130-
if a < b {
131-
return -1
132-
} else if a > b {
133-
return 1
134-
}
135-
}
136-
return 0
137-
}
138-
139-
func compareLevel(f func(i *iter) int, a, b *iter) int {
140-
a.pce = 0
141-
b.pce = 0
142-
for {
143-
va := f(a)
144-
vb := f(b)
145-
if va != vb {
146-
if va < vb {
147-
return -1
148-
}
149-
return 1
150-
} else if va == 0 {
151-
break
152-
}
153-
}
154-
return 0
155-
}
156-
157-
func (c *Collator) compare() int {
158-
ia, ib := c.iter(0), c.iter(1)
159-
// Process primary level
160-
if c.alternate != altShifted {
161-
// TODO: implement script reordering
162-
if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 {
163-
return res
164-
}
165-
} else {
166-
// TODO: handle shifted
167-
}
168-
if !c.ignore[colltab.Secondary] {
169-
f := (*iter).nextSecondary
170-
if c.backwards {
171-
f = (*iter).prevSecondary
172-
}
173-
if res := compareLevel(f, ia, ib); res != 0 {
174-
return res
175-
}
176-
}
177-
// TODO: special case handling (Danish?)
178-
if !c.ignore[colltab.Tertiary] || c.caseLevel {
179-
if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 {
180-
return res
181-
}
182-
if !c.ignore[colltab.Quaternary] {
183-
if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 {
184-
return res
185-
}
186-
}
187-
}
188-
return 0
121+
var (
122+
buf Buffer
123+
kA = c.KeyFromString(&buf, a)
124+
kB = c.KeyFromString(&buf, b)
125+
)
126+
return bytes.Compare(kA, kB)
189127
}
190128

191129
// Key returns the collation key for str.

collate/collate_test.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,24 @@ func TestCompare(t *testing.T) {
450450
t.Errorf("%d: CompareString(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res)
451451
}
452452
}
453+
454+
c = New(language.MustParse("en-us-u-ka-posix-ks-level4"))
455+
if c.CompareString("deluge", "de luge") != -1 {
456+
t.Errorf("CompareString for 'deluge' vs 'de luge' in Shift-Trimmed mode should return -1 but returned %v", c.CompareString("deluge", "de luge"))
457+
}
458+
}
459+
460+
func TestKeyFromStringCompareForShiftTrimmed(t *testing.T) {
461+
var (
462+
c = New(language.MustParse("en-us-u-ka-posix-ks-level4"))
463+
buf Buffer
464+
kA = c.KeyFromString(&buf, "deluge")
465+
kB = c.KeyFromString(&buf, "de luge")
466+
)
467+
468+
if bytes.Compare(kA, kB) != -1 {
469+
t.Errorf("The Keys for 'deluge' should sort before the key for 'de luge' in Shift-Trimmed mode, but it compares as %v", bytes.Compare(kA, kB))
470+
}
453471
}
454472

455473
func TestNumeric(t *testing.T) {

collate/sort_test.go

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
package collate_test
66

77
import (
8+
"bytes"
89
"fmt"
10+
"strings"
911
"testing"
1012

1113
"golang.org/x/text/collate"
@@ -53,3 +55,100 @@ func TestSort(t *testing.T) {
5355
t.Errorf("found %s; want %s", res, want)
5456
}
5557
}
58+
59+
func TestSortStringsAndCompareString(t *testing.T) {
60+
for _, tt := range []struct {
61+
name string
62+
c *collate.Collator
63+
want []string
64+
}{
65+
{
66+
name: "English default options",
67+
c: collate.New(language.English),
68+
want: []string{
69+
"abc",
70+
"bcd",
71+
"ddd",
72+
},
73+
},
74+
{
75+
// From https://www.unicode.org/reports/tr10/#Variable_Weighting_Examples
76+
name: "Blanked",
77+
c: collate.New(language.MustParse("en-us-u-ka-blanked")),
78+
want: []string{
79+
"death",
80+
"de luge",
81+
"de-luge",
82+
"deluge",
83+
"de-luge",
84+
"de Luge",
85+
"de-Luge",
86+
"deLuge",
87+
"de-Luge",
88+
"demark",
89+
},
90+
},
91+
{
92+
// From https://www.unicode.org/reports/tr10/#Variable_Weighting_Examples
93+
name: "Shifted",
94+
c: collate.New(language.MustParse("en-us-u-ka-shifted")),
95+
want: []string{
96+
"death",
97+
"de luge",
98+
"de-luge",
99+
"de-luge",
100+
"deluge",
101+
"de Luge",
102+
"de-Luge",
103+
"de-Luge",
104+
"deLuge",
105+
"demark",
106+
},
107+
},
108+
{
109+
// From https://www.unicode.org/reports/tr10/#Variable_Weighting_Examples
110+
name: "Shift-Trimmed",
111+
c: collate.New(language.MustParse("en-us-u-ka-posix-ks-level4")),
112+
want: []string{
113+
"death",
114+
"deluge",
115+
"de luge",
116+
"de-luge",
117+
"de-luge",
118+
"deLuge",
119+
"de Luge",
120+
"de-Luge",
121+
"de-Luge",
122+
"demark",
123+
},
124+
},
125+
} {
126+
t.Run(tt.name, func(t *testing.T) {
127+
actual := make([]string, len(tt.want))
128+
copy(actual, tt.want)
129+
tt.c.SortStrings(actual)
130+
131+
p := func(v []string) string { return strings.Join(v, ", ") }
132+
if p(tt.want) != p(actual) {
133+
t.Errorf("SortStrings want: '%v'\n Got: '%v'", p(tt.want), p(actual))
134+
}
135+
136+
buf := collate.Buffer{}
137+
for i := 0; i < len(tt.want)-1; i++ {
138+
a, b := tt.want[i], tt.want[i+1]
139+
kA, kB := tt.c.KeyFromString(&buf, a), tt.c.KeyFromString(&buf, b)
140+
if bytes.Compare(kA, kB) > 0 {
141+
t.Errorf("KeyFromString for %v is bigger than for %v", a, b)
142+
}
143+
}
144+
145+
for i := 0; i < len(tt.want)-1; i++ {
146+
a, b := tt.want[i], tt.want[i+1]
147+
cmp := tt.c.CompareString(a, b)
148+
if cmp > 0 {
149+
t.Errorf("CompareString for '%v' vs '%v' is 1 when should be -1 or 0", a, b)
150+
}
151+
}
152+
})
153+
}
154+
}

0 commit comments

Comments
 (0)