Skip to content

Commit 7b4e820

Browse files
committed
Add test suite for ISO-2022-JP-MS encoding
As with ISO-2022-JP-KDDI, the main reference used to develop these tests was the behavior of the existing code. It would have been better to have some independent reference which we could cross-check our code against, but I couldn't find one.
1 parent a83ed21 commit 7b4e820

File tree

1 file changed

+190
-0
lines changed

1 file changed

+190
-0
lines changed
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
--TEST--
2+
Exhaustive test of ISO-2022-JP-MS text encoding
3+
--SKIPIF--
4+
<?php
5+
extension_loaded('mbstring') or die('skip mbstring not available');
6+
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
7+
?>
8+
--FILE--
9+
<?php
10+
srand(444); /* Make results consistent */
11+
include('encoding_tests.inc');
12+
mb_substitute_character(0x25); // '%'
13+
14+
/* Convert CP932's default Shift-JIS representation to kuten code.
 *
 * $bytes is a 2-byte Shift-JIS value held in one integer (lead byte in bits
 * 8-15, trail byte in bits 0-7). The result is an integer with the row byte
 * in bits 8-15 and the cell byte in bits 0-7, both offset by 0x21. */
function shiftJISDecode($bytes) {
    $lead = ($bytes >> 8) & 0xFF;
    $trail = $bytes & 0xFF;

    /* Lead bytes above 0x9F continue the numbering where the 0x81-0x9F
     * range stopped (31 lead bytes in), hence the 0xE0 - 31 base */
    $base = ($lead > 0x9F) ? (0xE0 - 31) : 0x81;
    /* Each lead byte covers two rows; this is the first of the pair */
    $row = (($lead - $base) << 1) + 0x21;

    /* Trail bytes above 0x9E belong to the second row of the pair */
    if ($trail > 0x9E) {
        return (($row + 1) << 8) + ($trail - 0x9F + 0x21);
    }

    /* Trail bytes start at 0x40 and skip 0x7F, so 0x80 is the 64th cell */
    if ($trail > 0x7F) {
        return ($row << 8) + ($trail - 0x80 + 63 + 0x21);
    }

    return ($row << 8) + ($trail - 0x40 + 0x21);
}
28+
29+
/* Read in the table of all characters in CP932.
 * Lines starting with '#' are skipped; any other line is used only if it
 * parses as two tab-separated hex numbers (Shift-JIS bytes, then codepoint). */
$cp932Chars = array(); /* 2-byte code from shiftJISDecode() -> UTF-32BE string */
/* The table is only read, so open it read-only; 'r+' would also demand
 * write access to the test data file */
$fp = fopen(realpath(__DIR__ . '/data/CP932.txt'), 'r');
while ($line = fgets($fp, 256)) {
    if ($line[0] == '#')
        continue;

    if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) == 2) {
        /* Skip entries whose byte value fits in a single byte */
        if ($bytes < 256)
            continue;
        /* ISO-2022-JP-MS only uses the first two ranges of MS vendor extensions */
        if ($bytes >= 0xFA00)
            continue;
        $cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('N', $codepoint);
    }
}
/* Release the handle once the table has been loaded */
fclose($fp);
45+
46+
/* Windows-932 has many cases where two different kuten codes map to the same
 * Unicode codepoints. The entries listed here are collected in their own
 * table, keyed the same way as $cp932Chars. */
$nonInvertible = array(); /* initialized explicitly, like the other tables */
foreach ([0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C, 0xEEF9] as $i) {
    $bytes = pack('n', shiftJISDecode($i));
    $nonInvertible[$bytes] = $cp932Chars[$bytes];
}
52+
53+
/* Add User Defined codes (which use ESC $ ( ? escape sequence)).
 * Codepoints 0xE000 onwards are laid out as 20 rows of 94 cells each; the
 * key is the packed 2-byte code, the value is the codepoint as UTF-32BE. */
$udcChars = array();
for ($cp = 0xE000; $cp < (0xE000 + (20 * 94)); $cp++) {
    $i = $cp - 0xE000;
    /* `/` yields a float in PHP; truncate it explicitly so that the shift
     * below works on an integer (shifting a fractional float is deprecated
     * as of PHP 8.1 and the notice would end up in the test output) */
    $bytes = (((int)($i / 94) + 0x7F - 0x5E) << 8) + (($i % 94) + 0x21);
    $udcChars[pack('n', $bytes)] = pack('N', $cp);
}
60+
61+
/* Read in table of all characters in JISX-0201 charset.
 * Lines starting with '#' are skipped; any other line is used only if it
 * parses as two tab-separated hex numbers (byte value, then codepoint). */
$jisx0201Chars = array(); /* JISX0201 -> UTF-32BE */
/* The table is only read, so open it read-only; 'r+' would also demand
 * write access to the test data file */
$fp = fopen(realpath(__DIR__ . '/data/JISX0201.txt'), 'r');
while ($line = fgets($fp, 256)) {
    if ($line[0] == '#')
        continue;

    if (sscanf($line, "0x%x\t0x%x", $byte, $codepoint) == 2)
        $jisx0201Chars[chr($byte)] = pack('N', $codepoint);
}
/* Release the handle once the table has been loaded */
fclose($fp);
71+
72+
/* Check that $from is identified as valid $encoding text and that it converts
 * to the UTF-32BE string $to. Unless $bothWays is false, also check that $to
 * converts back to $from (after normalizing $from as described below). */
function testValid($from, $to, $encoding, $bothWays = true) {
    identifyValidString($from, $encoding);
    convertValidString($from, $to, $encoding, 'UTF-32BE', false);

    if (!$bothWays) {
        return;
    }

    /* ESC ( B at the beginning is redundant, since ASCII mode is the default */
    if (strpos($from, "\x1B(B") === 0) {
        $from = substr($from, 3);
    }

    /* If the string switches to a different charset, it should switch back to
     * ASCII at the end */
    foreach (["\x1B\$B", "\x1B(I", "\x1B\$@", "\x1B\$(B", "\x1B\$(@", "\x1B\$(?"] as $escape) {
        if (strpos($from, $escape) !== false) {
            $from .= "\x1B(B";
            break;
        }
    }

    convertValidString($to, $from, 'UTF-32BE', $encoding, false);
}
88+
89+
/* Shorthand for testInvalidString() with UTF-32BE as the fourth argument */
function testInvalid($from, $to, $encoding) {
    $target = 'UTF-32BE';
    testInvalidString($from, $to, $encoding, $target);
}
92+
93+
/* Every 7-bit byte except ESC (0x1B, which begins the escape sequences) must
 * convert to the same codepoint, both bare and after ESC ( B or ESC ( J */
for ($i = 0; $i < 0x80; $i++) {
    if ($i != 0x1B) {
        $ascii = chr($i);
        $expected = "\x00\x00\x00" . $ascii;
        testValid($ascii, $expected, 'ISO-2022-JP-MS');
        foreach (["\x1B(B", "\x1B(J"] as $escape) {
            testValid($escape . $ascii, $expected, 'ISO-2022-JP-MS', false);
        }
    }
}

/* Bytes with the high bit set must be rejected in the same three situations */
for ($i = 0x80; $i < 256; $i++) {
    // We convert single bytes from 0xA1-0xDF as JIS X 0201 kana
    if ($i < 0xA1 || $i > 0xDF) {
        foreach (['', "\x1B(B", "\x1B(J"] as $escape) {
            testInvalid($escape . chr($i), "\x00\x00\x00%", 'ISO-2022-JP-MS');
        }
    }
}

echo "ASCII support OK\n";
110+
111+
/* All valid JIS X 0201 characters
 * Those with a 1 in the high bit are JIS X 0201 kana; each one is tested both
 * as a 7-bit byte following ESC ( I and as the bare 8-bit byte */
foreach ($jisx0201Chars as $jisx0201 => $utf32BE) {
    $code = ord($jisx0201);
    if ($code < 128) {
        continue;
    }
    testValid("\x1B(I" . chr($code - 128), $utf32BE, 'ISO-2022-JP-MS', false);
    testValid($jisx0201, $utf32BE, 'ISO-2022-JP-MS', false);
}

/* 8-bit bytes outside 0xA1-0xDF must be rejected after ESC ( I and ESC ( J */
for ($i = 0x80; $i < 256; $i++) {
    if ($i < 0xA1 || $i > 0xDF) {
        foreach (["\x1B(I", "\x1B(J"] as $escape) {
            testInvalid($escape . chr($i), "\x00\x00\x00%", 'ISO-2022-JP-MS');
        }
    }
}

echo "JIS X 0201 support OK\n";
129+
130+
/* Test every key of $validChars (encoded bytes => expected UTF-32BE string).
 * The keys are shuffled and consumed in random-sized groups of 5 to 10 (fewer
 * for the final group); each group is concatenated, preceded by $prefix and
 * passed to testValid() along with $bothWays. */
function testAllValidCharsWithPrefix($validChars, $prefix, $bothWays) {
    $remaining = array_keys($validChars);
    shuffle($remaining);

    while ($remaining) {
        $take = min(rand(5,10), count($remaining));
        $from = '';
        $to = '';
        for ($n = 0; $n < $take; $n++) {
            $char = array_pop($remaining);
            $from .= $char;
            $to .= $validChars[$char];
        }
        testValid($prefix . $from, $to, 'ISO-2022-JP-MS', $bothWays);
    }
}
144+
145+
/* Build the set of byte sequences which must NOT be reported as invalid,
 * then let findInvalidChars() fill in $invalidChars and $truncatedChars */
$validChars = $cp932Chars;
/* We allow ASCII/JIS X 0201 characters to appear even in JIS X 0208 mode */
for ($i = 0; $i <= 0x7F; $i++) {
    $validChars[chr($i)] = chr($i);
}
for ($i = 0xA1; $i <= 0xDF; $i++) {
    $validChars[chr($i)] = $jisx0201Chars[chr($i)];
}
$lenTable = map(range(0xE0, 0xFC), 2, map(range(0x81, 0x9F), 2));
findInvalidChars($validChars, $invalidChars, $truncatedChars, $lenTable);

/* The non-invertible codes are tested separately and in one direction only,
 * so take them out of the table used for the round-trip test */
foreach ($nonInvertible as $bytes => $char) {
    unset($cp932Chars[$bytes]);
}

testAllValidCharsWithPrefix($cp932Chars, "\x1B\$B", true);
testAllValidCharsWithPrefix($nonInvertible, "\x1B\$B", false);

foreach ($invalidChars as $invalid => $unused) {
    testInvalidString("\x1B\$B" . $invalid, "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
}
foreach ($truncatedChars as $truncated => $unused) {
    testInvalidString("\x1B\$B" . $truncated, "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
}

echo "JIS X 0208 (with MS extensions) support OK\n";
166+
167+
/* Build the set of byte sequences which must NOT be reported as invalid,
 * then let findInvalidChars() fill in $invalidChars and $truncatedChars */
$validChars = $udcChars;
/* ASCII/JIS X 0201 characters are also treated as valid here, after the
 * ESC $ ( ? escape sequence used for the user-defined codes */
for ($i = 0; $i <= 0x7F; $i++) {
    $validChars[chr($i)] = chr($i);
}
for ($i = 0xA1; $i <= 0xDF; $i++) {
    $validChars[chr($i)] = $jisx0201Chars[chr($i)];
}
$lenTable = map(range(0xE0, 0xFC), 2, map(range(0x81, 0x9F), 2));
findInvalidChars($validChars, $invalidChars, $truncatedChars, $lenTable);

testAllValidCharsWithPrefix($udcChars, "\x1B\$(?", true);

foreach ($invalidChars as $invalid => $unused) {
    testInvalidString("\x1B\$(?" . $invalid, "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
}
foreach ($truncatedChars as $truncated => $unused) {
    testInvalidString("\x1B\$(?" . $truncated, "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
}

echo "UDC support OK\n";
184+
185+
?>
186+
--EXPECT--
187+
ASCII support OK
188+
JIS X 0201 support OK
189+
JIS X 0208 (with MS extensions) support OK
190+
UDC support OK

0 commit comments

Comments
 (0)