|
| 1 | +--TEST-- |
| 2 | +Exhaustive test of ISO-2022-JP-MS text encoding |
| 3 | +--SKIPIF-- |
<?php
/* This test needs mbstring, and is slow enough to honour SKIP_SLOW_TESTS */
if (!extension_loaded('mbstring')) {
    die('skip mbstring not available');
}
if (getenv("SKIP_SLOW_TESTS")) {
    die("skip slow test");
}
?>
| 8 | +--FILE-- |
| 9 | +<?php |
/* Seed the random number generator with a fixed value so that every run
 * exercises exactly the same sequence of test strings */
srand(444);
include 'encoding_tests.inc';
/* Use '%' as the substitution character for invalid input */
mb_substitute_character(0x25);
| 13 | + |
/**
 * Convert CP932's default Shift-JIS representation to a kuten code.
 *
 * @param int $bytes Two-byte Shift-JIS code packed into one integer
 *                   (lead byte in bits 8-15, trail byte in bits 0-7)
 * @return int The kuten code, packed into an integer the same way
 */
function shiftJISDecode($bytes) {
    $lead  = ($bytes >> 8) & 0xFF;
    $trail = $bytes & 0xFF;

    /* Lead bytes above 0x9F are numbered from 0xE0 (continuing at index 31);
     * all other lead bytes are numbered from 0x81 */
    $pairIndex = ($lead > 0x9F) ? $lead - 0xE0 + 31 : $lead - 0x81;

    /* Each lead byte covers two consecutive rows; this is the first of them */
    $row = ($pairIndex << 1) + 0x21;

    if ($trail > 0x9E) {
        /* Trail bytes from 0x9F upwards belong to the second row of the pair */
        return (($row + 1) << 8) + ($trail - 0x9F + 0x21);
    }

    /* Trail bytes from 0x40 start at cell 0; those from 0x80 resume at cell 63 */
    $cell = ($trail > 0x7F) ? $trail - 0x80 + 63 : $trail - 0x40;
    return ($row << 8) + ($cell + 0x21);
}
| 28 | + |
/* Read in the table of all characters in CP932.
 * Each data line has the form "0x<Shift-JIS bytes>\t0x<Unicode codepoint>";
 * lines starting with '#' are comments. */
$cp932Chars = array(); /* kuten code (2 bytes, big-endian) -> UTF-32BE string */
/* The table is only ever read, so open it read-only; 'r+' would fail if the
 * source tree is not writable */
$fp = fopen(__DIR__ . '/data/CP932.txt', 'r');
if ($fp === false)
    die("Could not open data/CP932.txt\n");
while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] === '#')
        continue;

    if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) == 2) {
        /* Single-byte entries are not part of the double-byte charset */
        if ($bytes < 256)
            continue;
        /* ISO-2022-JP-MS only uses the first two ranges of MS vendor extensions */
        if ($bytes >= 0xFA00)
            continue;
        $cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('N', $codepoint);
    }
}
fclose($fp);
| 45 | + |
/* In Windows-932, several kuten codes share a Unicode codepoint with another
 * kuten code. Collect the entries for the Shift-JIS codes listed here in a
 * separate table, keyed by their kuten bytes. */
foreach (array_map('shiftJISDecode', [
    0x8790, 0x8791, 0x8792,
    0x8795, 0x8796, 0x8797,
    0x879A, 0x879B, 0x879C,
    0xEEF9,
]) as $kuten) {
    $bytes = pack('n', $kuten);
    $nonInvertible[$bytes] = $cp932Chars[$bytes];
}
| 52 | + |
/* Add User Defined codes (which use ESC $ ( ? escape sequence)).
 * The 20 * 94 codepoints starting at U+E000 are laid out as 20 rows of 94
 * cells; both the row byte and the cell byte start at 0x21. */
$udcChars = array(); /* UDC bytes (2 bytes, big-endian) -> UTF-32BE string */
for ($cp = 0xE000; $cp < (0xE000 + (20 * 94)); $cp++) {
    $i = $cp - 0xE000;
    /* Use integer division: '/' yields a float, and feeding a fractional
     * float to '<<' raises a deprecation notice on PHP 8.1+ */
    $row = intdiv($i, 94) + 0x7F - 0x5E;
    $cell = ($i % 94) + 0x21;
    $bytes = ($row << 8) + $cell;
    $udcChars[pack('n', $bytes)] = pack('N', $cp);
}
| 60 | + |
/* Read in table of all characters in JISX-0201 charset.
 * Each data line has the form "0x<byte>\t0x<Unicode codepoint>"; lines
 * starting with '#' are comments. */
$jisx0201Chars = array(); /* JISX0201 -> UTF-32BE */
/* The table is only ever read, so open it read-only; 'r+' would fail if the
 * source tree is not writable */
$fp = fopen(__DIR__ . '/data/JISX0201.txt', 'r');
if ($fp === false)
    die("Could not open data/JISX0201.txt\n");
while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] === '#')
        continue;

    if (sscanf($line, "0x%x\t0x%x", $byte, $codepoint) == 2)
        $jisx0201Chars[chr($byte)] = pack('N', $codepoint);
}
fclose($fp);
| 71 | + |
/**
 * Check that $from is accepted as valid $encoding text and converts to the
 * UTF-32BE string $to; optionally also check the reverse conversion.
 *
 * @param string $from     Input bytes in $encoding
 * @param string $to       Expected UTF-32BE result
 * @param string $encoding Name of the encoding under test
 * @param bool   $bothWays If true, also convert $to back and compare against
 *                         a normalized form of $from
 */
function testValid($from, $to, $encoding, $bothWays = true) {
    identifyValidString($from, $encoding);
    convertValidString($from, $to, $encoding, 'UTF-32BE', false);

    if (!$bothWays)
        return;

    /* ESC ( B at the beginning is redundant, since ASCII mode is the default */
    if (substr($from, 0, 3) == "\x1B(B")
        $from = substr($from, 3);

    /* If the string switches to a different charset, it should switch back to
     * ASCII at the end */
    $modeSwitches = ["\x1B\$B", "\x1B(I", "\x1B\$@", "\x1B\$(B", "\x1B\$(@", "\x1B\$(?"];
    foreach ($modeSwitches as $escape) {
        if (strpos($from, $escape) !== false) {
            $from .= "\x1B(B";
            break;
        }
    }

    convertValidString($to, $from, 'UTF-32BE', $encoding, false);
}
| 88 | + |
/**
 * Shorthand for testInvalidString() with UTF-32BE as the other encoding.
 *
 * @param string $from     Invalid input bytes in $encoding
 * @param string $to       Expected UTF-32BE result
 * @param string $encoding Name of the encoding under test
 */
function testInvalid($from, $to, $encoding) {
    $targetEncoding = 'UTF-32BE';
    testInvalidString($from, $to, $encoding, $targetEncoding);
}
| 92 | + |
/* Every 7-bit byte except ESC maps to the same codepoint, whether it appears
 * bare or after an ESC ( B or ESC ( J escape sequence. Only the bare form is
 * also checked in the reverse direction. */
foreach (range(0x00, 0x7F) as $i) {
    if ($i == 0x1B)
        continue;
    $char = chr($i);
    $utf32 = "\x00\x00\x00" . $char;
    testValid($char, $utf32, 'ISO-2022-JP-MS');
    foreach (["\x1B(B", "\x1B(J"] as $escape)
        testValid($escape . $char, $utf32, 'ISO-2022-JP-MS', false);
}

/* Bytes with the high bit set are invalid in those same three contexts */
foreach (range(0x80, 0xFF) as $i) {
    if ($i >= 0xA1 && $i <= 0xDF) // We convert single bytes from 0xA1-0xDF as JIS X 0201 kana
        continue;
    foreach (['', "\x1B(B", "\x1B(J"] as $prefix)
        testInvalid($prefix . chr($i), "\x00\x00\x00%", 'ISO-2022-JP-MS');
}

echo "ASCII support OK\n";
| 110 | + |
/* All valid JIS X 0201 characters with a 1 in the high bit (the kana).
 * Each one is tried both as a 7-bit byte after ESC ( I and as a bare 8-bit
 * byte; neither form is checked in the reverse direction. */
foreach ($jisx0201Chars as $jisx0201 => $utf32BE) {
    $byte = ord($jisx0201);
    if ($byte < 128)
        continue;
    testValid("\x1B(I" . chr($byte - 128), $utf32BE, 'ISO-2022-JP-MS', false);
    testValid($jisx0201, $utf32BE, 'ISO-2022-JP-MS', false);
}

/* 8-bit bytes outside the kana range are invalid after ESC ( I and ESC ( J */
foreach (range(0x80, 0xFF) as $i) {
    if ($i >= 0xA1 && $i <= 0xDF)
        continue;
    foreach (["\x1B(I", "\x1B(J"] as $escape)
        testInvalid($escape . chr($i), "\x00\x00\x00%", 'ISO-2022-JP-MS');
}

echo "JIS X 0201 support OK\n";
| 129 | + |
/**
 * Run testValid() over every character in $validChars, grouped into randomly
 * ordered strings of 5 to 10 characters (the last string may be shorter).
 *
 * @param array  $validChars Map of encoded character => UTF-32BE string
 * @param string $prefix     Bytes prepended to every generated input string
 * @param bool   $bothWays   Passed through to testValid()
 */
function testAllValidCharsWithPrefix($validChars, $prefix, $bothWays) {
    $remaining = array_keys($validChars);
    shuffle($remaining);

    while (!empty($remaining)) {
        $length = min(rand(5,10), count($remaining));

        /* Take the last $length entries, in the order repeated array_pop()
         * calls would yield them */
        $batch = array_reverse(array_splice($remaining, -$length));

        $from = implode('', $batch);
        $to = '';
        foreach ($batch as $goodChar)
            $to .= $validChars[$goodChar];

        testValid($prefix . $from, $to, 'ISO-2022-JP-MS', $bothWays);
    }
}
| 144 | + |
/* Build the full set of byte sequences which are valid after ESC $ B: every
 * double-byte character from the CP932 table, plus ASCII and JIS X 0201 kana
 * bytes, which we allow to appear even in JIS X 0208 mode */
$validChars = $cp932Chars;
foreach (range(0x00, 0x7F) as $i)
    $validChars[chr($i)] = chr($i);
foreach (range(0xA1, 0xDF) as $i)
    $validChars[chr($i)] = $jisx0201Chars[chr($i)];
/* NOTE(review): these are the Shift-JIS lead byte ranges; confirm that this is
 * the length table findInvalidChars() should get for ISO-2022-JP-MS */
$lenTable = map(range(0xE0, 0xFC), 2, map(range(0x81, 0x9F), 2));
findInvalidChars($validChars, $invalidChars, $truncatedChars, $lenTable);

/* Characters which do not round-trip are checked in one direction only */
foreach (array_keys($nonInvertible) as $bytes)
    unset($cp932Chars[$bytes]);

testAllValidCharsWithPrefix($cp932Chars, "\x1B\$B", true);
testAllValidCharsWithPrefix($nonInvertible, "\x1B\$B", false);

/* First all invalid sequences, then all truncated ones */
foreach ([$invalidChars, $truncatedChars] as $badChars) {
    foreach (array_keys($badChars) as $bad)
        testInvalidString("\x1B\$B" . $bad, "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
}

echo "JIS X 0208 (with MS extensions) support OK\n";
| 166 | + |
/* Build the full set of byte sequences treated as valid after ESC $ ( ?:
 * every user-defined character, plus ASCII and JIS X 0201 kana bytes */
$validChars = $udcChars;
foreach (range(0x00, 0x7F) as $i)
    $validChars[chr($i)] = chr($i);
foreach (range(0xA1, 0xDF) as $i)
    $validChars[chr($i)] = $jisx0201Chars[chr($i)];
/* NOTE(review): these are the Shift-JIS lead byte ranges; confirm that this is
 * the length table findInvalidChars() should get for ISO-2022-JP-MS */
$lenTable = map(range(0xE0, 0xFC), 2, map(range(0x81, 0x9F), 2));
findInvalidChars($validChars, $invalidChars, $truncatedChars, $lenTable);

testAllValidCharsWithPrefix($udcChars, "\x1B\$(?", true);

/* First all invalid sequences, then all truncated ones */
foreach ([$invalidChars, $truncatedChars] as $badChars) {
    foreach (array_keys($badChars) as $bad)
        testInvalidString("\x1B\$(?" . $bad, "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
}

echo "UDC support OK\n";
| 184 | + |
| 185 | +?> |
| 186 | +--EXPECT-- |
| 187 | +ASCII support OK |
| 188 | +JIS X 0201 support OK |
| 189 | +JIS X 0208 (with MS extensions) support OK |
| 190 | +UDC support OK |
0 commit comments