diff --git a/resources/tables/Transcoder_Cesu8_ByteArray.bin b/resources/tables/Transcoder_Cesu8_ByteArray.bin new file mode 100644 index 00000000..fa55e869 Binary files /dev/null and b/resources/tables/Transcoder_Cesu8_ByteArray.bin differ diff --git a/resources/tables/Transcoder_Cesu8_WordArray.bin b/resources/tables/Transcoder_Cesu8_WordArray.bin new file mode 100644 index 00000000..eb72d0e5 Binary files /dev/null and b/resources/tables/Transcoder_Cesu8_WordArray.bin differ diff --git a/resources/tables/Transcoder_Escape_ByteArray.bin b/resources/tables/Transcoder_Escape_ByteArray.bin index a425608e..22cd44c7 100644 Binary files a/resources/tables/Transcoder_Escape_ByteArray.bin and b/resources/tables/Transcoder_Escape_ByteArray.bin differ diff --git a/resources/tables/Transcoder_Escape_WordArray.bin b/resources/tables/Transcoder_Escape_WordArray.bin index 0d705633..e0ca9d15 100644 Binary files a/resources/tables/Transcoder_Escape_WordArray.bin and b/resources/tables/Transcoder_Escape_WordArray.bin differ diff --git a/resources/tables/Transcoder_SingleByte_ByteArray.bin b/resources/tables/Transcoder_SingleByte_ByteArray.bin index 2d5dd43e..a6846f65 100644 Binary files a/resources/tables/Transcoder_SingleByte_ByteArray.bin and b/resources/tables/Transcoder_SingleByte_ByteArray.bin differ diff --git a/resources/tables/Transcoder_SingleByte_WordArray.bin b/resources/tables/Transcoder_SingleByte_WordArray.bin index 538b7f29..61364d9d 100644 Binary files a/resources/tables/Transcoder_SingleByte_WordArray.bin and b/resources/tables/Transcoder_SingleByte_WordArray.bin differ diff --git a/scripts/generate.rb b/scripts/generate.rb index 6050b6a2..e630df29 100755 --- a/scripts/generate.rb +++ b/scripts/generate.rb @@ -65,6 +65,7 @@ def generate_encoding_list "KOI8-R" => "KOI8R", "KOI8-U" => "KOI8U", "Shift_JIS" => "SJIS", + "CESU-8" => "CESU8", "UTF-16BE" => "UTF16BE", "UTF-16LE" => "UTF16LE", "UTF-32BE" => "UTF32BE", @@ -78,7 +79,7 @@ def generate_encoding_list "Windows-1257" => "Windows_1257" } - defines, other = open("#{REPO_PATH}/encdb.h").read.tr('()', '').scan(/ENC_([A-Z_]+)(.*?);/m).reject{|a, b| b =~ /CESU/}.partition { |a, b| a =~ /DEFINE/ } + defines, other = open("#{REPO_PATH}/encdb.h").read.tr('()', '').scan(/ENC_([A-Z_]+)(.*?);/m).partition { |a, b| a =~ /DEFINE/ } other << ["ALIAS", "\"MS932\", \"Windows-31J\""] other << ["ALIAS", "\"UTF8\", \"UTF-8\""] @@ -93,7 +94,7 @@ def generate_transcoder_list generic_list = [] transcoder_list = [] - Dir["#{REPO_PATH}/enc/trans/*.c"].reject{|f| f =~ /transdb/ || f =~ /cesu/}.sort.each do |trans_file| + Dir["#{REPO_PATH}/enc/trans/*.c"].reject{|f| f =~ /transdb/}.sort.each do |trans_file| name = trans_file[/(\w+)\.c/, 1].split('_').map{|e| e.capitalize}.join("") trans_src = open(trans_file){|f|f.read} diff --git a/src/org/jcodings/EncodingList.java b/src/org/jcodings/EncodingList.java index 70f63545..35e64f84 100644 --- a/src/org/jcodings/EncodingList.java +++ b/src/org/jcodings/EncodingList.java @@ -28,6 +28,7 @@ static final void load() { EncodingDB.declare("Big5", "BIG5"); EncodingDB.declare("Big5-HKSCS", "Big5HKSCS"); EncodingDB.declare("Big5-UAO", "Big5UAO"); + EncodingDB.declare("CESU-8", "CESU8"); EncodingDB.declare("CP949", "CP949"); EncodingDB.declare("Emacs-Mule", "EmacsMule"); EncodingDB.declare("EUC-JP", "EUCJP"); @@ -68,6 +69,8 @@ static final void load() { EncodingDB.alias("BINARY", "ASCII-8BIT"); EncodingDB.replicate("IBM437", "ASCII-8BIT"); EncodingDB.alias("CP437", "IBM437"); + EncodingDB.replicate("IBM720", "ASCII-8BIT"); + EncodingDB.alias("CP720", "IBM720"); EncodingDB.replicate("IBM737", "ASCII-8BIT"); EncodingDB.alias("CP737", "IBM737"); EncodingDB.replicate("IBM775", "ASCII-8BIT"); @@ -205,6 +208,7 @@ public static Encoding getInstance(String name) { case "BIG5": return BIG5Encoding.INSTANCE; case "Big5HKSCS": return Big5HKSCSEncoding.INSTANCE; case "Big5UAO": return Big5UAOEncoding.INSTANCE; + case "CESU8": return CESU8Encoding.INSTANCE; case "CP949": return CP949Encoding.INSTANCE; case "EmacsMule": return EmacsMuleEncoding.INSTANCE; case "EUCJP": return EUCJPEncoding.INSTANCE; diff --git a/src/org/jcodings/specific/CESU8Encoding.java b/src/org/jcodings/specific/CESU8Encoding.java new file mode 100644 index 00000000..7773219d --- /dev/null +++ b/src/org/jcodings/specific/CESU8Encoding.java @@ -0,0 +1,430 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +package org.jcodings.specific; + +import org.jcodings.Config; +import org.jcodings.IntHolder; +import org.jcodings.ascii.AsciiTables; +import org.jcodings.exception.ErrorCodes; +import org.jcodings.unicode.UnicodeEncoding; + +public final class CESU8Encoding extends UnicodeEncoding { + static final boolean USE_INVALID_CODE_SCHEME = true; + + protected CESU8Encoding() { + super("CESU-8", 1, 6, CESU8EncLen, CESU8Trans); + } + + @Override + public String getCharsetName() { + return "CESU-8"; + } + + @Override + public int length(byte[] bytes, int p, int end) { + int b = bytes[p] & 0xff; + if (b <= 127) { + return 1; + } + int s = TransZero[b]; + if (s < 0) + return CHAR_INVALID; + return lengthForOneUptoSix(bytes, p, end, b, s); + } + + private int lengthForOneUptoSix(byte[] bytes, int p, int end, int b, int s) { + if (++p == end) { + return missing(b, 1); + } + s = Trans[s][bytes[p] & 0xff]; + if (s < 0) { + return s == A ? 2 : CHAR_INVALID; + } + if (++p == end) { + return missing(b, s == 4 ? 4 : TransZero[b] - 2); + } + s = Trans[s][bytes[p] & 0xff]; + if (s < 0) { + return s == A ? 3 : CHAR_INVALID; + } + if (++p == end) + return missing(b, 3); + s = Trans[s][bytes[p] & 0xff]; + if (s < 0) { + return s == A ? 4 : CHAR_INVALID; + } + if (++p == end) + return missing(b, 2); + s = Trans[s][bytes[p] & 0xff]; + if (s < 0) { + return s == A ? 5 : CHAR_INVALID; + } + if (++p == end) + return missing(b, 1); + s = Trans[s][bytes[p] & 0xff]; + return s == A ? 6 : CHAR_INVALID; + } + + @Override + public boolean isNewLine(byte[] bytes, int p, int end) { + if (p < end) { + if (bytes[p] == (byte) 0x0a) + return true; + + if (Config.USE_UNICODE_ALL_LINE_TERMINATORS) { + if (!Config.USE_CRNL_AS_LINE_TERMINATOR) { + if (bytes[p] == (byte) 0x0d) + return true; + } + + if (p + 1 < end) { + if (bytes[p + 1] == (byte) 0x85 && bytes[p] == (byte) 0xc2) + return true; + if (p + 2 < end) { + if ((bytes[p + 2] == (byte) 0xa8 || bytes[p + 2] == (byte) 0xa9) && + bytes[p + 1] == (byte) 0x80 && bytes[p] == (byte) 0xe2) + return true; + } + } + } + } + return false; + } + + private static final int INVALID_CODE_FE = 0xfffffffe; + private static final int INVALID_CODE_FF = 0xffffffff; + private static final int VALID_CODE_LIMIT = 0x0010ffff; + + @Override + public int codeToMbcLength(int code) { + if ((code & 0xffffff80) == 0) { + return 1; + } else if ((code & 0xfffff800) == 0) { + return 2; + } else if ((code & 0xffff0000) == 0) { + return 3; + } else if ((code & 0xFFFFFFFFL) <= VALID_CODE_LIMIT) { + return 6; + } else if (USE_INVALID_CODE_SCHEME && code == INVALID_CODE_FE) { + return 1; + } else if (USE_INVALID_CODE_SCHEME && code == INVALID_CODE_FF) { + return 1; + } else { + return ErrorCodes.ERR_TOO_BIG_WIDE_CHAR_VALUE; + } + } + + @Override + public int mbcToCode(byte[] bytes, int p, int end) { + int len = length(bytes, p, end); + int c = bytes[p] & 0xff; + + switch (len) { + case 1: + return c; + case 2: + return ((c & 0x1F) << 6) | (bytes[p + 1] & 0xff & 0x3f); + case 3: + return ((c & 0xF) << 12) | ((bytes[p + 1] & 0xff & 0x3f) << 6) | (bytes[p + 2] & 0xff & 0x3f); + case 6: + { + int high = ((c & 0xF) << 12) | ((bytes[p + 1] & 0xff & 0x3f) << 6) | (bytes[p + 2] & 0xff & 0x3f); + int low = ((bytes[p + 3] & 0xff & 0xF) << 12) | ((bytes[p + 4] & 0xff & 0x3f) << 6) | (bytes[p + 5] & 0xff & 0x3f); + return ((high & 0x03ff) << 10) + (low & 0x03ff) + 0x10000; + } + } + + if (USE_INVALID_CODE_SCHEME) { + if (c > 0xfd) { + return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF); + } + } + return c; + } + + static byte trailS(int code, int shift) { + return (byte) (((code >>> shift) & 0x3f) | 0x80); + } + + static byte trail0(int code) { + return (byte) ((code & 0x3f) | 0x80); + } + + static byte trailS(long code, int shift) { + return (byte) (((code >>> shift) & 0x3f) | 0x80); + } + + static byte trail0(long code) { + return (byte) ((code & 0x3f) | 0x80); + } + + @Override + public int codeToMbc(int code, byte[] bytes, int p) { + int p_ = p; + if ((code & 0xffffff80) == 0) { + bytes[p_] = (byte) code; + return 1; + } else { + if ((code & 0xfffff800) == 0) { + bytes[p_++] = (byte) (((code >>> 6) & 0x1f) | 0xc0); + } else if ((code & 0xffff0000) == 0) { + bytes[p_++] = (byte) (((code >>> 12) & 0x0f) | 0xe0); + bytes[p_++] = trailS(code, 6); + } else if ((code & 0xFFFFFFFFL) <= VALID_CODE_LIMIT) { + long high = (code >> 10) + 0xD7C0; + code = (code & 0x3FF) + 0xDC00; + bytes[p_++] = (byte)(((high>>12) & 0x0f) | 0xe0); + bytes[p_++] = trailS(high, 6); + bytes[p_++] = trail0(high); + bytes[p_++] = (byte)(((code>>12) & 0x0f) | 0xe0); + bytes[p_++] = trailS(code, 6); + } else if (USE_INVALID_CODE_SCHEME && code == INVALID_CODE_FE) { + bytes[p_] = (byte) 0xfe; + return 1; + } else if (USE_INVALID_CODE_SCHEME && code == INVALID_CODE_FF) { + bytes[p_] = (byte) 0xff; + return 1; + } else { + return ErrorCodes.ERR_TOO_BIG_WIDE_CHAR_VALUE; + } + bytes[p_++] = trail0(code); + return p_ - p; + } + } + + @Override + public int mbcCaseFold(int flag, byte[] bytes, IntHolder pp, int end, byte[] fold) { + int p = pp.value; + int foldP = 0; + + if (isMbcAscii(bytes[p])) { + + if (Config.USE_UNICODE_CASE_FOLD_TURKISH_AZERI) { + if ((flag & Config.CASE_FOLD_TURKISH_AZERI) != 0) { + if (bytes[p] == (byte) 0x49) { + fold[foldP++] = (byte) 0xc4; + fold[foldP] = (byte) 0xb1; + pp.value++; + return 2; + } + } + } + + fold[foldP] = AsciiTables.ToLowerCaseTable[bytes[p] & 0xff]; + pp.value++; + return 1; + } else { + return super.mbcCaseFold(flag, bytes, pp, end, fold); + } + } + + @Override + public int[] ctypeCodeRange(int ctype, IntHolder sbOut) { + sbOut.value = 0x80; + return super.ctypeCodeRange(ctype); + } + + private static boolean utf8IsLead(int c) { + return ((c & 0xc0) & 0xff) != 0x80; + } + + @Override + public int leftAdjustCharHead(byte[] bytes, int p, int s, int end) { + if (s <= p) + return s; + int p_ = s; + while (!utf8IsLead(bytes[p_] & 0xff) && p_ > p) + p_--; + return p_; + } + + @Override + public boolean isReverseMatchAllowed(byte[] bytes, int p, int end) { + return true; + } + + private static final int CESU8EncLen[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + }; + + static final int CESU8Trans[][] = new int[][] { + { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* c */ F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* e */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S3 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S4 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* a */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S5 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + /* 9 */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + /* a */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + /* b */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S6 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, 7, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S7 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + } + }; + + public static final CESU8Encoding INSTANCE = new CESU8Encoding(); +} diff --git a/src/org/jcodings/transcode/TranscodeFunctions.java b/src/org/jcodings/transcode/TranscodeFunctions.java index 4fef6d15..67e0f49a 100644 --- a/src/org/jcodings/transcode/TranscodeFunctions.java +++ b/src/org/jcodings/transcode/TranscodeFunctions.java @@ -11,6 +11,27 @@ public class TranscodeFunctions { public static final int BE = 1; public static final int LE = 2; + public static int funSoToCESU8(byte[] statep, byte[] s, int sStart, int l, byte[] o, int oStart, int osize) { + long scalar = ((s[0]&0x07)<<18) | ((s[1]&0x3F)<<12) | ((s[2]&0x3F)<< 6) | (s[3]&0x3F); + scalar -= 0x10000; + o[0] = (byte)0xED; + o[1] = (byte)(0xA0 | (scalar >> 16)); + o[2] = (byte)(0x80 | ((scalar >> 10) & 0x3F)); + o[3] = (byte)0xED; + o[4] = (byte)(0xB0 | ((scalar >> 6) & 0x0F)); + o[5] = (byte)(0x80 | (scalar & 0x3F)); + return 6; + } + + public static int funSoFromCESU8(byte[] statep, byte[] s, int sStart, int l, byte[] o, int oStart, int osize) { + long scalar = ( ((s[1]&0x0F)<<16) | ((s[2]&0x3F)<<10) | ((s[4]&0x0F)<< 6) | (s[5]&0x3F)) + 0x10000; + o[0] = (byte)(0xF0 | (scalar >> 18)); + o[1] = (byte)(0x80 | ((scalar >> 12) & 0x3F)); + o[2] = (byte)(0x80 | ((scalar >> 6) & 0x3F)); + o[3] = (byte)(0x80 | (scalar & 0x3F)); + return 4; + } + public static int funSoToUTF16(byte[] statep, byte[] sBytes, int sStart, int l, byte[] o, int oStart, int osize) { int sp = 0; if (statep[sp] == 0) { diff --git a/src/org/jcodings/transcode/TranscoderList.java b/src/org/jcodings/transcode/TranscoderList.java index 5f9f1aa7..a3e64982 100644 --- a/src/org/jcodings/transcode/TranscoderList.java +++ b/src/org/jcodings/transcode/TranscoderList.java @@ -34,6 +34,8 @@ static void load() { TranscoderDB.declare("UTF-8", "CP951", null /*To_CP951*/); TranscoderDB.declare("Big5-UAO", "UTF-8", null /*From_Big5_UAO*/); TranscoderDB.declare("UTF-8", "Big5-UAO", null /*To_Big5_UAO*/); + TranscoderDB.declare("CESU-8", "UTF-8", "From_CESU_8"); + TranscoderDB.declare("UTF-8", "CESU-8", "To_CESU_8"); TranscoderDB.declare("GB2312", "UTF-8", null /*From_GB2312*/); TranscoderDB.declare("GB12345", "UTF-8", null /*From_GB12345*/); TranscoderDB.declare("UTF-8", "GB2312", null /*To_GB2312*/); @@ -153,10 +155,12 @@ static void load() { TranscoderDB.declare("UTF-8", "WINDOWS-1257", null /*To_WINDOWS_1257*/); TranscoderDB.declare("IBM437", "UTF-8", null /*From_IBM437*/); TranscoderDB.declare("UTF-8", "IBM437", null /*To_IBM437*/); - TranscoderDB.declare("IBM775", "UTF-8", null /*From_IBM775*/); - TranscoderDB.declare("UTF-8", "IBM775", null /*To_IBM775*/); + TranscoderDB.declare("IBM720", "UTF-8", null /*From_IBM720*/); + TranscoderDB.declare("UTF-8", "IBM720", null /*To_IBM720*/); TranscoderDB.declare("IBM737", "UTF-8", null /*From_IBM737*/); TranscoderDB.declare("UTF-8", "IBM737", null /*To_IBM737*/); + TranscoderDB.declare("IBM775", "UTF-8", null /*From_IBM775*/); + TranscoderDB.declare("UTF-8", "IBM775", null /*To_IBM775*/); TranscoderDB.declare("IBM852", "UTF-8", null /*From_IBM852*/); TranscoderDB.declare("UTF-8", "IBM852", null /*To_IBM852*/); TranscoderDB.declare("IBM855", "UTF-8", null /*From_IBM855*/); @@ -260,7 +264,7 @@ static void load() { new GenericTranscoderEntry("UTF8-SoftBank", "SJIS-SoftBank", 84704, "EmojiSjisSoftbank", 1, 4, 2, AsciiCompatibility.CONVERTER, 0), new GenericTranscoderEntry("", "amp_escape", 8, "Escape", 1, 1, 5, AsciiCompatibility.CONVERTER, 0), new GenericTranscoderEntry("", "xml_text_escape", 32, "Escape", 1, 1, 5, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("", "xml_attr_content_escape", 60, "Escape", 1, 1, 6, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("", "xml_attr_content_escape", 64, "Escape", 1, 1, 6, AsciiCompatibility.CONVERTER, 0), new GenericTranscoderEntry("GBK", "UTF-8", 89284, "Gbk", 1, 2, 1, AsciiCompatibility.CONVERTER, 0), new GenericTranscoderEntry("UTF-8", "GBK", 182912, "Gbk", 1, 4, 2, AsciiCompatibility.CONVERTER, 0), new GenericTranscoderEntry("EUC-JP", "UTF-8", 54488, "JapaneseEuc", 1, 3, 1, AsciiCompatibility.CONVERTER, 0), @@ -335,63 +339,67 @@ static void load() { new GenericTranscoderEntry("UTF-8", "WINDOWS-1257", 24952, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), new GenericTranscoderEntry("IBM437", "UTF-8", 25476, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), new GenericTranscoderEntry("UTF-8", "IBM437", 26312, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM775", "UTF-8", 26836, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM775", 27480, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM737", "UTF-8", 28004, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM737", 28516, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM852", "UTF-8", 29040, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM852", 29656, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM855", "UTF-8", 30180, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM855", 30732, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM857", "UTF-8", 31248, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM857", 31760, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM860", "UTF-8", 32284, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM860", 32672, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM861", "UTF-8", 33196, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM861", 33508, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM862", "UTF-8", 34032, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM862", 34276, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM863", "UTF-8", 34800, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM863", 35180, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM865", "UTF-8", 35704, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM865", 36016, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM866", "UTF-8", 36540, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM866", 36996, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM869", "UTF-8", 37488, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM869", 38004, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("MACCROATIAN", "UTF-8", 38528, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "MACCROATIAN", 39360, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("MACCYRILLIC", "UTF-8", 39884, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "MACCYRILLIC", 40588, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("MACGREEK", "UTF-8", 41112, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "MACGREEK", 41812, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("MACICELAND", "UTF-8", 42336, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "MACICELAND", 43052, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("MACROMAN", "UTF-8", 43576, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "MACROMAN", 44060, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("MACROMANIA", "UTF-8", 44584, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "MACROMANIA", 44960, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("MACTURKISH", "UTF-8", 45480, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "MACTURKISH", 45836, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("MACUKRAINE", "UTF-8", 46360, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "MACUKRAINE", 46584, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("KOI8-U", "UTF-8", 47108, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "KOI8-U", 47892, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("KOI8-R", "UTF-8", 48416, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "KOI8-R", 48948, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("TIS-620", "UTF-8", 49312, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "TIS-620", 49356, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("CP850", "UTF-8", 49880, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "CP850", 50428, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("CP852", "UTF-8", 29040, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "CP852", 29656, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("CP855", "UTF-8", 30180, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "CP855", 30732, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM720", "UTF-8", 26808, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM720", 27288, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM737", "UTF-8", 27812, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM737", 28300, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM775", "UTF-8", 28824, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM775", 29468, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM852", "UTF-8", 29992, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM852", 30608, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM855", "UTF-8", 31132, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM855", 31684, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM857", "UTF-8", 32200, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM857", 32712, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM860", "UTF-8", 33236, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM860", 33624, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM861", "UTF-8", 34148, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM861", 34460, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM862", "UTF-8", 34984, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM862", 35228, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM863", "UTF-8", 35752, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM863", 36132, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM865", "UTF-8", 36656, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM865", 36968, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM866", "UTF-8", 37492, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM866", 37948, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM869", "UTF-8", 38440, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM869", 38956, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("MACCROATIAN", "UTF-8", 39480, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "MACCROATIAN", 40312, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("MACCYRILLIC", "UTF-8", 40836, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "MACCYRILLIC", 41540, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("MACGREEK", "UTF-8", 42064, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "MACGREEK", 42764, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("MACICELAND", "UTF-8", 43288, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "MACICELAND", 44004, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("MACROMAN", "UTF-8", 44528, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "MACROMAN", 45012, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("MACROMANIA", "UTF-8", 45536, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "MACROMANIA", 45912, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("MACTURKISH", "UTF-8", 46432, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "MACTURKISH", 46788, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("MACUKRAINE", "UTF-8", 47312, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "MACUKRAINE", 47536, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("KOI8-U", "UTF-8", 48060, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "KOI8-U", 48844, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("KOI8-R", "UTF-8", 49368, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "KOI8-R", 49900, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("TIS-620", "UTF-8", 50264, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "TIS-620", 50308, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("CP850", "UTF-8", 50832, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "CP850", 51380, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("CP852", "UTF-8", 29992, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "CP852", 30608, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("CP855", "UTF-8", 31132, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "CP855", 31684, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), new GenericTranscoderEntry("UTF-8", "UTF8-MAC", 52420, "Utf8Mac", 1, 4, 9, AsciiCompatibility.CONVERTER, 0) }; public static Transcoder getInstance(String name) { switch (name) { + case "From_CESU_8": return From_CESU_8_Transcoder.INSTANCE; + case "To_CESU_8": return To_CESU_8_Transcoder.INSTANCE; case "Iso2022jp_kddi_decoder": return Iso2022jp_kddi_decoder_Transcoder.INSTANCE; case "Iso2022jp_kddi_encoder": return Iso2022jp_kddi_encoder_Transcoder.INSTANCE; case "Escape_xml_attr_quote": return Escape_xml_attr_quote_Transcoder.INSTANCE; diff --git a/src/org/jcodings/transcode/specific/From_CESU_8_Transcoder.java b/src/org/jcodings/transcode/specific/From_CESU_8_Transcoder.java new file mode 100644 index 00000000..016a3de7 --- /dev/null +++ b/src/org/jcodings/transcode/specific/From_CESU_8_Transcoder.java @@ -0,0 +1,37 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +package org.jcodings.transcode.specific; + +import org.jcodings.transcode.AsciiCompatibility; +import org.jcodings.transcode.TranscodeFunctions; +import org.jcodings.transcode.Transcoder; + +public class From_CESU_8_Transcoder extends Transcoder { + protected From_CESU_8_Transcoder () { + super("CESU-8", "UTF-8", 148, "CESU8UTF8", 1, 6, 4, AsciiCompatibility.DECODER, 0); + } + + public static final Transcoder INSTANCE = new From_CESU_8_Transcoder(); + + @Override + public int startToOutput(byte[] statep, byte[] s, int sStart, int l, byte[] o, int oStart, int oSize) { + return TranscodeFunctions.funSoFromCESU8(statep, s, sStart, l, o, oStart, oSize); + } +} diff --git a/src/org/jcodings/transcode/specific/To_CESU_8_Transcoder.java b/src/org/jcodings/transcode/specific/To_CESU_8_Transcoder.java new file mode 100644 index 00000000..b0fb434e --- /dev/null +++ b/src/org/jcodings/transcode/specific/To_CESU_8_Transcoder.java @@ -0,0 +1,48 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +package org.jcodings.transcode.specific; + +import org.jcodings.transcode.AsciiCompatibility; +import org.jcodings.transcode.TranscodeFunctions; +import org.jcodings.transcode.Transcoder; + +public class To_CESU_8_Transcoder extends Transcoder { + protected To_CESU_8_Transcoder () { + super("UTF-8", "CESU-8", 240, "UTF8CESU8", 1, 4, 6, AsciiCompatibility.ENCODER, 1); + } + + public static final Transcoder INSTANCE = new To_CESU_8_Transcoder(); + + @Override + public boolean hasStateInit() { + return true; + } + + @Override + public int stateInit(byte[] statep) { + statep[0] = 0; + return 0; + } + + @Override + public int startToOutput(byte[] statep, byte[] s, int sStart, int l, byte[] o, int oStart, int oSize) { + return TranscodeFunctions.funSoToCESU8(statep, s, sStart, l, o, oStart, oSize); + } +} diff --git a/test/org/jcodings/specific/TestCESU8.java b/test/org/jcodings/specific/TestCESU8.java new file mode 100644 index 00000000..287ec92a --- /dev/null +++ b/test/org/jcodings/specific/TestCESU8.java @@ -0,0 +1,49 @@ +package org.jcodings.specific; + +import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertTrue; + +import org.jcodings.Encoding; +import org.jcodings.constants.CharacterType; +import java.nio.charset.StandardCharsets; +import org.junit.Test; + +public class TestCESU8 { + final Encoding enc = CESU8Encoding.INSTANCE; + + @Test + public void testUnicodeLength6byteChar() throws Exception { + byte[]bytes = "\u00ed\u00a0\u0080\u00ed\u00b0\u0080".getBytes("ISO-8859-1"); + assertEquals(1, enc.strLength(bytes, 0, bytes.length)); + } + + @Test + public void testUnicodeLength() throws Exception { + byte[]bytes = "test\u00C5\u0099".getBytes(); + assertEquals(6, enc.strLength(bytes, 0, bytes.length)); + } + + @Test + public void testUnicodeLengthLong() throws Exception { + byte[]bytes = ("\u00C5\u0099\u00C5\u00A1\u00C4\u009B\u00C5\u0099\u00C5\u00A1\u00C4\u009B\u00C5\u0099\u00C3\u00A9\u00C4" + + "\u009B\u00C3\u00BD\u00C5\u0099\u00C5\u00A1\u00C4\u009B\u00C3\u00A9\u00C4\u009B\u00C3\u00A9\u00C5\u00BE\u00C4\u009B\u00C5\u00A1" + + "\u00C3\u00A9\u00C5\u00BE\u00C4\u009B\u00C5\u00BE\u00C3\u00A9\u00C4\u009B\u00C5\u00A1").getBytes("ISO-8859-1"); + assertEquals(26, enc.strLength(bytes, 0, bytes.length)); + } + + @Test + public void testCodeToMbcLength() throws Exception { + assertEquals(enc.codeToMbcLength(0x01), 1); + assertEquals(enc.codeToMbcLength(0x1F608), 6); + } + + @Test + public void testMbcToCode() throws Exception { + assertEquals('Ø', enc.mbcToCode("mØØse".getBytes("UTF-8"), 1, 3)); + } + + @Test + public void testEncodingLoad() throws Exception { + assertEquals(CESU8Encoding.INSTANCE, Encoding.load("CESU8")); + } +}