From 1a34f2c7be1474e6b0879b22f03406fd6d1a1254 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Rosick=C3=BD?= Date: Sun, 30 Jan 2022 03:17:14 +0100 Subject: [PATCH] implement CESU-8 --- .../tables/Transcoder_Cesu8_ByteArray.bin | Bin 0 -> 766 bytes .../tables/Transcoder_Cesu8_WordArray.bin | Bin 0 -> 252 bytes .../tables/Transcoder_Escape_ByteArray.bin | Bin 1059 -> 1066 bytes .../tables/Transcoder_Escape_WordArray.bin | Bin 84 -> 88 bytes .../Transcoder_SingleByte_ByteArray.bin | Bin 25837 -> 26606 bytes .../Transcoder_SingleByte_WordArray.bin | Bin 50440 -> 51392 bytes scripts/generate.rb | 5 +- src/org/jcodings/EncodingList.java | 4 + src/org/jcodings/specific/CESU8Encoding.java | 430 ++++++++++++++++++ .../transcode/TranscodeFunctions.java | 21 + .../jcodings/transcode/TranscoderList.java | 118 ++--- .../specific/From_CESU_8_Transcoder.java | 37 ++ .../specific/To_CESU_8_Transcoder.java | 48 ++ test/org/jcodings/specific/TestCESU8.java | 49 ++ 14 files changed, 655 insertions(+), 57 deletions(-) create mode 100644 resources/tables/Transcoder_Cesu8_ByteArray.bin create mode 100644 resources/tables/Transcoder_Cesu8_WordArray.bin create mode 100644 src/org/jcodings/specific/CESU8Encoding.java create mode 100644 src/org/jcodings/transcode/specific/From_CESU_8_Transcoder.java create mode 100644 src/org/jcodings/transcode/specific/To_CESU_8_Transcoder.java create mode 100644 test/org/jcodings/specific/TestCESU8.java diff --git a/resources/tables/Transcoder_Cesu8_ByteArray.bin b/resources/tables/Transcoder_Cesu8_ByteArray.bin new file mode 100644 index 0000000000000000000000000000000000000000..fa55e8696907b8affc4c170f6e799e38b1887b8c GIT binary patch literal 766 zcmZQzVEWatpOHeafS?H*FjTyK%h<4XXgL_fjJN( literal 0 HcmV?d00001 diff --git a/resources/tables/Transcoder_Cesu8_WordArray.bin b/resources/tables/Transcoder_Cesu8_WordArray.bin new file mode 100644 index 0000000000000000000000000000000000000000..eb72d0e512e5fb6f054fd1129da78eeaced2c81b GIT binary patch literal 
252 zcmZQzV6bCgU|4KO>F0LAwM#n%8a2moou Zj}RKGd2@hrTYz{E5FcS+U}6T+X8=@l58wa* literal 0 HcmV?d00001 diff --git a/resources/tables/Transcoder_Escape_ByteArray.bin b/resources/tables/Transcoder_Escape_ByteArray.bin index a425608ec27b0ed84a3ae496065bd4faa2a2c9d2..22cd44c7b557206aeaac7683560be3f32237915a 100644 GIT binary patch delta 48 zcmZ3?v5JF*fq_MBvLds_MjcTmM&^lvBFrodtP>3n@-nFvmgbjOGpQvOHq)$ delta 37 ocmazDVPRlk5ShqgCe<{NfuZ>#0|S!?P(%ZW`573PWr4H{0CH~x!T4`tYqfq!NgRg8VJVxKiiA3kgcuA4CM#E0m<8Y&hM1!^gS%UFjw<*4u^ y4lx-{A#PvVQ9>;t``rYdod(Q2$EmePnQQ3RHDvTFZk-XFs%Y<;jO=vxp85gpbT*;@ delta 80 zcmV-W0I&b<&jIbp0R{j7Wa$8Zld)HTv4-eDlPOyxlm0yivm8hS0h2!{43i~Bo|D^7 m5tDEz1(Wb136oz$Jd;d7ACvG|1hc+G4gr(PM@zFFNzfi5Ngsy* diff --git a/resources/tables/Transcoder_SingleByte_WordArray.bin b/resources/tables/Transcoder_SingleByte_WordArray.bin index 538b7f2913cf098cce12afa718b27737b36be2c5..61364d9d53b21e01b0963ff119adc02f92f8d26c 100644 GIT binary patch delta 4452 zcmaJ^e^6A{6+ZhgFT~JjJCv+j$}q$rRT2q99om^XhFF4xAS%&SDOr(haR(7l0g<=_ z(L&sil|x}wh(HkKM+JF6L4gt^$Wo(|Iqd2Wrpryn_pt(&5WqR1S24YZ&sP zLfGeE_X2g}hy~sT8bmw9A8(50XH%SpZan@G=mC0xv%qWU+BL+_>7V>bUFkm_qCmGG8oS6cuG5GLI6`!@wg%|CNZUfePR>FavxA+yL$ZZNPKsL}`QWK76Vq z_-72d3aAwB({SH*yNMj&jsQo2W4s|_3CDZ+bI{YiKYVVQXj_CD`FnV;^Th>=c=?lh z>tQ^V147cg07SKV#`_QWFM*ZeL>6xk{{^^d!S4besPTHDRLo*QYY8KIr5fqNXHG<1 zH@HZ|>D1V?o0umQww{*Kwgxz1QwV^L79&~_90Mc(cwRXJ-UM66?W?!&2ZtZ!$b?@g zKSraT%XKB9!ebLW=8|72mugK9mVP4~sj9BaTVGN`#g2?1E;NK#; zizs8h=F@w!$yoT^Y%+v;Et_;FhMvGP`fTsR7@|^IG8u8OBLPn`4^#9P2uf7O%yT6v zqk<>X%<}WMh%#$AZd;0!<-_gUGNi0b4z!?|WVxb^H1u&x&K9Xe0v^d^2u9pBC36Q9 z?_9_QS@J4Cy~xF_p%X?HrW%ubee&u~L_nUrJXh@C-F&J-JW23uEQq+B12Cd6YF zu`PtETqLk6AFY^4Q?)x4LrxRrAlaNrH9i+g-O6&mTTEJV6k}kEVT|&>cu~=)7ctuX zXb+=RzS{YzPRe!g^zI;e6&7btgQCxob<&;@4$M8cDSr%IwYq+Sy88T-Xm66nLGE4@ z>06S&2AAoY&edDJPugqe%pD83W=o*5ChuLH9J>OZ5%$k7 zlU^O=*8DrxeI~?f^EYDtlYltIZvc?XLd3d&hj`nS#%Cl692rTMZnwFG7AHis-coF4$Ke@Ioyp8EU z2VQs&{5YG;hV?~L-##7B;>;CIyGD0)D?&wzM)3?)saSy23E1F7mG%pzkw6TP02px! 
zaX9t|YIV~qf45%NN&8LQ^IDV~=S_4Vy}YQ=FD@2VCpu^)IM7OT(8#ssaOt2)nJ&UM zJ=nnaiWkdqHlmVt4n2?`Tr%nDuoWsw#_nBYA3TFyB3t=eK$u9!osqyje(OU+7BK9;)=& z=1M{KOm$}WdquoQ#QQ|NM^>-F1z*5ZWmn|Y46~Zs7ew=8cHi}@FA;Hpy6ID|X(ON@ zjN*uETx@)Epn8KOJPuyOXcaH7fGuKo0b{VWC9NQ_@hiL)yZwevPS}X|+e$7dkCoSK z#ycyFN6K$Wm4)2LvDT_tqRSxk5MTp-v;}XtJ`9QSR*kCD%szu_YQT?!*Q8YDsWDY) z;5%V!QmUq5M*=ZG0$?Os^ATcZVCzUZQ1Or?IU?9rX_nW1h_~hno;VS}epMs#Iz1j+ zd8#T{UbhMF(rWf~1R2($9><*MtgZVn#4ardH_ZtSgJIoOywK;UZgEG8#qfic>V;hF zhzd^(gel}`y{{m|=Vfq(9KmuUIU=Udop^O~CnW-t{@ncbj^|--~i@CLwKZG{%PCh=SL{#Gv zK0d|v*eKptvr?*^=A@GbzFMQVuE+gW2i5Lg6X+!h^VE;Qw|n?*aPI-Tby!(_n^hT79h4=C5RSEb`_jiJAhHi4${m;Z1h{^s9e9dHEP+y!tV z!OOze1YbMrPcAlW{um}oE5dr=avy`{d&qd1ob(h?%Y1g6RJV3ZoEq7(63N+MBT=zU zTQb#T-P^P!4?Zp7rV`A6ZQ6Va7*-cg^lQG2e%^=+0m5KA&_)6=KmuS?cWFy4Y#s4| ziUxk9;m_RKR4KLG#Nm(Tx(+6ZNhc8tvx*$^qYSXote@!;EEFe9s___;^a_VEF&Dz7Q8k& z1EvU@C94C)?wG{A(5lm$?c#_xO)wJ;Y^(TatBdY8V7a+ zNL^#Rq9Z_l8H?F<+g*LDX&F&BDkdu0oro6p{rkj}2Vas$a@Pp5LnXUzf}j8WlDpCG zLHne;&^4A_d{bUJQmmX%Ue!t6BfPL{d4TCG;YSdDZH4PmJ#it0{uR64p9{PEGEBHr zdLz`3+4oT|)=HdXaUb<2xTbIo)9d@d`-pl&5fcK0AwCUlB!Ds8TYAfK@c3IeSD`q% zb97Q~56|xjn49u*qH|cV=w~>x=c=5Ni^u8mfq1s``dCtWiTXw{h9+OUbi`Xl$nVEI zZeIK=Rs!M=uwTrn#aDrI|80u@rhb#hC+@FffcQx4FW|}EUrPNI?0+^{PWckdC>Nd$ z&lu1OBi6Cs2wz&2?Y|(W<`4}|q7rgy579fFoH$Tq8S+M7ksQXpU(XN@m|M>fG(Jer zJFP@RaJjV%;rV_osdo|6sjT>uStq~pDeT+aI=EOGHhuFIxau8z1rFQrhY-$w=aZ!O z9QaF!=CkLAt?!E@wd!9&b;WJ$bQclFSp30?k5Fff8s|(zi)8Nqa&pc(Anykv9>sR9 aP-DfHusZ=K7GzcekGgJ8Gp-hA!T$j1D-|gK delta 3928 zcmaJ^Z%|a%6+ipbmuhLsOp(wXN*H4ZqohTLT0U5-NG&2EDfnJWuCCVOS|H3z6dvu25_>Y<)FTS-9(h zM%tCZ*1Qny`XZv;c~GgzevV|;_d!putE$-}$j4f*0+)k>uNB`7wUDBtOo zi}F4LiE?B2L-#<{p@ys}?8iaKThyWtb=ML*NVI1@?9^Iz*CHhJqVD`k&@lLLdfrvc zo~76zD8+$vpi~5Ui~}OrS@CUUUGYCO(w;ny+Pg~r@gt(baxU0=Oe*Z-*}dsf(OOO` zJt-Bn@nq=|sW_7T%fj>yDSR?=BM?~cTYCaGI#Pjob7q7%lq{d3l>(_GnnFRUOJ48*jY(4Xdta}dZG`F1%lXY8&PEYXd zvr7_nXNipdILkFXL=JTJdqB=8VyH6l82<+(o?7v=>V6M>f?F@1VDak`i2RIsyxJ7A zax>mYW0lwa9!&gpKsM}^U4rZ;XmcjyV{A2r$(zrEo#ru<*032@y*bot{X1ADy#(1> 
zx6EjVDS9eEG-3psu{I;&W+auAgEbjC704#&PmjUwYasN^7_0hafiDM-#6>@wOR(Y$ z0Q=3akWI2cQHe=A!S5LZlowlbG*a_0*Pn~gzSco>R^*_9^jqX8ez8o!%H%n`sHRA+ zkB{M1?XglzG*4a8a#MRy$ZHcsEq1Th7`y})FE56ChHtlPbCbh}TGfuUOu-bzwIcae zBr1wb&IXT!Jq4Nq5L6TaJ5gKh6LK)}%vCb*ur!PZ} zGjRq#vKSPT`G`h3pXQqWH5YYq>QbT$*_i$i)?NwcdlwxmFO0%mtxG)isd{`7p(!5O zN05nn?+P8lcg?W#(lTx8Cq!ldEzAa>@{GJW762^cH5Na4O9V_(S!W3|h*35-x~gPn?nc^rH+C=Mj#shE3= zs`&wU4JnT-_%`g0QVVy=Tk~dUN!xaVjr`x}JpisJ9F+WY06oEd0}B9q(v&Gno`#)$P4c!L z?7J~bdO7VX(N!yt_J>Hl!93F+BBy;u)Vmspt}$%?KG+Q8091QTSb@ykMsF{JuG5=m zZvct}O1lKZcduzANeSRF+ZuWM08xJ^hYkc9`m%GrQztMHP%fCPE$p>hnw9;^0#Ywbupbaz#8iCp$g!-@J5n2GxP;`NM%jO5QuA8tFzbPuM?_Gxy?GGh6^B zXZ{lGEc?A%Z5YzQQ(*OkEQfHrJXsDQ^Cx9_bDC%<4mvf%A?)X7nE3^4W>x=%2W{gE zszHZ9hm~)v+pKx^jVYpc0EIgM+rJwkYZ8d;-AcU!17IJ)*ZxWA-Ek%2gGCy{tpZr8 z?Y)C`h*#KA>6M8;ZsPmWQSX*HOjuFg+%G1F&z-uJ4SO`jQR}u9AG)5B9E0G!pnu)< LV(!FC<8b^JDWs%P diff --git a/scripts/generate.rb b/scripts/generate.rb index 6050b6a2..e630df29 100755 --- a/scripts/generate.rb +++ b/scripts/generate.rb @@ -65,6 +65,7 @@ def generate_encoding_list "KOI8-R" => "KOI8R", "KOI8-U" => "KOI8U", "Shift_JIS" => "SJIS", + "CESU-8" => "CESU8", "UTF-16BE" => "UTF16BE", "UTF-16LE" => "UTF16LE", "UTF-32BE" => "UTF32BE", @@ -78,7 +79,7 @@ def generate_encoding_list "Windows-1257" => "Windows_1257" } - defines, other = open("#{REPO_PATH}/encdb.h").read.tr('()', '').scan(/ENC_([A-Z_]+)(.*?);/m).reject{|a, b| b =~ /CESU/}.partition { |a, b| a =~ /DEFINE/ } + defines, other = open("#{REPO_PATH}/encdb.h").read.tr('()', '').scan(/ENC_([A-Z_]+)(.*?);/m).partition { |a, b| a =~ /DEFINE/ } other << ["ALIAS", "\"MS932\", \"Windows-31J\""] other << ["ALIAS", "\"UTF8\", \"UTF-8\""] @@ -93,7 +94,7 @@ def generate_transcoder_list generic_list = [] transcoder_list = [] - Dir["#{REPO_PATH}/enc/trans/*.c"].reject{|f| f =~ /transdb/ || f =~ /cesu/}.sort.each do |trans_file| + Dir["#{REPO_PATH}/enc/trans/*.c"].reject{|f| f =~ /transdb/}.sort.each do |trans_file| name = trans_file[/(\w+)\.c/, 
1].split('_').map{|e| e.capitalize}.join("") trans_src = open(trans_file){|f|f.read} diff --git a/src/org/jcodings/EncodingList.java b/src/org/jcodings/EncodingList.java index 70f63545..35e64f84 100644 --- a/src/org/jcodings/EncodingList.java +++ b/src/org/jcodings/EncodingList.java @@ -28,6 +28,7 @@ static final void load() { EncodingDB.declare("Big5", "BIG5"); EncodingDB.declare("Big5-HKSCS", "Big5HKSCS"); EncodingDB.declare("Big5-UAO", "Big5UAO"); + EncodingDB.declare("CESU-8", "CESU8"); EncodingDB.declare("CP949", "CP949"); EncodingDB.declare("Emacs-Mule", "EmacsMule"); EncodingDB.declare("EUC-JP", "EUCJP"); @@ -68,6 +69,8 @@ static final void load() { EncodingDB.alias("BINARY", "ASCII-8BIT"); EncodingDB.replicate("IBM437", "ASCII-8BIT"); EncodingDB.alias("CP437", "IBM437"); + EncodingDB.replicate("IBM720", "ASCII-8BIT"); + EncodingDB.alias("CP720", "IBM720"); EncodingDB.replicate("IBM737", "ASCII-8BIT"); EncodingDB.alias("CP737", "IBM737"); EncodingDB.replicate("IBM775", "ASCII-8BIT"); @@ -205,6 +208,7 @@ public static Encoding getInstance(String name) { case "BIG5": return BIG5Encoding.INSTANCE; case "Big5HKSCS": return Big5HKSCSEncoding.INSTANCE; case "Big5UAO": return Big5UAOEncoding.INSTANCE; + case "CESU8": return CESU8Encoding.INSTANCE; case "CP949": return CP949Encoding.INSTANCE; case "EmacsMule": return EmacsMuleEncoding.INSTANCE; case "EUCJP": return EUCJPEncoding.INSTANCE; diff --git a/src/org/jcodings/specific/CESU8Encoding.java b/src/org/jcodings/specific/CESU8Encoding.java new file mode 100644 index 00000000..7773219d --- /dev/null +++ b/src/org/jcodings/specific/CESU8Encoding.java @@ -0,0 +1,430 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, 
and to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +package org.jcodings.specific; + +import org.jcodings.Config; +import org.jcodings.IntHolder; +import org.jcodings.ascii.AsciiTables; +import org.jcodings.exception.ErrorCodes; +import org.jcodings.unicode.UnicodeEncoding; + +public final class CESU8Encoding extends UnicodeEncoding { + static final boolean USE_INVALID_CODE_SCHEME = true; + + protected CESU8Encoding() { + super("CESU-8", 1, 6, CESU8EncLen, CESU8Trans); + } + + @Override + public String getCharsetName() { + return "CESU-8"; + } + + @Override + public int length(byte[] bytes, int p, int end) { + int b = bytes[p] & 0xff; + if (b <= 127) { + return 1; + } + int s = TransZero[b]; + if (s < 0) + return CHAR_INVALID; + return lengthForOneUptoSix(bytes, p, end, b, s); + } + + private int lengthForOneUptoSix(byte[] bytes, int p, int end, int b, int s) { + if (++p == end) { + return missing(b, 1); + } + s = Trans[s][bytes[p] & 0xff]; + if (s < 0) { + return s == A ? 2 : CHAR_INVALID; + } + if (++p == end) { + return missing(b, s == 4 ? 4 : TransZero[b] - 2); + } + s = Trans[s][bytes[p] & 0xff]; + if (s < 0) { + return s == A ? 3 : CHAR_INVALID; + } + if (++p == end) + return missing(b, 3); + s = Trans[s][bytes[p] & 0xff]; + if (s < 0) { + return s == A ? 
4 : CHAR_INVALID; + } + if (++p == end) + return missing(b, 2); + s = Trans[s][bytes[p] & 0xff]; + if (s < 0) { + return s == A ? 5 : CHAR_INVALID; + } + if (++p == end) + return missing(b, 1); + s = Trans[s][bytes[p] & 0xff]; + return s == A ? 6 : CHAR_INVALID; + } + + @Override + public boolean isNewLine(byte[] bytes, int p, int end) { + if (p < end) { + if (bytes[p] == (byte) 0x0a) + return true; + + if (Config.USE_UNICODE_ALL_LINE_TERMINATORS) { + if (!Config.USE_CRNL_AS_LINE_TERMINATOR) { + if (bytes[p] == (byte) 0x0d) + return true; + } + + if (p + 1 < end) { + if (bytes[p + 1] == (byte) 0x85 && bytes[p] == (byte) 0xc2) + return true; + if (p + 2 < end) { + if ((bytes[p + 2] == (byte) 0xa8 || bytes[p + 2] == (byte) 0xa9) && + bytes[p + 1] == (byte) 0x80 && bytes[p] == (byte) 0xe2) + return true; + } + } + } + } + return false; + } + + private static final int INVALID_CODE_FE = 0xfffffffe; + private static final int INVALID_CODE_FF = 0xffffffff; + private static final int VALID_CODE_LIMIT = 0x0010ffff; + + @Override + public int codeToMbcLength(int code) { + if ((code & 0xffffff80) == 0) { + return 1; + } else if ((code & 0xfffff800) == 0) { + return 2; + } else if ((code & 0xffff0000) == 0) { + return 3; + } else if ((code & 0xFFFFFFFFL) <= VALID_CODE_LIMIT) { + return 6; + } else if (USE_INVALID_CODE_SCHEME && code == INVALID_CODE_FE) { + return 1; + } else if (USE_INVALID_CODE_SCHEME && code == INVALID_CODE_FF) { + return 1; + } else { + return ErrorCodes.ERR_TOO_BIG_WIDE_CHAR_VALUE; + } + } + + @Override + public int mbcToCode(byte[] bytes, int p, int end) { + int len = length(bytes, p, end); + int c = bytes[p] & 0xff; + + switch (len) { + case 1: + return c; + case 2: + return ((c & 0x1F) << 6) | (bytes[p + 1] & 0xff & 0x3f); + case 3: + return ((c & 0xF) << 12) | ((bytes[p + 1] & 0xff & 0x3f) << 6) | (bytes[p + 2] & 0xff & 0x3f); + case 6: + { + int high = ((c & 0xF) << 12) | ((bytes[p + 1] & 0xff & 0x3f) << 6) | (bytes[p + 2] & 0xff & 0x3f); + int low = 
((bytes[p + 3] & 0xff & 0xF) << 12) | ((bytes[p + 4] & 0xff & 0x3f) << 6) | (bytes[p + 5] & 0xff & 0x3f); + return ((high & 0x03ff) << 10) + (low & 0x03ff) + 0x10000; + } + } + + if (USE_INVALID_CODE_SCHEME) { + if (c > 0xfd) { + return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF); + } + } + return c; + } + + static byte trailS(int code, int shift) { + return (byte) (((code >>> shift) & 0x3f) | 0x80); + } + + static byte trail0(int code) { + return (byte) ((code & 0x3f) | 0x80); + } + + static byte trailS(long code, int shift) { + return (byte) (((code >>> shift) & 0x3f) | 0x80); + } + + static byte trail0(long code) { + return (byte) ((code & 0x3f) | 0x80); + } + + @Override + public int codeToMbc(int code, byte[] bytes, int p) { + int p_ = p; + if ((code & 0xffffff80) == 0) { + bytes[p_] = (byte) code; + return 1; + } else { + if ((code & 0xfffff800) == 0) { + bytes[p_++] = (byte) (((code >>> 6) & 0x1f) | 0xc0); + } else if ((code & 0xffff0000) == 0) { + bytes[p_++] = (byte) (((code >>> 12) & 0x0f) | 0xe0); + bytes[p_++] = trailS(code, 6); + } else if ((code & 0xFFFFFFFFL) <= VALID_CODE_LIMIT) { + long high = (code >> 10) + 0xD7C0; + code = (code & 0x3FF) + 0xDC00; + bytes[p_++] = (byte)(((high>>12) & 0x0f) | 0xe0); + bytes[p_++] = trailS(high, 6); + bytes[p_++] = trail0(high); + bytes[p_++] = (byte)(((code>>12) & 0x0f) | 0xe0); + bytes[p_++] = trailS(code, 6); + } else if (USE_INVALID_CODE_SCHEME && code == INVALID_CODE_FE) { + bytes[p_] = (byte) 0xfe; + return 1; + } else if (USE_INVALID_CODE_SCHEME && code == INVALID_CODE_FF) { + bytes[p_] = (byte) 0xff; + return 1; + } else { + return ErrorCodes.ERR_TOO_BIG_WIDE_CHAR_VALUE; + } + bytes[p_++] = trail0(code); + return p_ - p; + } + } + + @Override + public int mbcCaseFold(int flag, byte[] bytes, IntHolder pp, int end, byte[] fold) { + int p = pp.value; + int foldP = 0; + + if (isMbcAscii(bytes[p])) { + + if (Config.USE_UNICODE_CASE_FOLD_TURKISH_AZERI) { + if ((flag & Config.CASE_FOLD_TURKISH_AZERI) != 0) { 
+ if (bytes[p] == (byte) 0x49) { + fold[foldP++] = (byte) 0xc4; + fold[foldP] = (byte) 0xb1; + pp.value++; + return 2; + } + } + } + + fold[foldP] = AsciiTables.ToLowerCaseTable[bytes[p] & 0xff]; + pp.value++; + return 1; + } else { + return super.mbcCaseFold(flag, bytes, pp, end, fold); + } + } + + @Override + public int[] ctypeCodeRange(int ctype, IntHolder sbOut) { + sbOut.value = 0x80; + return super.ctypeCodeRange(ctype); + } + + private static boolean utf8IsLead(int c) { + return ((c & 0xc0) & 0xff) != 0x80; + } + + @Override + public int leftAdjustCharHead(byte[] bytes, int p, int s, int end) { + if (s <= p) + return s; + int p_ = s; + while (!utf8IsLead(bytes[p_] & 0xff) && p_ > p) + p_--; + return p_; + } + + @Override + public boolean isReverseMatchAllowed(byte[] bytes, int p, int end) { + return true; + } + + private static final int CESU8EncLen[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + }; + + static final int CESU8Trans[][] = new int[][] { + { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, 
A, A, A, + /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* c */ F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* e */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, 
F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S3 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S4 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, 
F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* a */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S5 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + /* 9 */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + /* a */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + /* b */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S6 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, 
F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, 7, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + }, + { /* S7 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, + /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F + } + }; + + public static final CESU8Encoding INSTANCE = new CESU8Encoding(); +} diff --git a/src/org/jcodings/transcode/TranscodeFunctions.java b/src/org/jcodings/transcode/TranscodeFunctions.java index 4fef6d15..67e0f49a 100644 --- a/src/org/jcodings/transcode/TranscodeFunctions.java +++ b/src/org/jcodings/transcode/TranscodeFunctions.java @@ -11,6 +11,27 @@ public class TranscodeFunctions { public static final int BE = 1; public static final int LE 
= 2; + public static int funSoToCESU8(byte[] statep, byte[] s, int sStart, int l, byte[] o, int oStart, int osize) { + long scalar = ((s[0]&0x07)<<18) | ((s[1]&0x3F)<<12) | ((s[2]&0x3F)<< 6) | (s[3]&0x3F); + scalar -= 0x10000; + o[0] = (byte)0xED; + o[1] = (byte)(0xA0 | (scalar >> 16)); + o[2] = (byte)(0x80 | ((scalar >> 10) & 0x3F)); + o[3] = (byte)0xED; + o[4] = (byte)(0xB0 | ((scalar >> 6) & 0x0F)); + o[5] = (byte)(0x80 | (scalar & 0x3F)); + return 6; + } + + public static int funSoFromCESU8(byte[] statep, byte[] s, int sStart, int l, byte[] o, int oStart, int osize) { + long scalar = ( ((s[1]&0x0F)<<16) | ((s[2]&0x3F)<<10) | ((s[4]&0x0F)<< 6) | (s[5]&0x3F)) + 0x10000; + o[0] = (byte)(0xF0 | (scalar >> 18)); + o[1] = (byte)(0x80 | ((scalar >> 12) & 0x3F)); + o[2] = (byte)(0x80 | ((scalar >> 6) & 0x3F)); + o[3] = (byte)(0x80 | (scalar & 0x3F)); + return 4; + } + public static int funSoToUTF16(byte[] statep, byte[] sBytes, int sStart, int l, byte[] o, int oStart, int osize) { int sp = 0; if (statep[sp] == 0) { diff --git a/src/org/jcodings/transcode/TranscoderList.java b/src/org/jcodings/transcode/TranscoderList.java index 5f9f1aa7..a3e64982 100644 --- a/src/org/jcodings/transcode/TranscoderList.java +++ b/src/org/jcodings/transcode/TranscoderList.java @@ -34,6 +34,8 @@ static void load() { TranscoderDB.declare("UTF-8", "CP951", null /*To_CP951*/); TranscoderDB.declare("Big5-UAO", "UTF-8", null /*From_Big5_UAO*/); TranscoderDB.declare("UTF-8", "Big5-UAO", null /*To_Big5_UAO*/); + TranscoderDB.declare("CESU-8", "UTF-8", "From_CESU_8"); + TranscoderDB.declare("UTF-8", "CESU-8", "To_CESU_8"); TranscoderDB.declare("GB2312", "UTF-8", null /*From_GB2312*/); TranscoderDB.declare("GB12345", "UTF-8", null /*From_GB12345*/); TranscoderDB.declare("UTF-8", "GB2312", null /*To_GB2312*/); @@ -153,10 +155,12 @@ static void load() { TranscoderDB.declare("UTF-8", "WINDOWS-1257", null /*To_WINDOWS_1257*/); TranscoderDB.declare("IBM437", "UTF-8", null /*From_IBM437*/); 
TranscoderDB.declare("UTF-8", "IBM437", null /*To_IBM437*/); - TranscoderDB.declare("IBM775", "UTF-8", null /*From_IBM775*/); - TranscoderDB.declare("UTF-8", "IBM775", null /*To_IBM775*/); + TranscoderDB.declare("IBM720", "UTF-8", null /*From_IBM720*/); + TranscoderDB.declare("UTF-8", "IBM720", null /*To_IBM720*/); TranscoderDB.declare("IBM737", "UTF-8", null /*From_IBM737*/); TranscoderDB.declare("UTF-8", "IBM737", null /*To_IBM737*/); + TranscoderDB.declare("IBM775", "UTF-8", null /*From_IBM775*/); + TranscoderDB.declare("UTF-8", "IBM775", null /*To_IBM775*/); TranscoderDB.declare("IBM852", "UTF-8", null /*From_IBM852*/); TranscoderDB.declare("UTF-8", "IBM852", null /*To_IBM852*/); TranscoderDB.declare("IBM855", "UTF-8", null /*From_IBM855*/); @@ -260,7 +264,7 @@ static void load() { new GenericTranscoderEntry("UTF8-SoftBank", "SJIS-SoftBank", 84704, "EmojiSjisSoftbank", 1, 4, 2, AsciiCompatibility.CONVERTER, 0), new GenericTranscoderEntry("", "amp_escape", 8, "Escape", 1, 1, 5, AsciiCompatibility.CONVERTER, 0), new GenericTranscoderEntry("", "xml_text_escape", 32, "Escape", 1, 1, 5, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("", "xml_attr_content_escape", 60, "Escape", 1, 1, 6, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("", "xml_attr_content_escape", 64, "Escape", 1, 1, 6, AsciiCompatibility.CONVERTER, 0), new GenericTranscoderEntry("GBK", "UTF-8", 89284, "Gbk", 1, 2, 1, AsciiCompatibility.CONVERTER, 0), new GenericTranscoderEntry("UTF-8", "GBK", 182912, "Gbk", 1, 4, 2, AsciiCompatibility.CONVERTER, 0), new GenericTranscoderEntry("EUC-JP", "UTF-8", 54488, "JapaneseEuc", 1, 3, 1, AsciiCompatibility.CONVERTER, 0), @@ -335,63 +339,67 @@ static void load() { new GenericTranscoderEntry("UTF-8", "WINDOWS-1257", 24952, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), new GenericTranscoderEntry("IBM437", "UTF-8", 25476, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), new GenericTranscoderEntry("UTF-8", "IBM437", 
26312, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM775", "UTF-8", 26836, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM775", 27480, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM737", "UTF-8", 28004, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM737", 28516, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM852", "UTF-8", 29040, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM852", 29656, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM855", "UTF-8", 30180, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM855", 30732, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM857", "UTF-8", 31248, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM857", 31760, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM860", "UTF-8", 32284, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM860", 32672, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM861", "UTF-8", 33196, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM861", 33508, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM862", "UTF-8", 34032, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM862", 34276, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM863", "UTF-8", 34800, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new 
GenericTranscoderEntry("UTF-8", "IBM863", 35180, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM865", "UTF-8", 35704, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM865", 36016, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM866", "UTF-8", 36540, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM866", 36996, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("IBM869", "UTF-8", 37488, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "IBM869", 38004, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("MACCROATIAN", "UTF-8", 38528, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "MACCROATIAN", 39360, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("MACCYRILLIC", "UTF-8", 39884, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "MACCYRILLIC", 40588, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("MACGREEK", "UTF-8", 41112, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "MACGREEK", 41812, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("MACICELAND", "UTF-8", 42336, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "MACICELAND", 43052, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("MACROMAN", "UTF-8", 43576, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "MACROMAN", 44060, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("MACROMANIA", "UTF-8", 44584, 
"SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "MACROMANIA", 44960, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("MACTURKISH", "UTF-8", 45480, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "MACTURKISH", 45836, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("MACUKRAINE", "UTF-8", 46360, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "MACUKRAINE", 46584, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("KOI8-U", "UTF-8", 47108, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "KOI8-U", 47892, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("KOI8-R", "UTF-8", 48416, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "KOI8-R", 48948, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("TIS-620", "UTF-8", 49312, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "TIS-620", 49356, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("CP850", "UTF-8", 49880, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "CP850", 50428, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("CP852", "UTF-8", 29040, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "CP852", 29656, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("CP855", "UTF-8", 30180, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), - new GenericTranscoderEntry("UTF-8", "CP855", 30732, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new 
GenericTranscoderEntry("IBM720", "UTF-8", 26808, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM720", 27288, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM737", "UTF-8", 27812, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM737", 28300, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM775", "UTF-8", 28824, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM775", 29468, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM852", "UTF-8", 29992, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM852", 30608, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM855", "UTF-8", 31132, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM855", 31684, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM857", "UTF-8", 32200, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM857", 32712, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM860", "UTF-8", 33236, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM860", 33624, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM861", "UTF-8", 34148, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM861", 34460, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM862", "UTF-8", 34984, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM862", 35228, "SingleByte", 1, 4, 1, 
AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM863", "UTF-8", 35752, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM863", 36132, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM865", "UTF-8", 36656, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM865", 36968, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM866", "UTF-8", 37492, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM866", 37948, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("IBM869", "UTF-8", 38440, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "IBM869", 38956, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("MACCROATIAN", "UTF-8", 39480, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "MACCROATIAN", 40312, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("MACCYRILLIC", "UTF-8", 40836, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "MACCYRILLIC", 41540, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("MACGREEK", "UTF-8", 42064, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "MACGREEK", 42764, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("MACICELAND", "UTF-8", 43288, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "MACICELAND", 44004, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("MACROMAN", "UTF-8", 44528, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new 
GenericTranscoderEntry("UTF-8", "MACROMAN", 45012, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("MACROMANIA", "UTF-8", 45536, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "MACROMANIA", 45912, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("MACTURKISH", "UTF-8", 46432, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "MACTURKISH", 46788, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("MACUKRAINE", "UTF-8", 47312, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "MACUKRAINE", 47536, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("KOI8-U", "UTF-8", 48060, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "KOI8-U", 48844, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("KOI8-R", "UTF-8", 49368, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "KOI8-R", 49900, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("TIS-620", "UTF-8", 50264, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "TIS-620", 50308, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("CP850", "UTF-8", 50832, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "CP850", 51380, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("CP852", "UTF-8", 29992, "SingleByte", 1, 1, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "CP852", 30608, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("CP855", "UTF-8", 31132, "SingleByte", 1, 1, 1, 
AsciiCompatibility.CONVERTER, 0), + new GenericTranscoderEntry("UTF-8", "CP855", 31684, "SingleByte", 1, 4, 1, AsciiCompatibility.CONVERTER, 0), new GenericTranscoderEntry("UTF-8", "UTF8-MAC", 52420, "Utf8Mac", 1, 4, 9, AsciiCompatibility.CONVERTER, 0) }; public static Transcoder getInstance(String name) { switch (name) { + case "From_CESU_8": return From_CESU_8_Transcoder.INSTANCE; + case "To_CESU_8": return To_CESU_8_Transcoder.INSTANCE; case "Iso2022jp_kddi_decoder": return Iso2022jp_kddi_decoder_Transcoder.INSTANCE; case "Iso2022jp_kddi_encoder": return Iso2022jp_kddi_encoder_Transcoder.INSTANCE; case "Escape_xml_attr_quote": return Escape_xml_attr_quote_Transcoder.INSTANCE; diff --git a/src/org/jcodings/transcode/specific/From_CESU_8_Transcoder.java b/src/org/jcodings/transcode/specific/From_CESU_8_Transcoder.java new file mode 100644 index 00000000..016a3de7 --- /dev/null +++ b/src/org/jcodings/transcode/specific/From_CESU_8_Transcoder.java @@ -0,0 +1,37 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+package org.jcodings.transcode.specific;
+
+import org.jcodings.transcode.AsciiCompatibility;
+import org.jcodings.transcode.TranscodeFunctions;
+import org.jcodings.transcode.Transcoder;
+/** Transcoder for the CESU-8 to UTF-8 direction; the conversion itself is delegated to TranscodeFunctions.funSoFromCESU8. */
+public class From_CESU_8_Transcoder extends Transcoder {
+    protected From_CESU_8_Transcoder () { /* NOTE(review): key "CESU8UTF8" differs from the Transcoder_Cesu8_*.bin tables this patch adds - confirm it resolves */
+        super("CESU-8", "UTF-8", 148, "CESU8UTF8", 1, 6, 4, AsciiCompatibility.DECODER, 0);
+    }
+    /** Shared instance; TranscoderList.getInstance returns it for the name "From_CESU_8". */
+    public static final Transcoder INSTANCE = new From_CESU_8_Transcoder();
+    /** Forwards every argument unchanged to TranscodeFunctions.funSoFromCESU8 and returns its result. */
+    @Override
+    public int startToOutput(byte[] statep, byte[] s, int sStart, int l, byte[] o, int oStart, int oSize) {
+        return TranscodeFunctions.funSoFromCESU8(statep, s, sStart, l, o, oStart, oSize);
+    }
+}
diff --git a/src/org/jcodings/transcode/specific/To_CESU_8_Transcoder.java b/src/org/jcodings/transcode/specific/To_CESU_8_Transcoder.java
new file mode 100644
index 00000000..b0fb434e
--- /dev/null
+++ b/src/org/jcodings/transcode/specific/To_CESU_8_Transcoder.java
@@ -0,0 +1,48 @@
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+package org.jcodings.transcode.specific;
+
+import org.jcodings.transcode.AsciiCompatibility;
+import org.jcodings.transcode.TranscodeFunctions;
+import org.jcodings.transcode.Transcoder;
+/** Transcoder for the UTF-8 to CESU-8 direction; the conversion itself is delegated to TranscodeFunctions.funSoToCESU8. */
+public class To_CESU_8_Transcoder extends Transcoder {
+    protected To_CESU_8_Transcoder () { /* NOTE(review): key "UTF8CESU8" differs from the Transcoder_Cesu8_*.bin tables this patch adds - confirm it resolves */
+        super("UTF-8", "CESU-8", 240, "UTF8CESU8", 1, 4, 6, AsciiCompatibility.ENCODER, 1);
+    }
+    /** Shared instance; TranscoderList.getInstance returns it for the name "To_CESU_8". */
+    public static final Transcoder INSTANCE = new To_CESU_8_Transcoder();
+    /** Always true: this class supplies its own stateInit implementation (below). */
+    @Override
+    public boolean hasStateInit() {
+        return true;
+    }
+    /** Clears the first state byte and returns 0. */
+    @Override
+    public int stateInit(byte[] statep) {
+        statep[0] = 0;
+        return 0;
+    }
+    /** Forwards every argument unchanged to TranscodeFunctions.funSoToCESU8 and returns its result. */
+    @Override
+    public int startToOutput(byte[] statep, byte[] s, int sStart, int l, byte[] o, int oStart, int oSize) {
+        return TranscodeFunctions.funSoToCESU8(statep, s, sStart, l, o, oStart, oSize);
+    }
+}
diff --git a/test/org/jcodings/specific/TestCESU8.java b/test/org/jcodings/specific/TestCESU8.java
new file mode 100644
index 00000000..287ec92a
--- /dev/null
+++ b/test/org/jcodings/specific/TestCESU8.java
@@ -0,0 +1,49 @@
+package org.jcodings.specific;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertTrue;
+
+import org.jcodings.Encoding;
+import org.jcodings.constants.CharacterType;
+import java.nio.charset.StandardCharsets;
+import org.junit.Test;
+/** Tests for CESU8Encoding: character counting, encoded length, decoding and lookup by name. */
+public class TestCESU8 {
+    final Encoding enc = CESU8Encoding.INSTANCE; /* encoding under test, shared by every case */
+    /** The six bytes ED A0 80 ED B0 80 (one surrogate pair, three bytes per half) count as a single character. */
+    @Test
+    public void testUnicodeLength6byteChar() throws Exception {
+        byte[]bytes = "\u00ed\u00a0\u0080\u00ed\u00b0\u0080".getBytes(StandardCharsets.ISO_8859_1);
+        assertEquals(1, enc.strLength(bytes, 0, bytes.length));
+    }
+    /** Encodes explicitly as UTF-8 (4 ASCII + 2 two-byte characters) so the count does not depend on the platform charset. */
+    @Test
+    public void testUnicodeLength() throws Exception {
+        byte[]bytes = "test\u00C5\u0099".getBytes(StandardCharsets.UTF_8);
+        assertEquals(6, enc.strLength(bytes, 0, bytes.length));
+    }
+    /** ISO-8859-1 maps each escape to one raw byte, so the literal spells out the encoded byte sequence directly. */
+    @Test
+    public void testUnicodeLengthLong() throws Exception {
+        byte[]bytes = ("\u00C5\u0099\u00C5\u00A1\u00C4\u009B\u00C5\u0099\u00C5\u00A1\u00C4\u009B\u00C5\u0099\u00C3\u00A9\u00C4" +
+                "\u009B\u00C3\u00BD\u00C5\u0099\u00C5\u00A1\u00C4\u009B\u00C3\u00A9\u00C4\u009B\u00C3\u00A9\u00C5\u00BE\u00C4\u009B\u00C5\u00A1" +
+                "\u00C3\u00A9\u00C5\u00BE\u00C4\u009B\u00C5\u00BE\u00C3\u00A9\u00C4\u009B\u00C5\u00A1").getBytes(StandardCharsets.ISO_8859_1);
+        assertEquals(26, enc.strLength(bytes, 0, bytes.length));
+    }
+    /** Expected value goes first so a failure message reports expected and actual the right way round. */
+    @Test
+    public void testCodeToMbcLength() throws Exception {
+        assertEquals(1, enc.codeToMbcLength(0x01));
+        assertEquals(6, enc.codeToMbcLength(0x1F608));
+    }
+    /** U+00D8 is written as an escape so the test does not depend on the encoding the source file is compiled with. */
+    @Test
+    public void testMbcToCode() throws Exception {
+        assertEquals('\u00D8', enc.mbcToCode("m\u00D8\u00D8se".getBytes(StandardCharsets.UTF_8), 1, 3));
+    }
+    /** Loading the encoding by the name "CESU8" yields the CESU8Encoding singleton. */
+    @Test
+    public void testEncodingLoad() throws Exception {
+        assertEquals(CESU8Encoding.INSTANCE, Encoding.load("CESU8"));
+    }
+}