diff --git a/README.md b/README.md index bdfe717..e101f42 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ Unlike National 2010, we always use the same transliteration regardless of the p The accented counterpart of и is й and is represented by a separate letter, *j*. -*Example:* Zghurs'kyj (Згурський) +*Example:* Zgurs'kyj (Згурський) #### Soft Signs and Apostrophes The second change to National 2010 is that we try to restore soft signs and apostrophes: @@ -79,18 +79,37 @@ Another modification was to provide the following mappings: * c → ц * q → щ * w → ш -* x → х +* x → ж Note that these mappings are phonetically inaccurate. However, using them still has a few advantages: * Every letter of the Latin alphabet is covered -* When the user types *ch*, we can map the first letter to *ц*, then replace it by *ч* -* *w* has a similar shape to *ш* -* *q* and *w* are located next to each other on the English keyboard, *щ* is therefore easy to find -* *x* has the same shape as its Cyrillic counterpart +* When the user types *ch*, we can map the first letter to *ц*, then replace it by *ч*. Without this rule, the user would not get any visual feedback. +* Furthermore, the following mappings were chosen considering the similarity in shapes: + * *w* has a similar shape to *ш* + * *x* has a similar shape to *ж* +* Another advantage is the proximity on the English keyboard layout: + * *q* and *w* are located next to each other; *ш* and *щ* characters are phonetically close + * *z* and *x* are located next to each other; *з* and *ж* characters are phonetically close + +#### Precedence +The replacement patterns are applied sequentially by traversing the input character-by-character. In some cases, a rule spanning multiple characters should not be applied. An example is the word: схильність. The transliteration of *сх* corresponds to two separate letters *s* and *h*, which would map to *ш*. To prevent this, one can place a vertical bar between the two characters. 
The full transliteration then looks as follows: *s|hyl'nist'* ## Russian -We use rules similar to the previous rules from _Ukrainian_ transliteration. +The Russian rules are similar to the Ukrainian ones. + +Some differences are: + +* *i* corresponds to *и*, whereas *y* to *ы* +* Russian distinguishes between soft and hard signs. It does not have apostrophes. The following mappings are used: + * Soft sign: *'* for ь + * Hard sign: *"* for ъ + +### Precedence +As with the Ukrainian rules, a vertical bar can be placed to prevent certain rules from being applied. + +* красивые: krasivy|e +* сходить: s|hodit' ### Mapping | Latin | Cyrillic | @@ -118,7 +137,7 @@ We use rules similar to the previous rules from _Ukrainian_ transliteration. | u | у | | v | в | | w | ш | -| x | х | +| x | ж | | y | ы | | z | з | | " | ъ | diff --git a/shared/src/main/scala/translit/Russian.scala b/shared/src/main/scala/translit/Russian.scala index 35e78a9..5d90372 100644 --- a/shared/src/main/scala/translit/Russian.scala +++ b/shared/src/main/scala/translit/Russian.scala @@ -27,7 +27,7 @@ object Russian extends Language { 'u' -> 'у', 'v' -> 'в', 'w' -> 'ш', - 'x' -> 'х', + 'x' -> 'ж', 'y' -> 'ы', 'z' -> 'з', '"' -> 'ъ' @@ -40,7 +40,10 @@ object Russian extends Language { "ye" -> 'э', "zh" -> 'ж', "yo" -> 'ё', - "yu" -> 'ю' + "yu" -> 'ю', + + "y|" -> 'ы', // красивые, выучил + "s|" -> 'с' // сходить ) val biGramsIncremental = incrementalNgram(biGrams) diff --git a/shared/src/main/scala/translit/Ukrainian.scala b/shared/src/main/scala/translit/Ukrainian.scala index b41fe19..d5c6ca3 100644 --- a/shared/src/main/scala/translit/Ukrainian.scala +++ b/shared/src/main/scala/translit/Ukrainian.scala @@ -9,8 +9,8 @@ object Ukrainian extends Language { 'd' -> 'д', 'e' -> 'е', 'f' -> 'ф', - 'g' -> 'ґ', - 'h' -> 'г', + 'g' -> 'г', + 'h' -> 'х', 'i' -> 'і', 'j' -> 'й', 'k' -> 'к', @@ -32,7 +32,7 @@ object Ukrainian extends Language { 'c' -> 'ц', 'q' -> 'щ', 'w' -> 'ш', - 'x' -> 'х' + 'x' -> 'ж' ) val biGrams 
= Map( @@ -41,20 +41,20 @@ object Ukrainian extends Language { "yi" -> 'ї', "yu" -> 'ю', + "g'" -> 'ґ', + "ch" -> 'ч', - "kh" -> 'х', "sh" -> 'ш', "ts" -> 'ц', - "zh" -> 'ж' + "zh" -> 'ж', + + // With the vertical bar, transliteration can be disabled. + "s|" -> 'с' ) val biGramsIncremental = incrementalNgram(biGrams) - val triGrams = Map( - "zgh" -> 'г' - ) - val triGramsIncremental = incrementalNgram(triGrams) ++ Map( - "шцh" -> 'щ', - "зґh" -> 'г' + val triGramsIncremental = Map( + "шцh" -> 'щ' ) val fourGrams = Map( @@ -102,7 +102,7 @@ object Ukrainian extends Language { incremental: Boolean = false): (Int, Char) = { val (biGramsL, triGramsL) = if (incremental) (biGramsIncremental, triGramsIncremental) - else (biGrams, triGrams) + else (biGrams, Map.empty[String, Char]) val ofs = offset + 1 if (ofs >= 4 && fourGrams.contains(text.substring(ofs - 4, ofs).toLowerCase) diff --git a/shared/src/test/scala/translit/RussianSpec.scala b/shared/src/test/scala/translit/RussianSpec.scala index f46ad96..8ff61a7 100644 --- a/shared/src/test/scala/translit/RussianSpec.scala +++ b/shared/src/test/scala/translit/RussianSpec.scala @@ -24,7 +24,6 @@ class RussianSpec extends FunSuite { "роща" -> "roshcha", "съел" -> "s\"el", "тележка" -> "telezhka", - "ухват" -> "uxvat", "ухват" -> "uhvat", "фольклор" -> "fol'klor", "халтура" -> "haltura", @@ -34,7 +33,7 @@ class RussianSpec extends FunSuite { "щавель" -> "shchavel'", "электровоз" -> "yelektrovoz", "юла" -> "yula", - "ягненок" -> "yagnenok", + "ягненок" -> "yagnenok" ) correctMapping.foreach { case (cyrillic, latin) => @@ -48,9 +47,20 @@ class RussianSpec extends FunSuite { assert(Russian.latinToCyrillic("шцh" , incremental = true) == "щ") assert(Russian.latinToCyrillic("Шцh" , incremental = true) == "Щ") assert(Russian.latinToCyrillic("багазh", incremental = true) == "багаж") + } + test("Other words") { assert(Russian.latinToCyrillic("peshkom" ) == "пешком") assert(Russian.latinToCyrillic("zhizn'" ) == "жизнь") 
assert(Russian.latinToCyrillic("shchetka") == "щетка") } + + test("Exceptions") { + assert(Russian.latinToCyrillic("vy") == "вы") + assert(Russian.latinToCyrillic("rajon") == "район") + assert(Russian.latinToCyrillic("schitayu") == "считаю") + assert(Russian.latinToCyrillic("s|hodit'") == "сходить") + assert(Russian.latinToCyrillic("vy|uchil") == "выучил") + assert(Russian.latinToCyrillic("krasivy|e") == "красивые") + } } diff --git a/shared/src/test/scala/translit/UkrainianSpec.scala b/shared/src/test/scala/translit/UkrainianSpec.scala index 97f8ed7..414aa35 100644 --- a/shared/src/test/scala/translit/UkrainianSpec.scala +++ b/shared/src/test/scala/translit/UkrainianSpec.scala @@ -6,22 +6,22 @@ class UkrainianSpec extends FunSuite { val correctMapping = List( "Алушта" -> "Alushta", "Андрій" -> "Andrij", - "Борщагівка" -> "Borshchahivka", + "Борщагівка" -> "Borshchagivka", "Борисенко" -> "Borysenko", "Вінниця" -> "Vinnytsya", "Володимир" -> "Volodymyr", - "Гадяч" -> "Hadyach", - "Богдан" -> "Bohdan", - "Згурський" -> "Zghurskyj", - "Ґалаґан" -> "Galagan", - "Ґорґани" -> "Gorgany", + "Гадяч" -> "Gadyach", + "Богдан" -> "Bogdan", + "Згурський" -> "Zgurskyj", + "Ґалаґан" -> "G'alag'an", + "Ґорґани" -> "G'org'any", "Донецьк" -> "Donetsk", "Дмитро" -> "Dmytro", "Рівне" -> "Rivne", - "Олег" -> "Oleh", + "Олег" -> "Oleg", "Есмань" -> "Esman", "Єнакієве" -> "Yenakiyeve", - "Гаєвич" -> "Hayevych", + "Гаєвич" -> "Gayevych", "Короп'є" -> "Koropye", "Житомир" -> "Zhytomyr", "Жанна" -> "Zhanna", @@ -29,7 +29,7 @@ class UkrainianSpec extends FunSuite { "Закарпаття" -> "Zakarpattya", "Казимирчук" -> "Kazymyrchuk", "Медвин" -> "Medvyn", - "Михайленко" -> "Mykhajlenko", + "Михайленко" -> "Myhajlenko", "Іванків" -> "Ivankiv", "Іващенко" -> "Ivashchenko", "Їжакевич" -> "Yizhakevych", @@ -55,24 +55,24 @@ class UkrainianSpec extends FunSuite { "Соломія" -> "Solomiya", "Тернопіль" -> "Ternopil", "Троць" -> "Trots", - "Ужгород" -> "Uzhhorod", + "Ужгород" -> "Uzhgorod", "Уляна" -> 
"Ulyana", "Фастів" -> "Fastiv", "Філіпчук" -> "Filipchuk", - "Харків" -> "Kharkiv", - "Христина" -> "Khrystyna", + "Харків" -> "Harkiv", + "Христина" -> "Hrystyna", "Біла Церква" -> "Bila Tserkva", "Стеценко" -> "Stetsenko", "Чернівці" -> "Chernivtsi", "Шевченко" -> "Shevchenko", "Шостка" -> "Shostka", "Кишеньки" -> "Kyshenky", - "Щербухи" -> "Shcherbukhy", - "Гоща" -> "Hoshcha", - "Гаращенко" -> "Harashchenko", + "Щербухи" -> "Shcherbuhy", + "Гоща" -> "Goshcha", + "Гаращенко" -> "Garashchenko", "Юрій" -> "Yurij", "Корюківка" -> "Koryukivka", - "Яготин" -> "Yahotyn", + "Яготин" -> "Yagotyn", "Ярошенко" -> "Yaroshenko", "Костянтин" -> "Kostyantyn", "Знам'янка" -> "Znamyanka", @@ -94,12 +94,12 @@ class UkrainianSpec extends FunSuite { test("yi / yy") { assert(Ukrainian.latinToCyrillic("Kyyiv") == "Київ") assert(Ukrainian.latinToCyrillic("Kryyivka") == "Криївка") - assert(Ukrainian.latinToCyrillic("Katehoriyi") == "Категорії") + assert(Ukrainian.latinToCyrillic("Kategoriyi") == "Категорії") assert(Ukrainian.latinToCyrillic("Stryjs'kyj park") == "Стрийський парк") assert(Ukrainian.latinToCyrillic("Stryjs'ka") == "Стрийська") assert(Ukrainian.latinToCyrillic("kofeyin") == "кофеїн") - assert(Ukrainian.latinToCyrillic("pryjnyatykh") == "прийнятих") + assert(Ukrainian.latinToCyrillic("pryjnyatyh") == "прийнятих") assert(Ukrainian.latinToCyrillic("Staryj") == "Старий") assert(Ukrainian.latinToCyrillic("Avtomyjka") == "Автомийка") } @@ -115,7 +115,7 @@ class UkrainianSpec extends FunSuite { assert(Ukrainian.latinToCyrillic("MYROSLAVA") == "Мирослава".toUpperCase) assert(Ukrainian.latinToCyrillic("Al'bert".toUpperCase) == "Альберт".toUpperCase) - assert(Ukrainian.latinToCyrillic("Zghurs'kyj".toUpperCase) == "Згурський".toUpperCase) + assert(Ukrainian.latinToCyrillic("Zgurs'kyj".toUpperCase) == "Згурський".toUpperCase) assert(Ukrainian.latinToCyrillic("sut'") == "суть") assert(Ukrainian.latinToCyrillic("SUT'") == "суть".toUpperCase) @@ -144,17 +144,17 @@ class UkrainianSpec 
extends FunSuite { assert(Ukrainian.latinToCyrillic("pidv'yazaty") == "підв'язати") assert(Ukrainian.latinToCyrillic("rozm'yakshyty") == "розм'якшити") assert(Ukrainian.latinToCyrillic("bur'yan") == "бур'ян") - assert(Ukrainian.latinToCyrillic("mizhhir'ya") == "міжгір'я") + assert(Ukrainian.latinToCyrillic("mizhgir'ya") == "міжгір'я") assert(Ukrainian.latinToCyrillic("pir'ya") == "пір'я") assert(Ukrainian.latinToCyrillic("matir'yu") == "матір'ю") assert(Ukrainian.latinToCyrillic("na podvir'yi") == "на подвір'ї") assert(Ukrainian.latinToCyrillic("bez'yazykyj") == "без'язикий") assert(Ukrainian.latinToCyrillic("vid'yizd") == "від'їзд") assert(Ukrainian.latinToCyrillic("z'yednanyj") == "з'єднаний") - assert(Ukrainian.latinToCyrillic("z'yikhaty") == "з'їхати") + assert(Ukrainian.latinToCyrillic("z'yihaty") == "з'їхати") assert(Ukrainian.latinToCyrillic("z'yavytysya") == "з'явитися") assert(Ukrainian.latinToCyrillic("ob'yem") == "об'єм") - assert(Ukrainian.latinToCyrillic("pid'yikhaty") == "під'їхати") + assert(Ukrainian.latinToCyrillic("pid'yihaty") == "під'їхати") assert(Ukrainian.latinToCyrillic("roz'yushyty") == "роз'юшити") assert(Ukrainian.latinToCyrillic("roz'yasnyty") == "роз'яснити") assert(Ukrainian.latinToCyrillic("dyt'yasla") == "дит'ясла") @@ -194,7 +194,7 @@ class UkrainianSpec extends FunSuite { // From https://uk.wikipedia.org/wiki/%D0%AC assert(Ukrainian.latinToCyrillic("vis'") == "вісь") - assert(Ukrainian.latinToCyrillic("gedz'") == "ґедзь") + assert(Ukrainian.latinToCyrillic("g'edz'") == "ґедзь") assert(Ukrainian.latinToCyrillic("kin'") == "кінь") assert(Ukrainian.latinToCyrillic("mid'") == "мідь") assert(Ukrainian.latinToCyrillic("namoroz'") == "наморозь") @@ -203,17 +203,17 @@ class UkrainianSpec extends FunSuite { assert(Ukrainian.latinToCyrillic("shvets'") == "швець") assert(Ukrainian.latinToCyrillic("blyz'ko") == "близько") assert(Ukrainian.latinToCyrillic("vos'myj") == "восьмий") - assert(Ukrainian.latinToCyrillic("han'ba") == "ганьба") - 
assert(Ukrainian.latinToCyrillic("Hryts'ko") == "Грицько") + assert(Ukrainian.latinToCyrillic("gan'ba") == "ганьба") + assert(Ukrainian.latinToCyrillic("Gryts'ko") == "Грицько") assert(Ukrainian.latinToCyrillic("dyad'ko") == "дядько") assert(Ukrainian.latinToCyrillic("kil'tse") == "кільце") assert(Ukrainian.latinToCyrillic("molot'ba") == "молотьба") - assert(Ukrainian.latinToCyrillic("d'ohot'") == "дьоготь") + assert(Ukrainian.latinToCyrillic("d'ogot'") == "дьоготь") assert(Ukrainian.latinToCyrillic("dz'ob") == "дзьоб") assert(Ukrainian.latinToCyrillic("l'on") == "льон") assert(Ukrainian.latinToCyrillic("s'omyj") == "сьомий") - assert(Ukrainian.latinToCyrillic("tr'okh") == "трьох") - assert(Ukrainian.latinToCyrillic("t'okhkaty") == "тьохкати") + assert(Ukrainian.latinToCyrillic("tr'oh") == "трьох") + assert(Ukrainian.latinToCyrillic("t'ohkaty") == "тьохкати") // TODO Words where the soft sign cannot be restored // assert(Ukrainian.latinToCyrillic("N'yurd") == "Ньюрд") @@ -238,11 +238,12 @@ class UkrainianSpec extends FunSuite { assert(Ukrainian.latinToCyrillic("sekretar") == "секретар") assert(Ukrainian.latinToCyrillic("teper") == "тепер") assert(Ukrainian.latinToCyrillic("shkolyar") == "школяр") - assert(Ukrainian.latinToCyrillic("Kharkiv") == "Харків") + assert(Ukrainian.latinToCyrillic("Harkiv") == "Харків") assert(Ukrainian.latinToCyrillic("Al'bert Ejnshtejn") == "Альберт Ейнштейн") assert(Ukrainian.latinToCyrillic("zdayut'sya") == "здаються") assert(Ukrainian.latinToCyrillic("postijnomu") == "постійному") assert(Ukrainian.latinToCyrillic("Jota") == "Йота") + assert(Ukrainian.latinToCyrillic("Puzata Hata") == "Пузата Хата") } test("s vs c") { @@ -250,6 +251,11 @@ class UkrainianSpec extends FunSuite { assert(Ukrainian.latinToCyrillic("vlasnym") == "власним") } + test("сх") { + assert(Ukrainian.latinToCyrillic("s|hyl'nist'") == "схильність") + assert(Ukrainian.latinToCyrillic("s|hopyv") == "схопив") + } + test("Offsets") { 
assert(Ukrainian.latinToCyrillicOfs("shch", 0) == (0, 'с')) assert(Ukrainian.latinToCyrillicOfs("shch", 1) == (-1, 'ш')) @@ -271,17 +277,17 @@ class UkrainianSpec extends FunSuite { assert(Ukrainian.latinToCyrillicOfs("vych", 1) == (0, 'и')) assert(Ukrainian.latinToCyrillicOfs("vych", 3) == (-1, 'ч')) - assert(Ukrainian.latinToCyrillicOfs("zgh", 2) == (-1, 'г')) + assert(Ukrainian.latinToCyrillicOfs("zg", 0) == (0, 'з')) + assert(Ukrainian.latinToCyrillicOfs("zg", 1) == (0, 'г')) } test("Convenience mappings") { - assert(Ukrainian.latinToCyrillic("Puzata Xata") == "Пузата Хата") - assert(Ukrainian.latinToCyrillic("cqwx") == "цщшх") + assert(Ukrainian.latinToCyrillic("cqwx") == "цщшж") } test("Incremental transliteration") { - assert(Ukrainian.latinToCyrillic("zgh") == "зг") - assert(Ukrainian.latinToCyrillic("зґh", incremental = true) == "зг") + assert(Ukrainian.latinToCyrillic("zg") == "зг") + assert(Ukrainian.latinToCyrillic("зg", incremental = true) == "зг") assert(Ukrainian.latinToCyrillic("Шцh", incremental = true) == "Щ") }