Skip to content

Commit 5ea375b

Browse files
committed
Ukrainian: Implement Cyrillic to Latin conversion
1 parent 287ab4d commit 5ea375b

File tree

7 files changed

+108
-36
lines changed

7 files changed

+108
-36
lines changed

shared/src/main/scala/translit/Helpers.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ package translit
22

33
object Helpers {
44
/**
 * Transfers the case of a source letter onto its transliteration.
 *
 * Only the first character is upper-cased, so a multi-character
 * transliteration comes out title-cased (e.g. "shch" becomes "Shch").
 *
 * @param str     transliterated text; may be empty
 * @param isUpper whether the first character of `str` should be upper-cased
 * @return `str` with its first character upper-cased when `isUpper` is set
 *         and `str` is non-empty; otherwise `str` unchanged
 */
def applyCase(str: String, isUpper: Boolean): String =
  // Guard against "": indexing `str(0)` would throw StringIndexOutOfBoundsException.
  if (isUpper && str.nonEmpty) str(0).toUpper + str.tail else str
66

77
/**
 * Upper-cases `cyrillic` when every character of `str` is upper case.
 *
 * An empty `str` counts as "all upper case". A single character that is not
 * upper case (including punctuation such as an apostrophe) leaves `cyrillic`
 * unchanged.
 */
def restoreCaseAll(str: String, cyrillic: Char): Char = {
  val hasNonUpper = str.exists(ch => !ch.isUpper)
  if (hasNonUpper) cyrillic else cyrillic.toUpper
}

shared/src/main/scala/translit/Language.scala

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ trait Language {
1515
latin: String, cyrillic: String, append: Char
1616
): (Int, String)
1717

18-
def cyrillicToLatinIncremental(letter: Char): String
18+
/**
 * Transliterates one Cyrillic character, given the Cyrillic text before it.
 *
 * @param cyrillic the Cyrillic input that precedes `letter`
 * @param letter   the Cyrillic character to convert
 * @return a pair `(offset, latin)`: when `offset` is negative, the caller
 *         drops `-offset` characters from the end of the Latin output built
 *         so far, and then appends `latin`
 */
def cyrillicToLatinIncremental(cyrillic: String, letter: Char): (Int, String)
1919

2020
def latinToCyrillic(text: String): String = {
2121
val result = new StringBuilder(text.length)
@@ -37,7 +37,10 @@ trait Language {
3737
var offset = 0
3838

3939
while (offset < text.length) {
40-
result.append(cyrillicToLatinIncremental(text(offset)))
40+
val (length, c) = cyrillicToLatinIncremental(
41+
text.take(offset), text(offset))
42+
if (length < 0) result.setLength(result.length + length)
43+
result.append(c)
4144
offset += 1
4245
}
4346

shared/src/main/scala/translit/Noop.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ object Noop extends translit.Language {
55
latin: String, cyrillic: String, append: Char
66
): (Int, String) = (0, append.toString)
77

8-
override def cyrillicToLatinIncremental(letter: Char): String =
9-
letter.toString
8+
/**
 * Identity transliteration: returns `letter` as-is with a zero offset, so
 * previously emitted output is never rewritten. `cyrillic` is ignored.
 */
override def cyrillicToLatinIncremental(
  cyrillic: String, letter: Char
): (Int, String) = {
  val unchanged = String.valueOf(letter)
  0 -> unchanged
}
1011
}
1112

shared/src/main/scala/translit/Russian.scala

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ object Russian extends Language {
3434
'z' -> 'з'
3535
)
3636

37-
// Infer capital from previous character
37+
// Infer case from previous character
3838
val uniGramsSpecial = Map(
3939
'\'' -> 'ь',
4040
'`' -> 'ъ'
@@ -58,6 +58,7 @@ object Russian extends Language {
5858
)
5959

6060
val uniGramsInv = uniGrams.toList.map(_.swap).toMap
61+
val uniGramsSpecialInv = uniGramsSpecial.toList.map(_.swap).toMap
6162
val biGramsInv = biGrams.toList.map(_.swap).toMap
6263
val triGramsInv = triGrams.toList.map(_.swap).toMap
6364
val fourGramsInv = fourGrams.toList.map(_.swap).toMap
@@ -109,13 +110,31 @@ object Russian extends Language {
109110
}
110111
}
111112

112-
override def cyrillicToLatinIncremental(letter: Char): String = {
113+
/**
 * Transliterates a single Cyrillic character to its Latin spelling.
 *
 * The inverse tables are consulted in the order four-gram, tri-gram, bi-gram,
 * uni-gram, special uni-gram; the first table containing the lower-cased
 * character wins. The case of `letter` is carried over with `applyCase`.
 * A character found in no table is returned unchanged.
 */
private def toLatin(letter: Char): String = {
  val lower = letter.toLower

  val multiChar =
    fourGramsInv.get(lower) orElse triGramsInv.get(lower) orElse biGramsInv.get(lower)
  val singleChar =
    (uniGramsInv.get(lower) orElse uniGramsSpecialInv.get(lower)).map(_.toString)

  multiChar.orElse(singleChar) match {
    case Some(latin) => applyCase(latin, letter.isUpper)
    case None        => letter.toString
  }
}
123+
124+
/**
 * Transliterates `letter`, rewriting the previous letter's Latin spelling
 * when both belong to an upper-case run.
 *
 * `toLatin` applies the letter's case through `applyCase`. When an upper-case
 * letter directly follows another upper-case letter, everything after the
 * first character of the previous transliteration is re-emitted in upper
 * case, followed by the fully upper-cased current transliteration.
 *
 * The rewrite happens only when the preceding character is itself upper
 * case. A lone lower-case letter or a non-letter (e.g. a space) before
 * `letter` must not have its transliteration upper-cased.
 *
 * @param cyrillic the Cyrillic input that precedes `letter`; may be empty
 * @param letter   the Cyrillic character to convert
 * @return `(offset, latin)`, where a negative `offset` is the number of
 *         trailing characters to drop from the Latin output before appending
 *         `latin`
 */
override def cyrillicToLatinIncremental(
  cyrillic: String, letter: Char
): (Int, String) = {
  val current = toLatin(letter)

  cyrillic.lastOption match {
    case Some(previous) if letter.isUpper && previous.isUpper =>
      // Everything after the first Latin character of the previous letter
      // is replaced by its upper-case form.
      val rest = toLatin(previous).drop(1)
      (-rest.length, rest.toUpperCase + current.toUpperCase)
    case _ =>
      (0, current)
  }
}
121140
}

shared/src/main/scala/translit/Ukrainian.scala

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ class Ukrainian(apostrophes: Boolean) extends Language {
1414
'e' -> 'е',
1515
'f' -> 'ф',
1616
'g' -> 'г',
17-
'h' -> 'х',
1817
'i' -> 'і',
1918
'j' -> 'й',
2019
'k' -> 'к',
@@ -34,6 +33,7 @@ class Ukrainian(apostrophes: Boolean) extends Language {
3433
// Mappings for more convenient typing. Allows us to cover every letter of
3534
// the Latin alphabet
3635
'c' -> 'ц',
36+
'h' -> 'х',
3737
'q' -> 'щ',
3838
'w' -> 'ш',
3939
'x' -> 'ж'
@@ -52,8 +52,7 @@ class Ukrainian(apostrophes: Boolean) extends Language {
5252
"ts" -> 'ц',
5353
"zh" -> 'ж',
5454

55-
// With the vertical bar, transliteration can be disabled.
56-
"s|" -> 'с'
55+
"kh" -> 'х'
5756
)
5857

5958
val triGrams = Map[String, Char]()
@@ -62,6 +61,15 @@ class Ukrainian(apostrophes: Boolean) extends Language {
6261
"shch" -> 'щ'
6362
)
6463

64+
val uniGramsInv = uniGrams.toList.map(_.swap).toMap
65+
val uniGramsSpecialInv = Map(
66+
'ь' -> '\'',
67+
'\'' -> '\''
68+
)
69+
val biGramsInv = biGrams.toList.map(_.swap).toMap
70+
val triGramsInv = triGrams.toList.map(_.swap).toMap
71+
val fourGramsInv = fourGrams.toList.map(_.swap).toMap
72+
6573
val apostrophePatterns = Set(
6674
('b', "ya"),
6775
('b', "ye"),
@@ -158,7 +166,36 @@ class Ukrainian(apostrophes: Boolean) extends Language {
158166
} else result
159167
}
160168

161-
def cyrillicToLatinIncremental(letter: Char): String = ???
169+
/**
 * Transliterates a single Cyrillic character to its Latin spelling.
 *
 * Looks the lower-cased character up in the inverse four-gram, tri-gram,
 * bi-gram, uni-gram and special uni-gram tables, in that order, and takes
 * the first match. The case of `letter` is carried over with `applyCase`.
 * A character found in no table is returned unchanged.
 */
private def toLatin(letter: Char): String = {
  val key = letter.toLower
  val upper = letter.isUpper

  val fromSingles: Option[String] =
    uniGramsInv.get(key).orElse(uniGramsSpecialInv.get(key)).map(_.toString)

  fourGramsInv.get(key)
    .orElse(triGramsInv.get(key))
    .orElse(biGramsInv.get(key))
    .orElse(fromSingles)
    .fold(letter.toString)(latin => applyCase(latin, upper))
}
179+
180+
/**
 * Transliterates `letter`, rewriting the previous letter's Latin spelling
 * when both belong to an upper-case run (e.g. "Щ" gives "Shch" but "ЩЕ"
 * gives "SHCHE").
 *
 * Apostrophes carry no case, so the previous letter is the nearest preceding
 * character that is not an apostrophe. The rewrite happens only when that
 * character is itself upper case; a lone lower-case letter or a non-letter
 * before `letter` is left alone.
 *
 * @param cyrillic the Cyrillic input that precedes `letter`; may be empty
 * @param letter   the Cyrillic character to convert
 * @return `(offset, latin)`, where a negative `offset` is the number of
 *         trailing characters to drop from the Latin output before appending
 *         `latin`
 */
override def cyrillicToLatinIncremental(
  cyrillic: String, letter: Char
): (Int, String) = {
  val current = toLatin(letter)

  // Index of the nearest preceding character that is not an apostrophe
  // (-1 when there is none).
  val previousIndex = cyrillic.lastIndexWhere(_ != '\'')
  val inUpperCaseRun =
    letter.isUpper && previousIndex >= 0 && cyrillic(previousIndex).isUpper

  if (!inUpperCaseRun) (0, current)
  else {
    // Everything after the first Latin character of the previous letter.
    val rest = toLatin(cyrillic(previousIndex)).drop(1)

    if (rest.isEmpty) (0, current.toUpperCase)
    else {
      // Output emitted for the apostrophes between the previous letter and
      // `letter`; it sits after `rest`, so it is dropped and re-appended.
      val between = cyrillic.drop(previousIndex + 1).map(toLatin).mkString
      (-(rest.length + between.length),
        rest.toUpperCase + between + current.toUpperCase)
    }
  }
}
162199
}
163200

164201
object Ukrainian extends Ukrainian(apostrophes = true)

shared/src/test/scala/translit/RussianSpec.scala

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,5 +94,8 @@ class RussianSpec extends FunSuite {
9494
test("Cyrillic to Latin") {
  // Whole sentences, including soft signs that transliterate to an apostrophe.
  val sentences = Seq(
    "Фердинанд Теннис, описал два важнейших социологических абстрактных понятия" ->
      "Ferdinand Tennis, opisal dva vazhnejshikh sociologicheskikh abstraktnykh ponyatiya",
    "Звезда расположена в главной части созвездия приблизительно посередине между Гаммой Лебедя и Альбирео." ->
      "Zvezda raspolozhena v glavnoj chasti sozvezdiya priblizitel'no poseredine mezhdu Gammoj Lebedya i Al'bireo."
  )

  sentences.foreach { case (cyrillic, expected) =>
    assert(Russian.cyrillicToLatin(cyrillic) == expected)
  }
}
98101
}

shared/src/test/scala/translit/UkrainianSpec.scala

Lines changed: 34 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ package translit
33
import org.scalatest.FunSuite
44

55
class UkrainianSpec extends FunSuite {
6-
val correctMapping = List(
6+
val words = List(
77
"Алушта" -> "Alushta",
88
"Андрій" -> "Andrij",
99
"Борщагівка" -> "Borshchagivka",
@@ -12,24 +12,24 @@ class UkrainianSpec extends FunSuite {
1212
"Володимир" -> "Volodymyr",
1313
"Гадяч" -> "Gadyach",
1414
"Богдан" -> "Bogdan",
15-
"Згурський" -> "Zgurskyj",
15+
"Згурський" -> "Zgurs'kyj",
1616
"Ґалаґан" -> "G'alag'an",
1717
"Ґорґани" -> "G'org'any",
18-
"Донецьк" -> "Donetsk",
18+
"Донецьк" -> "Donets'k",
1919
"Дмитро" -> "Dmytro",
2020
"Рівне" -> "Rivne",
2121
"Олег" -> "Oleg",
22-
"Есмань" -> "Esman",
22+
"Есмань" -> "Esman'",
2323
"Єнакієве" -> "Yenakiyeve",
2424
"Гаєвич" -> "Gayevych",
25-
"Короп'є" -> "Koropye",
25+
"Короп'є" -> "Korop'ye",
2626
"Житомир" -> "Zhytomyr",
2727
"Жанна" -> "Zhanna",
2828
"Жежелів" -> "Zhezheliv",
2929
"Закарпаття" -> "Zakarpattya",
3030
"Казимирчук" -> "Kazymyrchuk",
3131
"Медвин" -> "Medvyn",
32-
"Михайленко" -> "Myhajlenko",
32+
"Михайленко" -> "Mykhajlenko",
3333
"Іванків" -> "Ivankiv",
3434
"Іващенко" -> "Ivashchenko",
3535
"Їжакевич" -> "Yizhakevych",
@@ -50,44 +50,42 @@ class UkrainianSpec extends FunSuite {
5050
"Полтава" -> "Poltava",
5151
"Петро" -> "Petro",
5252
"Решетилівка" -> "Reshetylivka",
53-
"Рибчинський" -> "Rybchynskyj",
53+
"Рибчинський" -> "Rybchyns'kyj",
5454
"Суми" -> "Sumy",
5555
"Соломія" -> "Solomiya",
56-
"Тернопіль" -> "Ternopil",
57-
"Троць" -> "Trots",
56+
"Тернопіль" -> "Ternopil'",
57+
"Троць" -> "Trots'",
5858
"Ужгород" -> "Uzhgorod",
5959
"Уляна" -> "Ulyana",
6060
"Фастів" -> "Fastiv",
6161
"Філіпчук" -> "Filipchuk",
62-
"Харків" -> "Harkiv",
63-
"Христина" -> "Hrystyna",
62+
"Харків" -> "Kharkiv",
63+
"Христина" -> "Khrystyna",
6464
"Біла Церква" -> "Bila Tserkva",
6565
"Стеценко" -> "Stetsenko",
6666
"Чернівці" -> "Chernivtsi",
6767
"Шевченко" -> "Shevchenko",
6868
"Шостка" -> "Shostka",
69-
"Кишеньки" -> "Kyshenky",
70-
"Щербухи" -> "Shcherbuhy",
69+
"Кишеньки" -> "Kyshen'ky",
70+
"Щербухи" -> "Shcherbukhy",
7171
"Гоща" -> "Goshcha",
7272
"Гаращенко" -> "Garashchenko",
7373
"Юрій" -> "Yurij",
7474
"Корюківка" -> "Koryukivka",
7575
"Яготин" -> "Yagotyn",
7676
"Ярошенко" -> "Yaroshenko",
7777
"Костянтин" -> "Kostyantyn",
78-
"Знам'янка" -> "Znamyanka",
78+
"Знам'янка" -> "Znam'yanka",
7979
"Феодосія" -> "Feodosiya"
8080
)
8181

82-
def removeApostropheAndSoftSign(str: String): String =
83-
str
84-
.replaceAll("ь", "")
85-
.replaceAll("'", "")
82+
words.foreach { case (cyrillic, latin) =>
83+
test(s"$cyrillic <-> $latin") {
84+
assert(Ukrainian.cyrillicToLatin(cyrillic) == latin)
85+
assert(Ukrainian.latinToCyrillic(latin) == cyrillic)
8686

87-
correctMapping.foreach { case (cyrillic, latin) =>
88-
test(s"$latin -> $cyrillic") {
89-
assert(Ukrainian.latinToCyrillic(latin) ==
90-
removeApostropheAndSoftSign(cyrillic))
87+
assert(Ukrainian.cyrillicToLatin(cyrillic.toUpperCase) == latin.toUpperCase)
88+
assert(Ukrainian.latinToCyrillic(latin.toUpperCase) == cyrillic.toUpperCase)
9189
}
9290
}
9391

@@ -253,9 +251,9 @@ class UkrainianSpec extends FunSuite {
253251
}
254252

255253
test("сх") {
256-
assert(Ukrainian.latinToCyrillic("s|hyl'nist'") == "схильність")
257-
assert(Ukrainian.latinToCyrillic("s|hopyv") == "схопив")
258-
assert(Ukrainian.latinToCyrillic("s|hodi") == "сході")
254+
assert(Ukrainian.latinToCyrillic("skhyl'nist'") == "схильність")
255+
assert(Ukrainian.latinToCyrillic("skhopyv") == "схопив")
256+
assert(Ukrainian.latinToCyrillic("skhodi") == "сході")
259257
}
260258

261259
test("Incremental interface") {
@@ -296,4 +294,15 @@ class UkrainianSpec extends FunSuite {
296294
test("Convenience mappings") {
297295
assert(Ukrainian.latinToCyrillic("cqwx") == "цщшж")
298296
}
297+
298+
test("Cyrillic to Latin") {
  // A lone capital is title-cased; inside an upper-case run the whole
  // transliteration is upper-cased.
  val cases = Seq(
    "Щ" -> "Shch",
    "ЩЕ" -> "SHCHE",
    "готовність, схильність суб'єкта до поведінкового акту, дії, вчинку, їх послідовності" ->
      "gotovnist', skhyl'nist' sub'yekta do povedinkovogo aktu, diyi, vchynku, yikh poslidovnosti",
    "ІЯ" -> "IYA"
  )

  cases.foreach { case (cyrillic, expected) =>
    assert(Ukrainian.cyrillicToLatin(cyrillic) == expected)
  }
}
299308
}

0 commit comments

Comments
 (0)