Skip to content

Commit d8f3669

Browse files
committed
Russian: Fix case rules
1 parent 724ffa0 commit d8f3669

File tree

3 files changed

+64
-40
lines changed

3 files changed

+64
-40
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -116,7 +116,7 @@ Some differences are:
116116
| e | е |
117117
| f | ф |
118118
| g | г |
119-
| h | х |
119+
| h, kh | х |
120120
| i | и |
121121
| j | й |
122122
| k | к |

shared/src/main/scala/translit/Russian.scala

Lines changed: 48 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -74,40 +74,54 @@ object Russian extends Language {
7474
): (Int, String) = {
7575
val text = latin + append
7676
val ofs = text.length
77-
78-
if (ofs >= 4 &&
79-
fourGrams.contains(text.substring(ofs - 4, ofs).toLowerCase)) {
80-
val chars = text.substring(ofs - 4, ofs)
81-
val cyrillic = fourGrams(chars.toLowerCase)
82-
(-2, restoreCaseFirst(chars, cyrillic).toString)
83-
} else if (ofs >= 3
84-
&& yLetters.contains(text.substring(ofs - 3, ofs - 1).toLowerCase)
85-
&& !iotatedLetters.contains(text.substring(ofs - 2, ofs).toLowerCase)
86-
) {
87-
val cyrillic = uniGrams.getOrElse(text(ofs - 1).toLower, text(ofs - 1))
88-
(0, (if (text(ofs - 1).isUpper) cyrillic.toUpper else cyrillic).toString)
89-
} else if (ofs >= 3 &&
90-
triGrams.contains(text.substring(ofs - 3, ofs).toLowerCase)) {
91-
val chars = text.substring(ofs - 3, ofs)
92-
val cyrillic = triGrams(chars.toLowerCase)
93-
(-2, restoreCaseFirst(chars, cyrillic).toString)
94-
} else if (ofs >= 2 &&
95-
biGrams.contains(text.substring(ofs - 2, ofs).toLowerCase)) {
96-
val chars = text.substring(ofs - 2, ofs)
97-
val cyrillic = biGrams(chars.toLowerCase)
98-
(-1, restoreCaseFirst(chars, cyrillic).toString)
99-
} else if (uniGrams.contains(text(ofs - 1).toLower)) {
100-
val cyrillic = uniGrams(text(ofs - 1).toLower)
101-
(0, (if (text(ofs - 1).isUpper) cyrillic.toUpper else cyrillic).toString)
102-
} else if (ofs >= 2 && uniGramsSpecial.contains(text(ofs - 1))) {
103-
val result =
104-
if (ofs >= 3 && text(ofs - 2).isUpper && text(ofs - 3).isUpper)
105-
uniGramsSpecial(text(ofs - 1)).toUpper
106-
else uniGramsSpecial(text(ofs - 1))
107-
(0, result.toString)
108-
} else {
109-
(0, text(ofs - 1).toString)
110-
}
77+
val result =
78+
if (ofs >= 4 &&
79+
fourGrams.contains(text.substring(ofs - 4, ofs).toLowerCase)) {
80+
val chars = text.substring(ofs - 4, ofs)
81+
val cyrillic = fourGrams(chars.toLowerCase)
82+
(-2, restoreCaseFirst(chars, cyrillic).toString)
83+
} else if (ofs >= 3
84+
&& yLetters.contains(text.substring(ofs - 3, ofs - 1).toLowerCase)
85+
&& !iotatedLetters.contains(text.substring(ofs - 2, ofs).toLowerCase)
86+
) {
87+
val cyrillic = uniGrams.getOrElse(text(ofs - 1).toLower, text(ofs - 1))
88+
(0, (if (text(ofs - 1).isUpper) cyrillic.toUpper else cyrillic).toString)
89+
} else if (ofs >= 3 &&
90+
triGrams.contains(text.substring(ofs - 3, ofs).toLowerCase)) {
91+
val chars = text.substring(ofs - 3, ofs)
92+
val cyrillic = triGrams(chars.toLowerCase)
93+
(-2, restoreCaseFirst(chars, cyrillic).toString)
94+
} else if (ofs >= 2 &&
95+
biGrams.contains(text.substring(ofs - 2, ofs).toLowerCase)) {
96+
val chars = text.substring(ofs - 2, ofs)
97+
val cyrillic = biGrams(chars.toLowerCase)
98+
(-1, restoreCaseFirst(chars, cyrillic).toString)
99+
} else if (uniGrams.contains(text(ofs - 1).toLower)) {
100+
val cyrillic = uniGrams(text(ofs - 1).toLower)
101+
(0, (if (text(ofs - 1).isUpper) cyrillic.toUpper else cyrillic).toString)
102+
} else if (ofs >= 2 && uniGramsSpecial.contains(text(ofs - 1))) {
103+
val result =
104+
if (ofs >= 3 && text(ofs - 2).isUpper && text(ofs - 3).isUpper)
105+
uniGramsSpecial(text(ofs - 1)).toUpper
106+
else uniGramsSpecial(text(ofs - 1))
107+
(0, result.toString)
108+
} else {
109+
(0, text(ofs - 1).toString)
110+
}
111+
112+
if (ofs >= 3 && uniGramsSpecial.contains(text(ofs - 2))) {
113+
val (l, r) = (text(ofs - 3), text(ofs - 1))
114+
val letter = uniGramsSpecial(text(ofs - 2))
115+
val replace = if (l.isUpper && r.isUpper) letter.toUpper else letter
116+
val cyrillicOfs = cyrillic.length - 1
117+
118+
if (replace == cyrillic(cyrillicOfs)) result
119+
else {
120+
val updated = replace + cyrillic.substring(
121+
cyrillicOfs + 1, cyrillic.length + result._1)
122+
(-updated.length + result._1, updated + result._2)
123+
}
124+
} else result
111125
}
112126

113127
private def toLatin(letter: Char): String = {

shared/src/test/scala/translit/RussianSpec.scala

Lines changed: 15 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@ package translit
33
import org.scalatest.FunSuite
44

55
class RussianSpec extends FunSuite {
6-
val correctMapping = List(
6+
val words = List(
77
"Андрей" -> "Andrej",
88
"Борис" -> "Boris",
99
"Валера" -> "Valera",
@@ -24,9 +24,9 @@ class RussianSpec extends FunSuite {
2424
"роща" -> "roshcha",
2525
"съел" -> "s`el",
2626
"тележка" -> "telezhka",
27-
"ухват" -> "uhvat",
27+
"ухват" -> "ukhvat",
2828
"фольклор" -> "fol'klor",
29-
"халтура" -> "haltura",
29+
"халтура" -> "khaltura",
3030
"цвет" -> "cvet",
3131
"червь" -> "cherv'",
3232
"швея" -> "shveya",
@@ -36,9 +36,13 @@ class RussianSpec extends FunSuite {
3636
"ягненок" -> "yagnenok"
3737
)
3838

39-
correctMapping.foreach { case (cyrillic, latin) =>
40-
test(s"$latin -> $cyrillic") {
39+
words.foreach { case (cyrillic, latin) =>
40+
test(s"$cyrillic <-> $latin") {
41+
assert(Russian.cyrillicToLatin(cyrillic) == latin)
4142
assert(Russian.latinToCyrillic(latin) == cyrillic)
43+
44+
assert(Russian.cyrillicToLatin(cyrillic.toUpperCase) == latin.toUpperCase)
45+
assert(Russian.latinToCyrillic(latin.toUpperCase) == cyrillic.toUpperCase)
4246
}
4347
}
4448

@@ -98,4 +102,10 @@ class RussianSpec extends FunSuite {
98102
val latin2 = Russian.cyrillicToLatin("Звезда расположена в главной части созвездия приблизительно посередине между Гаммой Лебедя и Альбирео.")
99103
assert(latin2 == "Zvezda raspolozhena v glavnoj chasti sozvezdiya priblizitel'no poseredine mezhdu Gammoj Lebedya i Al'bireo.")
100104
}
105+
106+
test("Incremental transliteration") {
107+
assert(Russian.latinToCyrillic("S'") == "Сь")
108+
assert(Russian.latinToCyrillic("S'o") == "Сьо")
109+
assert(Russian.latinToCyrillic("S'O") == "СЬО")
110+
}
101111
}

0 commit comments

Comments
 (0)