diff --git a/core/build.gradle.kts b/core/build.gradle.kts index 0cef9fa5b2..b595177d82 100644 --- a/core/build.gradle.kts +++ b/core/build.gradle.kts @@ -66,6 +66,7 @@ dependencies { implementation(libs.kotlin.stdlib.jdk8) api(libs.commonsCsv) + implementation(libs.commonsIo) implementation(libs.serialization.core) implementation(libs.serialization.json) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt index a5e745c4f3..33596e8d57 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt @@ -2,6 +2,7 @@ package org.jetbrains.kotlinx.dataframe.io import org.apache.commons.csv.CSVFormat import org.apache.commons.csv.CSVRecord +import org.apache.commons.io.input.BOMInputStream import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.AnyRow import org.jetbrains.kotlinx.dataframe.DataColumn @@ -19,6 +20,7 @@ import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator import org.jetbrains.kotlinx.dataframe.impl.api.Parsers import org.jetbrains.kotlinx.dataframe.impl.api.parse import org.jetbrains.kotlinx.dataframe.values +import java.io.BufferedInputStream import java.io.BufferedReader import java.io.File import java.io.FileInputStream @@ -272,21 +274,20 @@ public fun DataFrame.Companion.readDelim( duplicate: Boolean = true, charset: Charset = defaultCharset, parserOptions: ParserOptions? = null, -): AnyFrame = - if (isCompressed) { - InputStreamReader(GZIPInputStream(inStream), charset) - } else { - BufferedReader(InputStreamReader(inStream, charset)) - }.run { - readDelim( - this, - getFormat(csvType, delimiter, header, duplicate), - colTypes, - skipLines, - readLines, - parserOptions, - ) - } +): AnyFrame { + val bufferedInStream = BufferedInputStream(if (isCompressed) GZIPInputStream(inStream) else inStream) + val bomIn = BOMInputStream.builder().setInputStream(bufferedInStream).get() + val bufferedReader = BufferedReader(InputStreamReader(bomIn, charset)) + + return readDelim( + reader = bufferedReader, + format = getFormat(csvType, delimiter, header, duplicate), + colTypes = colTypes, + skipLines = skipLines, + readLines = readLines, + parserOptions = parserOptions, + ) +} public enum class ColType { Int, diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTests.kt index 43efdc8c99..71d09cb01c 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTests.kt @@ -276,10 +276,17 @@ class CsvTests { df shouldBe dataFrameOf("a", "b", "c")(1, 2, 3) } + @Test + fun `file with BOM`() { + val df = DataFrame.readCSV(withBomCsv, delimiter = ';') + df.columnNames() shouldBe listOf("Column1", "Column2") + } + companion object { private val simpleCsv = testCsv("testCSV") private val csvWithFrenchLocale = testCsv("testCSVwithFrenchLocale") private val wineCsv = testCsv("wine") private val durationCsv = testCsv("duration") + private val withBomCsv = testCsv("with-bom") } } diff --git a/core/src/test/resources/with-bom.csv b/core/src/test/resources/with-bom.csv new file mode 100644 index 0000000000..c711a7b828 --- /dev/null +++ b/core/src/test/resources/with-bom.csv @@ -0,0 +1,3 @@ +Column1;Column2 +0,25;18 +1,24;19 diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index f71849afe0..b14609930b 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -24,6 +24,7 @@ kover = "0.6.1" commonsCsv = "1.10.0" commonsCompress = "1.26.0" +commonsIo = "2.16.1" serialization = "1.7.0" fuel = "2.3.1" poi = "5.2.5" @@ -71,6 +72,7 @@ kotlin-reflect = { group = "org.jetbrains.kotlin", name = "kotlin-reflect", vers kotlin-scriptingJvm = { group = "org.jetbrains.kotlin", name = "kotlin-scripting-jvm", version.ref = "kotlin" } commonsCsv = { group = "org.apache.commons", name = "commons-csv", version.ref = "commonsCsv" } commonsCompress = { group = "org.apache.commons", name = "commons-compress", version.ref = "commonsCompress" } +commonsIo = { group = "commons-io", name = "commons-io", version.ref = "commonsIo" } # Serialization serialization-core = { group = "org.jetbrains.kotlinx", name = "kotlinx-serialization-core", version.ref = "serialization" } serialization-json = { group = "org.jetbrains.kotlinx", name = "kotlinx-serialization-json", version.ref = "serialization" }