Skip to content

Commit a9b856c

Browse files
committed
feat(model): Detect and exclude binary license files
Detect and exclude binary license files using Apache Tika. When a non-text file is found during the license info creation process, a warning is logged and the file is excluded from the final report. This keeps binary files out of the report; including them previously caused the reporter to enter an endless loop during report generation. Signed-off-by: Julian Olderdissen <[email protected]>
1 parent 442518a commit a9b856c

File tree

4 files changed

+41
-0
lines changed

4 files changed

+41
-0
lines changed

gradle/libs.versions.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ slf4j = "2.0.17"
6767
springCore = "6.2.7"
6868
svnkit = "1.10.12"
6969
sw360Client = "17.0.1-m2"
70+
tika = "3.1.0"
7071
wagonHttp = "3.5.3"
7172
wiremock = "3.13.0"
7273
xmlutil = "0.91.1"
@@ -186,6 +187,7 @@ slf4j = { module = "org.slf4j:slf4j-api", version.ref = "slf4j" }
186187
springCore = { module = "org.springframework:spring-core", version.ref = "springCore" }
187188
svnkit = { module = "com.tmatesoft.svnkit:svnkit", version.ref = "svnkit" }
188189
sw360Client = { module = "org.eclipse.sw360:client", version.ref = "sw360Client" }
190+
tika = { module = "org.apache.tika:tika-core", version.ref = "tika" }
189191
wagon-http = { module = "org.apache.maven.wagon:wagon-http", version.ref = "wagonHttp" }
190192
wiremock = { module = "org.wiremock:wiremock", version.ref = "wiremock" }
191193
xz = { module = "org.tukaani:xz", version.ref = "xz" }

model/build.gradle.kts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ dependencies {
3535
api(libs.jackson.dataformat.yaml)
3636
api(libs.log4j.api)
3737

38+
implementation(libs.tika)
3839
implementation(libs.bundles.exposed)
3940
implementation(libs.bundles.hoplite)
4041
implementation(libs.hikari)

model/src/main/kotlin/utils/FileArchiver.kt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ import kotlin.time.measureTime
2525
import kotlin.time.measureTimedValue
2626

2727
import org.apache.logging.log4j.kotlin.logger
28+
import org.apache.tika.Tika
29+
import org.apache.tika.mime.MimeTypes
2830

2931
import org.ossreviewtoolkit.model.KnownProvenance
3032
import org.ossreviewtoolkit.utils.common.FileMatcher
@@ -77,6 +79,11 @@ class FileArchiver(
7779
directory.packZip(zipFile, overwrite = true) { file ->
7880
val relativePath = file.relativeTo(directory).invariantSeparatorsPath
7981

82+
if (Tika().detect(file) != MimeTypes.PLAIN_TEXT) {
83+
logger.warn { "Not adding file '$relativePath' to archive because it is not a text file." }
84+
return@packZip false
85+
}
86+
8087
matcher.matches(relativePath).also { result ->
8188
logger.debug {
8289
if (result) {

model/src/test/kotlin/utils/FileArchiverTest.kt

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,10 @@ import io.kotest.core.spec.style.StringSpec
2323
import io.kotest.core.test.TestCase
2424
import io.kotest.engine.spec.tempdir
2525
import io.kotest.matchers.file.aFile
26+
import io.kotest.matchers.file.containFile
2627
import io.kotest.matchers.file.exist
2728
import io.kotest.matchers.file.shouldContainNFiles
29+
import io.kotest.matchers.should
2830
import io.kotest.matchers.shouldBe
2931
import io.kotest.matchers.shouldNot
3032

@@ -66,6 +68,13 @@ class FileArchiverTest : StringSpec() {
6668
file.writeText(path)
6769
}
6870

71+
private fun createEmptyFile(path: String): File {
72+
val file = workingDir.resolve(path)
73+
file.parentFile.safeMkdirs()
74+
file.createNewFile()
75+
return file
76+
}
77+
6978
/**
7079
* Assert that this directory contains a file at [path] which contains the [path] as text.
7180
*/
@@ -162,5 +171,27 @@ class FileArchiverTest : StringSpec() {
162171
archiver.unarchive(targetDir, PROVENANCE) shouldBe true
163172
targetDir shouldContainNFiles 0
164173
}
174+
175+
"exclude basic binary license file" {
176+
createEmptyFile("License").writeBytes(byteArrayOf(0xFF.toByte(), 0xD8.toByte()))
177+
178+
val archiver = FileArchiver.createDefault()
179+
archiver.archive(workingDir, PROVENANCE)
180+
val result = archiver.unarchive(targetDir, PROVENANCE)
181+
182+
result shouldBe true
183+
targetDir shouldNot containFile("License")
184+
}
185+
186+
"include utf8 file with japanese chars" {
187+
createEmptyFile("License").writeText("ぁあぃいぅうぇえぉおかが")
188+
189+
val archiver = FileArchiver.createDefault()
190+
archiver.archive(workingDir, PROVENANCE)
191+
val result = archiver.unarchive(targetDir, PROVENANCE)
192+
193+
result shouldBe true
194+
targetDir should containFile("License")
195+
}
165196
}
166197
}

0 commit comments

Comments
 (0)