Skip to content

Commit 59cb538

Browse files
committed
feat(model): Detect and exclude binary license files
Detect and exclude binary license files using Apache Tika. When a non-text file is found during the license info creation process, a warning is printed, and it is excluded from the final report. This prevents the inclusion of binary files that previously caused the reporter to enter an endless loop during report generation. Signed-off-by: Julian Olderdissen <[email protected]>
1 parent 442518a commit 59cb538

File tree

4 files changed

+46
-4
lines changed

4 files changed

+46
-4
lines changed

gradle/libs.versions.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ slf4j = "2.0.17"
6767
springCore = "6.2.7"
6868
svnkit = "1.10.12"
6969
sw360Client = "17.0.1-m2"
70+
tika = "3.1.0"
7071
wagonHttp = "3.5.3"
7172
wiremock = "3.13.0"
7273
xmlutil = "0.91.1"
@@ -186,6 +187,7 @@ slf4j = { module = "org.slf4j:slf4j-api", version.ref = "slf4j" }
186187
springCore = { module = "org.springframework:spring-core", version.ref = "springCore" }
187188
svnkit = { module = "com.tmatesoft.svnkit:svnkit", version.ref = "svnkit" }
188189
sw360Client = { module = "org.eclipse.sw360:client", version.ref = "sw360Client" }
190+
tika = { module = "org.apache.tika:tika-core", version.ref = "tika" }
189191
wagon-http = { module = "org.apache.maven.wagon:wagon-http", version.ref = "wagonHttp" }
190192
wiremock = { module = "org.wiremock:wiremock", version.ref = "wiremock" }
191193
xz = { module = "org.tukaani:xz", version.ref = "xz" }

model/build.gradle.kts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ dependencies {
3535
api(libs.jackson.dataformat.yaml)
3636
api(libs.log4j.api)
3737

38+
implementation(libs.tika)
3839
implementation(libs.bundles.exposed)
3940
implementation(libs.bundles.hoplite)
4041
implementation(libs.hikari)

model/src/main/kotlin/utils/FileArchiver.kt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import kotlin.time.measureTime
2525
import kotlin.time.measureTimedValue
2626

2727
import org.apache.logging.log4j.kotlin.logger
28+
import org.apache.tika.Tika
2829

2930
import org.ossreviewtoolkit.model.KnownProvenance
3031
import org.ossreviewtoolkit.utils.common.FileMatcher
@@ -60,6 +61,13 @@ class FileArchiver(
6061
ignoreCase = true
6162
)
6263

64+
/**
 * MIME type prefixes that disqualify a file from being archived. [isValidTextFile] rejects every file whose detected
 * type starts with one of these entries, compared case-insensitively.
 */
private val invalidTypes: List<String> = listOf(
    "application/octet-stream", "application/x-executable", "application/x-sharedlib", "application/x-object"
)
70+
6371
/**
6472
* Return whether an archive corresponding to [provenance] exists.
6573
*/
@@ -77,6 +85,11 @@ class FileArchiver(
7785
directory.packZip(zipFile, overwrite = true) { file ->
7886
val relativePath = file.relativeTo(directory).invariantSeparatorsPath
7987

88+
if (!file.isValidTextFile()) {
89+
logger.warn { "Not adding file '$relativePath' to archive because it is not a text file." }
90+
return@packZip false
91+
}
92+
8093
matcher.matches(relativePath).also { result ->
8194
logger.debug {
8295
if (result) {
@@ -125,4 +138,9 @@ class FileArchiver(
125138
logger.error { "Failed to unarchive data for $provenance: ${it.collectMessages()}" }
126139
}.isSuccess
127140
}
141+
142+
/**
 * The Tika facade used for MIME type detection. It is created lazily and only once, so that inspecting many files
 * does not construct a new facade for each of them.
 *
 * NOTE(review): this assumes the facade's detect call is safe to share between threads; confirm if archiving can run
 * concurrently on the same [FileArchiver] instance.
 */
private val tika by lazy { Tika() }

/**
 * Return whether this file is accepted for archiving, i.e. whether the MIME type detected by Tika does not start with
 * any of the [invalidTypes] (compared case-insensitively). Every type that is not listed there is accepted.
 */
private fun File.isValidTextFile(): Boolean {
    val detectedType = tika.detect(this)
    return invalidTypes.none { detectedType.startsWith(it, ignoreCase = true) }
}
128146
}

model/src/test/kotlin/utils/FileArchiverTest.kt

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,7 @@ package org.ossreviewtoolkit.model.utils
2222
import io.kotest.core.spec.style.StringSpec
2323
import io.kotest.core.test.TestCase
2424
import io.kotest.engine.spec.tempdir
25-
import io.kotest.matchers.file.aFile
26-
import io.kotest.matchers.file.exist
27-
import io.kotest.matchers.file.shouldContainNFiles
25+
import io.kotest.matchers.file.*
2826
import io.kotest.matchers.shouldBe
2927
import io.kotest.matchers.shouldNot
3028

@@ -60,10 +58,11 @@ class FileArchiverTest : StringSpec() {
6058
storage = FileProvenanceFileStorage(LocalFileStorage(storageDir), FileArchiverConfiguration.ARCHIVE_FILENAME)
6159
}
6260

63-
private fun createFile(path: String) {
61+
/**
 * Create the file at [path] below the working directory and return it. The parent directory is prepared via
 * `safeMkdirs()` first, and the [path] string itself is written as the file's content.
 */
private fun createFile(path: String): File =
    workingDir.resolve(path).also { target ->
        target.parentFile.safeMkdirs()
        target.writeText(path)
    }
6867

6968
/**
@@ -162,5 +161,27 @@ class FileArchiverTest : StringSpec() {
162161
archiver.unarchive(targetDir, PROVENANCE) shouldBe true
163162
targetDir shouldContainNFiles 0
164163
}
164+
"exclude basic binary license file" {
    // Replace the content of the created file with the raw bytes 0xFF 0xD8.
    val binaryContent = byteArrayOf(0xFF.toByte(), 0xD8.toByte())
    createFile("License").writeBytes(binaryContent)

    val fileArchiver = FileArchiver.createDefault()
    fileArchiver.archive(workingDir, PROVENANCE)
    val unarchived = fileArchiver.unarchive(targetDir, PROVENANCE)

    // Unarchiving itself succeeds, but the binary file must not show up in the target directory.
    unarchived shouldBe true
    targetDir shouldNotContainFile "License"
}
175+
"include utf8 file with japanese chars" {
    createFile("License").writeText("ぁあぃいぅうぇえぉおかがきぎ")

    val archiver = FileArchiver.createDefault()
    archiver.archive(workingDir, PROVENANCE)
    val result = archiver.unarchive(targetDir, PROVENANCE)

    result shouldBe true
    // As the test name states, a non-ASCII text license file has to be archived and restored. The previous
    // assertion checked for the file's absence, which contradicted the intent of this test.
    targetDir shouldContainFile "License"
}
165186
}
166187
}

0 commit comments

Comments
 (0)