Skip to content

Commit 735bbb0

Browse files
committed
feat(model): Detect and exclude binary license files
Adds logic to detect binary license files using Apache Tika. If a non-text file is found during the license info creation process, a warning is logged and the file is skipped, so it is not included in the final report. This avoids including binary files, which previously caused the reporter to enter an endless loop during report generation. Signed-off-by: Julian Olderdissen <[email protected]>
1 parent d7575a7 commit 735bbb0

File tree

2 files changed

+51
-1
lines changed

2 files changed

+51
-1
lines changed

model/src/main/kotlin/utils/FileArchiver.kt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import kotlin.time.measureTime
2525
import kotlin.time.measureTimedValue
2626

2727
import org.apache.logging.log4j.kotlin.logger
28+
import org.apache.tika.Tika
2829

2930
import org.ossreviewtoolkit.model.KnownProvenance
3031
import org.ossreviewtoolkit.utils.common.FileMatcher
@@ -60,6 +61,8 @@ class FileArchiver(
6061
ignoreCase = true
6162
)
6263

64+
// Tika facade, used by isValidTextFile() to detect the media type of a file.
private val tika = Tika()
65+
6366
/**
6467
* Return whether an archive corresponding to [provenance] exists.
6568
*/
@@ -77,6 +80,11 @@ class FileArchiver(
7780
directory.packZip(zipFile, overwrite = true) { file ->
7881
val relativePath = file.relativeTo(directory).invariantSeparatorsPath
7982

83+
if (!file.isValidTextFile()) {
84+
logger.warn { "Not adding file '$relativePath' to archive because it is not a text file." }
85+
return@packZip false
86+
}
87+
8088
matcher.matches(relativePath).also { result ->
8189
logger.debug {
8290
if (result) {
@@ -125,4 +133,15 @@ class FileArchiver(
125133
logger.error { "Failed to unarchive data for $provenance: ${it.collectMessages()}" }
126134
}.isSuccess
127135
}
136+
137+
/**
 * Return whether this file is considered a text file, based on the media type that [tika] detects for it. The check
 * is a deny-list: the file is rejected only if the detected type starts with one of the generic binary types listed
 * below. Any other detected type is accepted.
 *
 * NOTE(review): Non-text types that are not on this list (presumably e.g. image or archive types) are treated as
 * valid text files. Confirm that this is intended.
 */
internal fun File.isValidTextFile(): Boolean {
    val binaryTypePrefixes = arrayOf(
        "application/octet-stream",
        "application/x-executable",
        "application/x-sharedlib",
        "application/x-object"
    )

    val mediaType = tika.detect(this)

    // Match case-insensitively and by prefix, so that variants of the listed types are rejected as well.
    return !binaryTypePrefixes.any { prefix -> mediaType.startsWith(prefix, ignoreCase = true) }
}
128147
}

model/src/test/kotlin/utils/FileArchiverTest.kt

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@ import io.kotest.core.test.TestCase
2424
import io.kotest.engine.spec.tempdir
2525
import io.kotest.matchers.file.aFile
2626
import io.kotest.matchers.file.exist
27+
import io.kotest.matchers.file.shouldContainFile
2728
import io.kotest.matchers.file.shouldContainNFiles
29+
import io.kotest.matchers.file.shouldNotContainFile
2830
import io.kotest.matchers.shouldBe
2931
import io.kotest.matchers.shouldNot
3032

@@ -60,10 +62,11 @@ class FileArchiverTest : StringSpec() {
6062
storage = FileProvenanceFileStorage(LocalFileStorage(storageDir), FileArchiverConfiguration.ARCHIVE_FILENAME)
6163
}
6264

63-
private fun createFile(path: String) {
65+
/**
 * Create the file at [path] below the working directory, creating its parent directories first, and write [path]
 * itself as the file content. Return the created file so that callers can overwrite its content.
 */
private fun createFile(path: String): File =
    workingDir.resolve(path).also { created ->
        created.parentFile.safeMkdirs()
        created.writeText(path)
    }
6871

6972
/**
@@ -162,5 +165,33 @@ class FileArchiverTest : StringSpec() {
162165
archiver.unarchive(targetDir, PROVENANCE) shouldBe true
163166
targetDir shouldContainNFiles 0
164167
}
168+
"exclude basic binary license file" {
    // Overwrite the default text content with two raw bytes that do not form valid text.
    val binaryContent = byteArrayOf(0xFF.toByte(), 0xD8.toByte())
    createFile("License").writeBytes(binaryContent)

    val archiver = FileArchiver.createDefault()
    archiver.archive(workingDir, PROVENANCE)

    // Unarchiving must still succeed, but the binary file must not have been archived.
    archiver.unarchive(targetDir, PROVENANCE) shouldBe true
    targetDir.shouldNotContainFile("License")
}
181+
"include utf8 file with japanese chars" {
    // Non-ASCII text must still be recognized as text and therefore be archived.
    val japaneseText = "ぁあぃいぅうぇえぉおかがきぎ"
    createFile("License").writeText(japaneseText, charset = Charsets.UTF_8)

    val archiver = FileArchiver.createDefault()
    archiver.archive(workingDir, PROVENANCE)

    archiver.unarchive(targetDir, PROVENANCE) shouldBe true
    targetDir.shouldContainFile("License")
}
165196
}
166197
}

0 commit comments

Comments
 (0)