-
Notifications
You must be signed in to change notification settings - Fork 356
Add an option to prefer ScanCode's file- over line-level findings #8152
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9a35bd6
bf8a138
ae20031
7245f60
e6deb43
0046a2e
b405929
24e7163
ead5b94
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,8 +22,6 @@ package org.ossreviewtoolkit.plugins.scanners.scancode | |
| import java.io.File | ||
| import java.time.Instant | ||
|
|
||
| import kotlin.math.max | ||
|
|
||
| import org.apache.logging.log4j.kotlin.logger | ||
|
|
||
| import org.ossreviewtoolkit.model.ScanSummary | ||
|
|
@@ -40,7 +38,6 @@ import org.ossreviewtoolkit.utils.common.Options | |
| import org.ossreviewtoolkit.utils.common.Os | ||
| import org.ossreviewtoolkit.utils.common.ProcessCapture | ||
| import org.ossreviewtoolkit.utils.common.safeDeleteRecursively | ||
| import org.ossreviewtoolkit.utils.common.splitOnWhitespace | ||
| import org.ossreviewtoolkit.utils.common.withoutPrefix | ||
| import org.ossreviewtoolkit.utils.ort.createOrtTempDir | ||
|
|
||
|
|
@@ -55,49 +52,30 @@ import org.semver4j.Semver | |
| * configuration [options][PluginConfiguration.options]: | ||
| * | ||
| * * **"commandLine":** Command line options that modify the result. These are added to the [ScannerDetails] when | ||
| * looking up results from the [ScanResultsStorage]. Defaults to [DEFAULT_CONFIGURATION_OPTIONS]. | ||
| * looking up results from the [ScanResultsStorage]. Defaults to [ScanCodeConfig.DEFAULT_COMMAND_LINE_OPTIONS]. | ||
| * * **"commandLineNonConfig":** Command line options that do not modify the result and should therefore not be | ||
| * considered in [configuration], like "--processes". Defaults to [DEFAULT_NON_CONFIGURATION_OPTIONS]. | ||
| * considered in [configuration], like "--processes". Defaults to | ||
| * [ScanCodeConfig.DEFAULT_COMMAND_LINE_NON_CONFIG_OPTIONS]. | ||
| * * **preferFileLicense**: A flag to indicate whether the "high-level" per-file license reported by ScanCode starting | ||
| * with version 32 should be used instead of the individual "low-level" per-line license findings. The per-file | ||
| * license may be different from the conjunction of per-line licenses and is supposed to contain fewer | ||
| * false-positives. However, no exact line numbers can be associated to the per-file license anymore. If enabled, the | ||
| * start line of the per-file license finding is set to the minimum of all start lines for per-line findings in that | ||
| * file, the end line is set to the maximum of all end lines for per-line findings in that file, and the score is set | ||
| * to the arithmetic average of the scores of all per-line findings in that file. | ||
| */ | ||
| class ScanCode internal constructor( | ||
| name: String, | ||
| config: ScanCodeConfig, | ||
| private val config: ScanCodeConfig, | ||
| private val wrapperConfig: ScannerWrapperConfig | ||
| ) : CommandLinePathScannerWrapper(name) { | ||
| // This constructor is required by the `RequirementsCommand`. | ||
| constructor(name: String, wrapperConfig: ScannerWrapperConfig) : this(name, ScanCodeConfig.EMPTY, wrapperConfig) | ||
| constructor(name: String, wrapperConfig: ScannerWrapperConfig) : this(name, ScanCodeConfig.DEFAULT, wrapperConfig) | ||
|
|
||
| companion object { | ||
| const val SCANNER_NAME = "ScanCode" | ||
|
|
||
| private const val LICENSE_REFERENCES_OPTION_VERSION = "32.0.0" | ||
| private const val OUTPUT_FORMAT = "json-pp" | ||
| private const val TIMEOUT = 300 | ||
|
|
||
| /** | ||
| * Configuration options that are relevant for [configuration] because they change the result file. | ||
| */ | ||
| private val DEFAULT_CONFIGURATION_OPTIONS = listOf( | ||
| "--copyright", | ||
| "--license", | ||
| "--info", | ||
| "--strip-root", | ||
| "--timeout", TIMEOUT.toString() | ||
| ) | ||
|
|
||
| /** | ||
| * Configuration options that are not relevant for [configuration] because they do not change the result | ||
| * file. | ||
| */ | ||
| private val DEFAULT_NON_CONFIGURATION_OPTIONS = listOf( | ||
| "--processes", max(1, Runtime.getRuntime().availableProcessors() - 1).toString() | ||
| ) | ||
|
|
||
| private val OUTPUT_FORMAT_OPTION = if (OUTPUT_FORMAT.startsWith("json")) { | ||
| "--$OUTPUT_FORMAT" | ||
| } else { | ||
| "--output-$OUTPUT_FORMAT" | ||
| } | ||
| } | ||
|
|
||
| class Factory : ScannerWrapperFactory<ScanCodeConfig>(SCANNER_NAME) { | ||
|
|
@@ -107,35 +85,33 @@ class ScanCode internal constructor( | |
| override fun parseConfig(options: Options, secrets: Options) = ScanCodeConfig.create(options) | ||
| } | ||
|
|
||
| override val matcher by lazy { ScannerMatcher.create(details, wrapperConfig.matcherConfig) } | ||
fviernau marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| override val readFromStorage by lazy { wrapperConfig.readFromStorageWithDefault(matcher) } | ||
|
|
||
| override val writeToStorage by lazy { wrapperConfig.writeToStorageWithDefault(matcher) } | ||
|
|
||
| override val configuration by lazy { | ||
| buildList { | ||
| addAll(configurationOptions) | ||
| add(OUTPUT_FORMAT_OPTION) | ||
| }.joinToString(" ") | ||
| } | ||
|
|
||
| private val configurationOptions = config.commandLine?.splitOnWhitespace() ?: DEFAULT_CONFIGURATION_OPTIONS | ||
| private val nonConfigurationOptions = config.commandLineNonConfig?.splitOnWhitespace() | ||
| ?: DEFAULT_NON_CONFIGURATION_OPTIONS | ||
| private val commandLineOptions by lazy { getCommandLineOptions(version) } | ||
|
|
||
| internal fun getCommandLineOptions(version: String) = | ||
| buildList { | ||
| addAll(configurationOptions) | ||
| addAll(nonConfigurationOptions) | ||
| addAll(config.commandLine) | ||
| addAll(config.commandLineNonConfig) | ||
fviernau marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| if (Semver(version).isGreaterThanOrEqualTo(LICENSE_REFERENCES_OPTION_VERSION)) { | ||
| // Required to be able to map ScanCode license keys to SPDX IDs. | ||
| add("--license-references") | ||
| } | ||
| } | ||
|
|
||
| val commandLineOptions by lazy { getCommandLineOptions(version) } | ||
| override val configuration by lazy { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Adding the per file findings as entries into the text location based license findings which correspond to a line range an a specific file, also implies that the existing license finding curation mechanism becomes applicable to these per file entries. So, from the perspective of creating the license finding curations, I wonder about the implication such as would we accept (e.g. in ort-config) also license finding curations which match such per file license findings?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I think we should, mostly for two reasons:
|
||
| buildList { | ||
| addAll(config.commandLine) | ||
|
|
||
| // Add this in the style of a fake command line option for consistency with the above. | ||
| if (config.preferFileLicense) add("--prefer-file-license") | ||
| }.joinToString(" ") | ||
| } | ||
|
|
||
| override val matcher by lazy { ScannerMatcher.create(details, wrapperConfig.matcherConfig) } | ||
|
|
||
| override val readFromStorage by lazy { wrapperConfig.readFromStorageWithDefault(matcher) } | ||
|
|
||
| override val writeToStorage by lazy { wrapperConfig.writeToStorageWithDefault(matcher) } | ||
|
|
||
| override fun command(workingDir: File?) = | ||
| listOfNotNull(workingDir, if (Os.isWindows) "scancode.bat" else "scancode").joinToString(File.separator) | ||
|
|
@@ -179,7 +155,7 @@ class ScanCode internal constructor( | |
| } | ||
|
|
||
| override fun createSummary(result: String, startTime: Instant, endTime: Instant): ScanSummary = | ||
| parseResult(result).toScanSummary() | ||
| parseResult(result).toScanSummary(config.preferFileLicense) | ||
|
|
||
| /** | ||
| * Execute ScanCode with the configured arguments to scan the given [path] and produce [resultFile]. | ||
|
|
@@ -188,8 +164,8 @@ class ScanCode internal constructor( | |
| ProcessCapture( | ||
| command(), | ||
| *commandLineOptions.toTypedArray(), | ||
| path.absolutePath, | ||
| OUTPUT_FORMAT_OPTION, | ||
| resultFile.absolutePath | ||
| // The output format option needs to directly precede the result file path. | ||
| "--json-pp", resultFile.absolutePath, | ||
| path.absolutePath | ||
| ) | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -58,7 +58,7 @@ private data class LicenseMatch( | |
| val score: Float | ||
| ) | ||
|
|
||
| fun ScanCodeResult.toScanSummary(): ScanSummary { | ||
| fun ScanCodeResult.toScanSummary(preferFileLicense: Boolean = false): ScanSummary { | ||
| val licenseFindings = mutableSetOf<LicenseFinding>() | ||
| val copyrightFindings = mutableSetOf<CopyrightFinding>() | ||
| val issues = mutableListOf<Issue>() | ||
|
|
@@ -91,19 +91,31 @@ fun ScanCodeResult.toScanSummary(): ScanSummary { | |
| it.value.first() | ||
| } | ||
|
|
||
| licenses.mapTo(licenseFindings) { license -> | ||
| // ScanCode uses its own license keys as identifiers in license expressions. | ||
| val spdxLicenseExpression = license.licenseExpression.mapLicense(scanCodeKeyToSpdxIdMappings) | ||
|
|
||
| LicenseFinding( | ||
| license = spdxLicenseExpression, | ||
| if (preferFileLicense && file is FileEntry.Version3 && file.detectedLicenseExpressionSpdx != null) { | ||
| licenseFindings += LicenseFinding( | ||
| license = file.detectedLicenseExpressionSpdx, | ||
| location = TextLocation( | ||
| path = file.path, | ||
| startLine = license.startLine, | ||
| endLine = license.endLine | ||
| startLine = licenses.minOf { it.startLine }, | ||
| endLine = licenses.maxOf { it.endLine } | ||
| ), | ||
| score = license.score | ||
| score = licenses.map { it.score }.average().toFloat() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would you know the rational DOS use when coming up with an
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
@Etsija and @lamppu probably know better, but the average is pretty much the only metric that somewhat makes sense and is reasonably easy to calculate. Min. does not make sense, max. does not make sense. Of course, you could do something "crazy" like only taking the scores of those line findings into account whose licenses are also contained in the file finding, and maybe even weight the score according to the number of lines matched, but that's much more work for questionable benefit.
Let's ask the author's: @pombredanne, does ScanCode's
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Another alternative would be to omit
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Making lines optional would be a bigger change in the data model, and it would prevent us from matching copyright to license findings, and probably have more implications. However, I forgot that the
More likely to be flaky than what, than the individual per-line findings? As the lines and score are calculated from the per-line findings, they can only be flaky if the per-line findings themselves are already flaky.
Correct, but that's the way forward how ScanCode will operate, I guess. As this discussion shows, there seem to be no endeavors to enhance the quality of the per-line findings themselves, but to achieve better quality by post-processing these findings into the per-file finding.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't much about the processing inside of scancode, so I don't want to move this discussion off-topic,
I believe this would exhibit several advantages over the current solution. Also, IMO it'd be nice that per line findings would no more be mutually exclusive from per file findings. @pombredanne - what do you think about this idea?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I agree, if that's possible, and ScanCode does not require internal data for this that's not written to the result file.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
@fviernau I still need you input here. I believe that's the last thing to clarify before we could merge?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd have slight tendency to omit it, but I also believe my knowledge is too limited in that area to make the call. So, all in all I prefer to omit it, but would be fine if you kept it. Up to you.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @mnonnenmacher what do you think about #8152 (comment) ? |
||
| ) | ||
|
Comment on lines
+94
to
103
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| } else { | ||
| licenses.mapTo(licenseFindings) { license -> | ||
| // ScanCode uses its own license keys as identifiers in license expressions. | ||
| val spdxLicenseExpression = license.licenseExpression.mapLicense(scanCodeKeyToSpdxIdMappings) | ||
|
|
||
| LicenseFinding( | ||
| license = spdxLicenseExpression, | ||
| location = TextLocation( | ||
| path = file.path, | ||
| startLine = license.startLine, | ||
| endLine = license.endLine | ||
| ), | ||
| score = license.score | ||
| ) | ||
| } | ||
| } | ||
|
|
||
| file.copyrights.mapTo(copyrightFindings) { copyright -> | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.