diff --git a/inc/limits.h b/inc/limits.h index b91dedd..69122a4 100644 --- a/inc/limits.h +++ b/inc/limits.h @@ -37,9 +37,9 @@ /* Snippets */ #define DEFAULT_MATCHMAP_FILES 10000 // Default number of files evaluated in snippet matching -#define MAX_MATCHMAP_FILES (DEFAULT_MATCHMAP_FILES * 5) // Max number of files evaluated in snippet matching to prevent performance issues +#define MAX_MATCHMAP_FILES (DEFAULT_MATCHMAP_FILES * 10) // Max number of files evaluated in snippet matching to prevent performance issues +#define MIN_LINES_COVERAGE 0.8 #define SKIP_SNIPPETS_IF_FILE_BIGGER (1024 * 1024 * 4) -#define SKIP_SNIPPETS_IF_STARTS_WITH (const char*[3]) {"{", "release_date) return true; - if (!path_is_third_party(a->file) && path_is_third_party(b->file) && !(engine_flags & ENABLE_PATH_HINT)) + if (!path_is_third_party(a->file) && path_is_third_party(b->file)) { scanlog("Component rejected by third party filter\n"); return false; @@ -331,7 +331,7 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ { if (purl_source_check(a) > purl_source_check(b)) { - scanlog("Component prefered by vsource\n"); + scanlog("Component prefered by source\n"); return true; } @@ -471,39 +471,15 @@ bool load_matches(match_data_t *match) { scanlog("Load matches\n"); - /* Compile match ranges and fill up matched percent */ - int hits = 100; - int matched_percent = 100; - /* Get matching line ranges (snippet match) */ - if (match->type == MATCH_SNIPPET) - { - hits = compile_ranges(match); - scanlog("compile_ranges returns %d hits\n", hits); - - if (hits < min_match_hits) - { - match->type = MATCH_NONE; - return false; - } - - float percent = (hits * 100) / match->scan_ower->total_lines; - if (hits) - matched_percent = floor(percent); - if (matched_percent > 99) - matched_percent = 99; - if (matched_percent < 1) - matched_percent = 1; - - asprintf(&match->matched_percent, "%u%%", matched_percent); - } - else if (match->type == MATCH_BINARY) + + if (match->type == MATCH_BINARY) { asprintf(&match->line_ranges, "n/a"); asprintf(&match->oss_ranges, "n/a"); asprintf(&match->matched_percent, "%d functions matched", match->hits); } - else + else if (match->type == MATCH_FILE) { asprintf(&match->line_ranges, "all"); asprintf(&match->oss_ranges, "all"); @@ -696,11 +672,13 @@ void match_select_best(scan_data_t *scan) break; } - if (!best_match_component->identified && match_component->identified) + if ((!best_match_component->identified && match_component->identified) || + (strcmp(best_match_component->vendor,best_match_component->component) && !strcmp(match_component->vendor, match_component->component)) || + (path_is_third_party(best_match_component->file) && !path_is_third_party(match_component->file))) { scanlog("Replacing best match for a prefered component\n"); scan->matches_list_array[i]->best_match = item->match; - } + } } } diff --git a/src/match_list.c b/src/match_list.c index 276fa7d..9df35af 100644 --- a/src/match_list.c +++ b/src/match_list.c @@ -311,11 +311,11 @@ bool match_list_add(match_list_t *list, match_data_t *new_match, bool (*val)(mat } /* in autolimit mode the list doesnt have a fix size, it will accept all the matchest until a 75% of the fist element (the biggest) */ //TODO: this part of the code should be in the function pointer or I need to re-evaluate the archtecture of this function */ - if (list->autolimit && !tolerance_eval(list->headp.lh_first->match->hits, list->last_element->match->hits)) + if (list->autolimit && !tolerance_eval(list->headp.lh_first->match->lines_matched, list->last_element->match->lines_matched)) { np = list->headp.lh_first; /*We have to find and remove the unwanted elements */ - for (; np->entries.le_next != NULL && tolerance_eval(list->headp.lh_first->match->hits, np->entries.le_next->match->hits); np = np->entries.le_next) + for (; np->entries.le_next != NULL && tolerance_eval(list->headp.lh_first->match->lines_matched, np->entries.le_next->match->lines_matched); np = np->entries.le_next) { } @@ -403,6 +403,18 @@ bool match_list_print(match_list_t *list, bool (*printer)(match_data_t *fpa), ch return true; } +bool match_list_eval(match_list_t *list, match_data_t * in, bool (*eval)(match_data_t *fpa, match_data_t *fpb)) +{ + int i = 0; + for (struct entry *np = list->headp.lh_first; np != NULL && iitems; np = np->entries.le_next) + { + if(eval(np->match, in)) + return true; + i++; + } + return false; +} + void component_list_print(component_list_t *list, bool (*printer)(component_data_t *fpa), char *separator) { for (struct comp_entry *np = list->headp.lh_first; np != NULL; np = np->entries.le_next) diff --git a/src/report.c b/src/report.c index 99c058c..c650328 100644 --- a/src/report.c +++ b/src/report.c @@ -334,17 +334,19 @@ bool print_json_match(struct match_data_t * match) printf(",\"source_hash\": \"%s\"", match->source_md5); /* Output file_url (same as url when match type = url) */ - char * file_url_enabled = getenv("SCANOSS_FILE_CONTENTS"); - if (!file_url_enabled || strcmp(file_url_enabled, "false")) + char * file_contents_url = getenv("SCANOSS_FILE_CONTENTS_URL"); + if (file_contents_url && *file_contents_url && strcmp(file_contents_url, "false")) { if (!match->component_list.headp.lh_first->component->url_match) { - char *custom_url = getenv("SCANOSS_API_URL"); - printf(",\"file_url\": \"%s/file_contents/%s\"", custom_url ? custom_url : API_URL, file_id); + printf(",\"file_url\": \"%s/%s\"", file_contents_url, file_id); } else printf(",\"file_url\": \"%s\"", match->component_list.headp.lh_first->component->url); } + else //return an empty string + printf(",\"file_url\": \" \""); + free(file_id); diff --git a/src/scan.c b/src/scan.c index 608fdfd..8d586d0 100644 --- a/src/scan.c +++ b/src/scan.c @@ -59,7 +59,7 @@ scan_data_t * scan_data_init(char *target, int max_snippets, int max_components) scan_data_t * scan = calloc(1, sizeof(*scan)); scan->file_path = strdup(target); scan->file_size = malloc(32); - scan->hashes = malloc(MAX_FILE_SIZE); + scan->hashes = calloc(MAX_FILE_SIZE,1); scan->lines = malloc(MAX_FILE_SIZE); scan->match_type = MATCH_NONE; diff --git a/src/snippets.c b/src/snippets.c index 6bb4fb2..1c3717c 100644 --- a/src/snippets.c +++ b/src/snippets.c @@ -40,6 +40,7 @@ #include "match.h" #include "match_list.h" #include "stdlib.h" +#include "snippets.h" int matchmap_max_files = DEFAULT_MATCHMAP_FILES; /** @@ -91,6 +92,20 @@ static bool hit_test(match_data_t *a, match_data_t *b) else return false; } + +bool ranges_intersection(match_data_t *a, match_data_t *b) +{ + for (int i = 0; i < a->matchmap_reg->ranges_number; i++) + { + for (int j = 0; j < b->matchmap_reg->ranges_number; j++) + { + if (a->matchmap_reg->range[i].from <= b->matchmap_reg->range[j].to && + b->matchmap_reg->range[j].from <= a->matchmap_reg->range[i].to) + return true; + } + } + return false; +} /** * @brief Fill the matches list array based on the matchmap. The possible matches will be sorted by hits number. * @@ -103,9 +118,8 @@ void biggest_snippet(scan_data_t *scan) for (int i = 0; i < scan->max_snippets_to_process; i++) scan->matches_list_array_indirection[i] = -1; - int snippet_tolerance = range_tolerance / scan->max_snippets_to_process + min_match_lines; /* Used to define bounds between two possible snippets */ /*Fill the matches list with the files from the matchmap */ - for (int sector = 0; sector < 255; sector++) + for (int sector = 0; sector < 256; sector++) { int j = scan->matchmap_rank_by_sector[sector]; @@ -115,48 +129,65 @@ void biggest_snippet(scan_data_t *scan) if (scan->matchmap[j].hits >= min_match_hits) /* Only consider file with more than min_match_hits */ { match_data_t *match_new = calloc(1, sizeof(match_data_t)); /* Create a match object */ - memcpy(match_new->file_md5, scan->matchmap[j].md5, MD5_LEN); + memcpy(match_new->file_md5, scan->matchmap[j].md5, oss_file.key_ln); match_new->hits = scan->matchmap[j].hits; match_new->matchmap_reg = &scan->matchmap[j]; match_new->type = scan->match_type; match_new->from = scan->matchmap[j].range->from; strcpy(match_new->source_md5, scan->source_md5); match_new->scan_ower = scan; - bool found = false; int i = 0; - for (; i < scan->matches_list_array_index; i++) /*Check if there is already a list for this line ranges */ + + if (snippet_extension_discard(match_new)) { - if (scan->matches_list_array_indirection[i] > -1 && - abs(scan->matches_list_array_indirection[i] - match_new->from) < snippet_tolerance) - { - found = true; - break; - } + match_data_free(match_new); + continue; + } + + int matched_lines = compile_ranges(match_new); + if (matched_lines < min_match_lines) { + match_data_free(match_new); + continue; } - if (!found) /*If there is no list for the snippet range we have to create a new one */ + float percent = (matched_lines * 100) / match_new->scan_ower->total_lines; + int matched_percent = floor(percent); + if (matched_percent > 99) + matched_percent = 99; + if (matched_percent < 1) + matched_percent = 1; + asprintf(&match_new->matched_percent, "%u%%", matched_percent); + match_new->lines_matched = matched_lines; + //match_new->hits = hits; + + do /*Check if there is already a list for this line ranges */ { - if (scan->matches_list_array_index < scan->max_snippets_to_process) /* Check for the list limit */ + if (!scan->matches_list_array[scan->matches_list_array_index] && scan->matches_list_array_index < scan->max_snippets_to_process) { - scan->matches_list_array_indirection[scan->matches_list_array_index] = match_new->from; /*update indirection*/ - scan->matches_list_array[scan->matches_list_array_index] = match_list_init(true, 1); /*create the list*/ - i = scan->matches_list_array_index; /* update index*/ + scan->matches_list_array[scan->matches_list_array_index] = match_list_init(true, 1); /*create the list if it doesnt exist*/ scan->matches_list_array_index++; + if(!match_list_add(scan->matches_list_array[i], match_new, hit_test, true)) + { + match_data_free(match_new); + } + break; } - else - i = scan->max_snippets_to_process - 1; /*add in the last available list if there is no more space for new lists*/ - } - - if (snippet_extension_discard(match_new) || !match_list_add(scan->matches_list_array[i], match_new, hit_test, true)) /*Add the match in the selected list */ - { - scanlog("Rejected match with %d hits\n", match_new->hits); - match_data_free(match_new); /* the the memory if the match was not accepted in the list */ - } + if (match_list_eval(scan->matches_list_array[i], match_new, ranges_intersection) || i == scan->max_snippets_to_process -1) + { + if(!match_list_add(scan->matches_list_array[i], match_new, hit_test, true)) + { + match_data_free(match_new); + } + break; + } + i++; + } while(i < scan->matches_list_array_index); /*Check if there is already a list for this line ranges */ } } /*just for loging*/ if (debug_on) { + scanlog("Match list array index: %d\n", scan->matches_list_array_index); for (int i = 0; i < scan->matches_list_array_index; i++) { scanlog("Match list N %d, with %d matches. %d <= HITS <= %d \n", i, scan->matches_list_array[i]->items, @@ -165,8 +196,8 @@ void biggest_snippet(scan_data_t *scan) struct entry *item = NULL; LIST_FOREACH(item, &scan->matches_list_array[i]->headp, entries) { - char md5_hex[MD5_LEN * 2 + 1]; - ldb_bin_to_hex(item->match->file_md5, MD5_LEN, md5_hex); + char md5_hex[oss_file.key_ln * 2 + 1]; + ldb_bin_to_hex(item->match->file_md5, oss_file.key_ln, md5_hex); scanlog("%s - %d\n", md5_hex, item->match->hits); } } @@ -348,8 +379,11 @@ matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size) { if(out_ranges_index >= 0 && (ranges[i].from - tolerance <= out_ranges[out_ranges_index].to)) { + if (out_ranges[out_ranges_index].to > ranges[i].to) + continue; + out_ranges[out_ranges_index].to = ranges[i].to; - scanlog("join range %d with %d\n", i, out_ranges_index); + //scanlog("join range %d with %d: %d - %d\n", i, out_ranges_index, out_ranges[out_ranges_index].from, out_ranges[out_ranges_index].to); } else { @@ -397,35 +431,7 @@ uint32_t compile_ranges(match_data_t *match) return 0; } - uint16_t reported_hits = match->matchmap_reg->hits; int hits = 0; - /* Revise hits and decrease if needed */ - for (uint32_t i = 0; i < match->matchmap_reg->ranges_number; i++) - { - long from = match->matchmap_reg->range[i].from; //uint16_read(match->matchmap_reg + MD5_LEN + 2 + i * 6); - long to = match->matchmap_reg->range[i].to; //uint16_read(match->matchmap_reg + MD5_LEN + 2 + i * 6 + 2); - long delta = to - from; - - if (to < 1) - break; - - /* Ranges to be ignored (under min_match_lines) should decrease hits counter */ - if (delta < min_match_lines) - { - /* Single-line range decreases by 1, otherwise decrease by 2 (from and to) */ - reported_hits -= ((delta == 0) ? 1 : 2); - } - - /* Exit if hits is below two */ - if (reported_hits < min_match_hits) - { - scanlog("Discarted ranges brings hits count to %u\n", reported_hits); - return 0; - } - - scanlog("compile_ranges #%d = %ld to %ld - OSS from: %d\n", i, from, to, match->matchmap_reg->range[i].oss_line); - } - /* Add tolerances and assemble line ranges */ ranges_sort(match->matchmap_reg->range, match->matchmap_reg->ranges_number); @@ -800,7 +806,7 @@ match_t ldb_scan_snippets(scan_data_t *scan) } if (cat_limit > scan->max_matchmap_size) { - if ((hashes_to_process < scan->hash_count / 10 || (float) lines_coverage / scan->hash_count < 0.6) && cat_limit < MAX_MATCHMAP_FILES) + if ((hashes_to_process < scan->hash_count / 10 || (float) lines_coverage / scan->hash_count < MIN_LINES_COVERAGE) && cat_limit < MAX_MATCHMAP_FILES) { scan->max_matchmap_size += map[map_indirection[i][j]].size; } diff --git a/src/util.c b/src/util.c index cfbcb99..4151d66 100644 --- a/src/util.c +++ b/src/util.c @@ -330,10 +330,10 @@ void free_and_null(void * pr) bool path_is_third_party(const char* path) { - // Array de patrones comunes const char* patterns[] = { "third_party", "3rdparty", + "site-packages", "vendor", "external", "dependencies", @@ -376,10 +376,8 @@ bool path_is_third_party(const char* path) "LibResources" }; - // Número de patrones a verificar const int numPatterns = sizeof(patterns) / sizeof(patterns[0]); - // Verificar cada patrón for (int i = 0; i < numPatterns; i++) { if (strstr(path, patterns[i]) != NULL) diff --git a/src/versions.c b/src/versions.c index 405beed..2639bf8 100644 --- a/src/versions.c +++ b/src/versions.c @@ -55,10 +55,10 @@ void normalise_version(char *version, char *component) return; char aux[MAX_FIELD_LN] = "\0"; + int compt_len = strlen(component); /* Remove leading component name from version */ - if ((version && component) && stristart(version, component)) + if ((version && component) && stristart(version, component) && strlen(version) > compt_len + 1) { - int compt_len = strlen(component); sprintf(aux, "%s",version + compt_len + 1); }