Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

[System.Globalization.Native] Fix small issues in CloneCollatorWithOptions and GetCollatorFromSortHandle #24100

Merged
merged 5 commits into from
Apr 25, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 41 additions & 83 deletions src/corefx/System.Globalization.Native/pal_collation.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ const int32_t CompareOptionsIgnoreNonSpace = 0x2;
const int32_t CompareOptionsIgnoreSymbols = 0x4;
const int32_t CompareOptionsIgnoreKanaType = 0x8;
const int32_t CompareOptionsIgnoreWidth = 0x10;
const int32_t CompareOptionsMask = 0x1f;
// const int32_t CompareOptionsStringSort = 0x20000000;
// ICU's default is to use "StringSort", i.e. nonalphanumeric symbols come before alphanumeric.
// When StringSort is not specified (.NET's default), the sort order will be different between
Expand All @@ -41,21 +42,11 @@ typedef struct { int32_t key; UCollator* UCollator; } TCollatorMap;
*/
struct SortHandle
{
UCollator* regular;
pthread_mutex_t collatorsLockObject;
void* collatorsPerOptionRoot;
UCollator* collatorsPerOption[CompareOptionsMask + 1];
};

typedef struct { UChar* items; size_t capacity; size_t size; } UCharList;

static int TreeComparer(const void* left, const void* right)
{
const TCollatorMap* leftMap = left;
const TCollatorMap* rightMap = right;
if (leftMap->key < rightMap->key) return -1;
if (leftMap->key > rightMap->key) return 1;
return 0;
}
typedef struct { UChar* items; size_t size; } UCharList;

// Hiragana character range
const UChar hiraganaStart = 0x3041;
Expand Down Expand Up @@ -137,24 +128,6 @@ static int IsHalfFullHigherSymbol(UChar character)
|| (0xff61 <= character && character <= 0xff65);
}

static int AddItem(UCharList* list, const UChar item)
{
size_t size = list->size++;
if (size >= list->capacity)
{
list->capacity *= 2;
UChar* ptr = (UChar*)realloc(list->items, list->capacity * sizeof(UChar*));
if (ptr == NULL)
{
return FALSE;
}
list->items = ptr;
}

list->items[size] = item;
return TRUE;
}

/*
Gets a string of custom collation rules, if necessary.

Expand Down Expand Up @@ -184,19 +157,19 @@ static UCharList* GetCustomRules(int32_t options, UColAttributeValue strength, i
}

// If we need to create customRules, the KanaType custom rule will be 88 kana characters * 4 = 352 chars long
// and the Width custom rule will be at least 215 halfwidth characters * 4 = 860 chars long.
// Use 512 as the starting size, so the customRules won't have to grow if we are just
// doing the KanaType custom rule.
customRules->capacity = 512;
customRules->items = calloc(customRules->capacity, sizeof(UChar));
// and the Width custom rule will be at most 212 halfwidth characters * 5 = 1060 chars long.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

212 [](start = 49, length = 3)

How did you get 212 here?

looking at https://unicode.org/charts/PDF/UFF00.pdf, If considering all half width characters then this number would be really 201.

Also, the Kana characters should be 96?

I am ok to use a bigger numbers as I am not expecting the allocation will be much bigger anyway.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

linking the original PR introduced creating this custom rule #2196


In reply to: 277037987 [](ancestors = 277037987)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How did you get 212 here?

The constant g_HalfFullCharsLength declared in the file. The description could be wrong though.

Also, the Kana characters should be 96?

Taken from the actual loop and comments in the same function.

int capacity =
((needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule) ? 4 * (hiraganaEnd - hiraganaStart + 1) : 0) +
((needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule) ? 5 * g_HalfFullCharsLength : 0);

UChar* items;
customRules->items = items = malloc(capacity * sizeof(UChar));
if (customRules->items == NULL)
{
free(customRules);
return NULL;
}

customRules->size = 0;

if (needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule)
{
UChar compareChar = needsIgnoreKanaTypeCustomRule ? '=' : '<';
Expand All @@ -206,15 +179,11 @@ static UCharList* GetCustomRules(int32_t options, UColAttributeValue strength, i
// Hiragana is the range 3041 to 3096 & 309D & 309E
if (hiraganaChar <= 0x3096 || hiraganaChar >= 0x309D) // characters between 3096 and 309D are not mapped to katakana
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

0x309D [](start = 58, length = 6)

This should be 309F now?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://unicode.org/charts/PDF/U3040.pdf


In reply to: 277039452 [](ancestors = 277039452)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you prefer me to update it here or in a separate PR?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe @eerhardt can comment why we didn't include 309F in the first place as this introduced awhile ago in Unicode 3.2. but you shouldn't block on this.

{
if(!(AddItem(customRules, '&') &&
AddItem(customRules, hiraganaChar) &&
AddItem(customRules, compareChar) &&
AddItem(customRules, hiraganaChar + hiraganaToKatakanaOffset)))
{
free(customRules->items);
free(customRules);
return NULL;
}
assert(items - customRules->items <= capacity - 4);
*(items++) = '&';
*(items++) = hiraganaChar;
*(items++) = compareChar;
*(items++) = hiraganaChar + hiraganaToKatakanaOffset;
}
}
}
Expand All @@ -237,20 +206,21 @@ static UCharList* GetCustomRules(int32_t options, UColAttributeValue strength, i
// this character is a symbol, and if so skip it
if (!(isIgnoreSymbols && needsNotIgnoreWidthCustomRule && (needsEscape || IsHalfFullHigherSymbol(higherChar))))
{
if(!(AddItem(customRules, '&') &&
(!needsEscape || AddItem(customRules, '\\')) &&
AddItem(customRules, lowerChar) &&
AddItem(customRules, compareChar) &&
AddItem(customRules, higherChar)))
assert(items - customRules->items <= capacity - 5);
*(items++) = '&';
if (needsEscape)
{
free(customRules->items);
free(customRules);
return NULL;
*(items++) = '\\';
}
*(items++) = lowerChar;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be hardened for potential buffer overruns. It is dangerous to allocate a buffer using complex buffer size formula and then start filling it using complex algorithm assuming that the formula and algorithm are in sync.

I would keep the Add method method and assert+fail if the buffer is not big enough.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is already an assert on both places where this is used (line 206 and 179).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would make the method fail in release build too.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that's necessary. There's a very limited number of input values (31) and they should all be exercised through the CoreFX tests. The CI should be able to catch any error in the debug build and bail out on the assert.

I will change the calculation to get rid of the magic 88 value in favour of the constraints from the first loop.

I'll also go and check that CoreFX actually has tests for this.

Copy link
Member Author

@filipnavara filipnavara Apr 21, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The tests in https://github.com/dotnet/corefx/blob/a10890f4ffe0fadf090c922578ba0e606ebdd16c/src/System.Globalization/tests/CompareInfo/CompareInfoTests.Compare.cs hit both of the loops in the GetCustomRules function. If the code was broken the asserts would fire on the CI.

*(items++) = compareChar;
*(items++) = higherChar;
}
}
}

customRules->size = items - customRules->items;

return customRules;
}

Expand Down Expand Up @@ -280,7 +250,7 @@ UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t options,

UCollator* pClonedCollator;
UCharList* customRules = GetCustomRules(options, strength, isIgnoreSymbols);
if (customRules == NULL)
if (customRules == NULL || customRules->size == 0)
{
pClonedCollator = ucol_safeClone(pCollator, NULL, NULL, pErr);
}
Expand All @@ -305,8 +275,8 @@ UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t options,

pClonedCollator = ucol_openRules(completeRules, completeRulesLength, UCOL_DEFAULT, strength, NULL, pErr);
free(completeRules);
free(customRules);
}
free(customRules);

if (isIgnoreSymbols)
{
Expand Down Expand Up @@ -371,9 +341,9 @@ void CreateSortHandle(SortHandle** ppSortHandle)
return;
}

(*ppSortHandle)->collatorsPerOptionRoot = NULL;
int result = pthread_mutex_init(&(*ppSortHandle)->collatorsLockObject, NULL);
memset(*ppSortHandle, 0, sizeof(SortHandle));

int result = pthread_mutex_init(&(*ppSortHandle)->collatorsLockObject, NULL);
if (result != 0)
{
assert(FALSE && "Unexpected pthread_mutex_init return value.");
Expand All @@ -392,7 +362,7 @@ ResultCode GlobalizationNative_GetSortHandle(const char* lpLocaleName, SortHandl

UErrorCode err = U_ZERO_ERROR;

(*ppSortHandle)->regular = ucol_open(lpLocaleName, &err);
(*ppSortHandle)->collatorsPerOption[0] = ucol_open(lpLocaleName, &err);

if (U_FAILURE(err))
{
Expand All @@ -406,15 +376,13 @@ ResultCode GlobalizationNative_GetSortHandle(const char* lpLocaleName, SortHandl

void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle)
{
ucol_close(pSortHandle->regular);
pSortHandle->regular = NULL;

while (pSortHandle->collatorsPerOptionRoot != NULL)
for (int i = 0; i <= CompareOptionsMask; i++)
{
TCollatorMap* data = *(TCollatorMap **)pSortHandle->collatorsPerOptionRoot;
tdelete(data, &pSortHandle->collatorsPerOptionRoot, TreeComparer);
ucol_close(data->UCollator);
free(data);
if (pSortHandle->collatorsPerOption[i] != NULL)
{
ucol_close(pSortHandle->collatorsPerOption[i]);
pSortHandle->collatorsPerOption[i] = NULL;
}
}

pthread_mutex_destroy(&pSortHandle->collatorsLockObject);
Expand All @@ -427,7 +395,7 @@ const UCollator* GetCollatorFromSortHandle(SortHandle* pSortHandle, int32_t opti
UCollator* pCollator;
if (options == 0)
{
pCollator = pSortHandle->regular;
pCollator = pSortHandle->collatorsPerOption[0];
}
else
{
Expand All @@ -437,22 +405,12 @@ const UCollator* GetCollatorFromSortHandle(SortHandle* pSortHandle, int32_t opti
assert(FALSE && "Unexpected pthread_mutex_lock return value.");
}

TCollatorMap* map = (TCollatorMap*)malloc(sizeof(TCollatorMap));
map->key = options;
// tfind on glibc is significantly faster than tsearch and we expect
// to hit the cache here often so it's benefitial to prefer lookup time
// over addition time
void* entry = tfind(map, &pSortHandle->collatorsPerOptionRoot, TreeComparer);
if (entry == NULL)
{
pCollator = CloneCollatorWithOptions(pSortHandle->regular, options, pErr);
map->UCollator = pCollator;
tsearch(map, &pSortHandle->collatorsPerOptionRoot, TreeComparer);
}
else
options &= CompareOptionsMask;
pCollator = pSortHandle->collatorsPerOption[options];
if (pCollator == NULL)
{
free(map);
pCollator = (*(TCollatorMap**)entry)->UCollator;
pCollator = CloneCollatorWithOptions(pSortHandle->collatorsPerOption[0], options, pErr);
pSortHandle->collatorsPerOption[options] = pCollator;
}

pthread_mutex_unlock(&pSortHandle->collatorsLockObject);
Expand Down