Skip to content

Commit 8968a5c

Browse files
authored
FEAT: Support Emojis for data types (#166)
### Work Item / Issue Reference <!-- IMPORTANT: Please follow the PR template guidelines below. For mssql-python maintainers: Insert your ADO Work Item ID below (e.g. AB#37452) For external contributors: Insert Github Issue number below (e.g. #149) Only one reference is required - either GitHub issue OR ADO Work Item. --> <!-- mssql-python maintainers: ADO Work Item --> > [AB#37664](https://sqlclientdrivers.visualstudio.com/c6d89619-62de-46a0-8b46-70b92a84d85e/_workitems/edit/37664) <!-- External contributors: GitHub Issue --> > GitHub Issue: #<ISSUE_NUMBER> ------------------------------------------------------------------- ### Summary <!-- Insert your summary of changes below. Minimum 10 characters required. --> This pull request improves Unicode string handling in the SQL type mapping logic and adds comprehensive tests for round-tripping emoji and special characters in the database. The main focus is on ensuring that Unicode strings, including those with emojis and special characters, are correctly mapped and stored in SQL Server. **Unicode string mapping improvements:** * Updated the `_map_sql_type` method in `cursor.py` to use the UTF-16 code unit length for Unicode string parameters instead of the Python string length, ensuring correct sizing for both long and short Unicode strings. [[1]](diffhunk://#diff-deceea46ae01082ce8400e14fa02f4b7585afb7b5ed9885338b66494f5f38280R326-R330) [[2]](diffhunk://#diff-deceea46ae01082ce8400e14fa02f4b7585afb7b5ed9885338b66494f5f38280R340-R344) **Testing enhancements:** * Added a new test `test_emoji_round_trip` in `test_004_cursor.py` to verify that various emoji, accented, and non-Latin characters can be inserted and retrieved accurately from the database, improving coverage for Unicode edge cases. 
<!-- ### PR Title Guide > For feature requests FEAT: (short-description) > For non-feature requests like test case updates, config updates, dependency updates, etc. CHORE: (short-description) > For Fix requests FIX: (short-description) > For doc update requests DOC: (short-description) > For Formatting, indentation, or styling update STYLE: (short-description) > For Refactor, without any feature changes REFACTOR: (short-description) > For release-related changes, without any feature changes RELEASE: #<RELEASE_VERSION> (short-description) ### Contribution Guidelines External contributors: - Create a GitHub issue first: https://github.com/microsoft/mssql-python/issues/new - Link the GitHub issue in the "GitHub Issue" section above - Follow the PR title format and provide a meaningful summary mssql-python maintainers: - Create an ADO Work Item following internal processes - Link the ADO Work Item in the "ADO Work Item" section above - Follow the PR title format and provide a meaningful summary -->
1 parent 12245e8 commit 8968a5c

File tree

4 files changed

+156
-28
lines changed

4 files changed

+156
-28
lines changed

mssql_python/cursor.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,10 +332,11 @@ def _map_sql_type(self, param, parameters_list, i):
332332
# TODO: revisit
333333
if len(param) > 4000: # Long strings
334334
if is_unicode:
335+
utf16_len = len(param.encode("utf-16-le")) // 2
335336
return (
336337
ddbc_sql_const.SQL_WLONGVARCHAR.value,
337338
ddbc_sql_const.SQL_C_WCHAR.value,
338-
len(param),
339+
utf16_len,
339340
0,
340341
)
341342
return (
@@ -345,10 +346,11 @@ def _map_sql_type(self, param, parameters_list, i):
345346
0,
346347
)
347348
if is_unicode: # Short Unicode strings
349+
utf16_len = len(param.encode("utf-16-le")) // 2
348350
return (
349351
ddbc_sql_const.SQL_WVARCHAR.value,
350352
ddbc_sql_const.SQL_C_WCHAR.value,
351-
len(param),
353+
utf16_len,
352354
0,
353355
)
354356
return (

mssql_python/pybind/ddbc_bindings.cpp

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -275,15 +275,19 @@ SQLRETURN BindParameters(SQLHANDLE hStmt, const py::list& params,
275275
AllocateParamBuffer<std::vector<SQLWCHAR>>(paramBuffers);
276276

277277
// Reserve space and convert from wstring to SQLWCHAR array
278-
sqlwcharBuffer->resize(strParam->size() + 1, 0); // +1 for null terminator
279-
280-
// Convert each wchar_t (4 bytes on macOS) to SQLWCHAR (2 bytes)
281-
for (size_t i = 0; i < strParam->size(); i++) {
282-
(*sqlwcharBuffer)[i] = static_cast<SQLWCHAR>((*strParam)[i]);
278+
std::vector<SQLWCHAR> utf16 = WStringToSQLWCHAR(*strParam);
279+
if (utf16.size() < strParam->size()) {
280+
LOG("Warning: UTF-16 encoding shrank string? input={} output={}",
281+
strParam->size(), utf16.size());
282+
}
283+
if (utf16.size() > strParam->size() * 2 + 1) {
284+
LOG("Warning: UTF-16 expansion unusually large: input={} output={}",
285+
strParam->size(), utf16.size());
283286
}
287+
*sqlwcharBuffer = std::move(utf16);
284288
// Use the SQLWCHAR buffer instead of the wstring directly
285289
dataPtr = sqlwcharBuffer->data();
286-
bufferLength = (strParam->size() + 1) * sizeof(SQLWCHAR);
290+
bufferLength = sqlwcharBuffer->size() * sizeof(SQLWCHAR);
287291
LOG("macOS: Created SQLWCHAR buffer for parameter with size: {} bytes", bufferLength);
288292
#else
289293
// On Windows, wchar_t and SQLWCHAR are the same size, so direct cast works
@@ -1705,7 +1709,16 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
17051709
if (numCharsInData < dataBuffer.size()) {
17061710
// SQLGetData will null-terminate the data
17071711
#if defined(__APPLE__) || defined(__linux__)
1708-
row.append(SQLWCHARToWString(dataBuffer.data(), SQL_NTS));
1712+
auto raw_bytes = reinterpret_cast<const char*>(dataBuffer.data());
1713+
size_t actualBufferSize = dataBuffer.size() * sizeof(SQLWCHAR);
1714+
if (dataLen < 0 || static_cast<size_t>(dataLen) > actualBufferSize) {
1715+
LOG("Error: py::bytes creation request exceeds buffer size. dataLen={} buffer={}",
1716+
dataLen, actualBufferSize);
1717+
ThrowStdException("Invalid buffer length for py::bytes");
1718+
}
1719+
py::bytes py_bytes(raw_bytes, dataLen);
1720+
py::str decoded = py_bytes.attr("decode")("utf-16-le");
1721+
row.append(decoded);
17091722
#else
17101723
row.append(std::wstring(dataBuffer.data()));
17111724
#endif

mssql_python/pybind/ddbc_bindings.h

Lines changed: 92 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -33,33 +33,107 @@ using namespace pybind11::literals;
3333
#include <sqlext.h>
3434

3535
#if defined(__APPLE__) || defined(__linux__)
36-
// macOS-specific headers
37-
#include <dlfcn.h>
36+
#include <dlfcn.h>
37+
38+
// Unicode constants for surrogate ranges and max scalar value
39+
constexpr uint32_t UNICODE_SURROGATE_HIGH_START = 0xD800;
40+
constexpr uint32_t UNICODE_SURROGATE_HIGH_END = 0xDBFF;
41+
constexpr uint32_t UNICODE_SURROGATE_LOW_START = 0xDC00;
42+
constexpr uint32_t UNICODE_SURROGATE_LOW_END = 0xDFFF;
43+
constexpr uint32_t UNICODE_MAX_CODEPOINT = 0x10FFFF;
44+
constexpr uint32_t UNICODE_REPLACEMENT_CHAR = 0xFFFD;
45+
46+
// Validate whether a code point is a legal Unicode scalar value
47+
// (excludes surrogate halves and values beyond U+10FFFF)
48+
inline bool IsValidUnicodeScalar(uint32_t cp) {
49+
return cp <= UNICODE_MAX_CODEPOINT &&
50+
!(cp >= UNICODE_SURROGATE_HIGH_START && cp <= UNICODE_SURROGATE_LOW_END);
51+
}
3852

39-
inline std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS) {
40-
if (!sqlwStr) return std::wstring();
53+
inline std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS) {
54+
if (!sqlwStr) return std::wstring();
4155

42-
if (length == SQL_NTS) {
43-
size_t i = 0;
44-
while (sqlwStr[i] != 0) ++i;
45-
length = i;
46-
}
56+
if (length == SQL_NTS) {
57+
size_t i = 0;
58+
while (sqlwStr[i] != 0) ++i;
59+
length = i;
60+
}
61+
std::wstring result;
62+
result.reserve(length);
4763

48-
std::wstring result;
49-
result.reserve(length);
64+
if constexpr (sizeof(SQLWCHAR) == 2) {
65+
// Decode UTF-16 to UTF-32 (with surrogate pair handling)
66+
for (size_t i = 0; i < length; ++i) {
67+
uint16_t wc = static_cast<uint16_t>(sqlwStr[i]);
68+
// Check if this is a high surrogate (U+D800–U+DBFF)
69+
if (wc >= UNICODE_SURROGATE_HIGH_START && wc <= UNICODE_SURROGATE_HIGH_END && i + 1 < length) {
70+
uint16_t low = static_cast<uint16_t>(sqlwStr[i + 1]);
71+
// Check if the next code unit is a low surrogate (U+DC00–U+DFFF)
72+
if (low >= UNICODE_SURROGATE_LOW_START && low <= UNICODE_SURROGATE_LOW_END) {
73+
// Combine surrogate pair into a single code point
74+
uint32_t cp = (((wc - UNICODE_SURROGATE_HIGH_START) << 10) | (low - UNICODE_SURROGATE_LOW_START)) + 0x10000;
75+
result.push_back(static_cast<wchar_t>(cp));
76+
++i; // Skip the low surrogate
77+
continue;
78+
}
79+
}
80+
// If valid scalar then append, else append replacement char (U+FFFD)
81+
if (IsValidUnicodeScalar(wc)) {
82+
result.push_back(static_cast<wchar_t>(wc));
83+
} else {
84+
result.push_back(static_cast<wchar_t>(UNICODE_REPLACEMENT_CHAR));
85+
}
86+
}
87+
} else {
88+
// SQLWCHAR is UTF-32, so just copy with validation
5089
for (size_t i = 0; i < length; ++i) {
51-
result.push_back(static_cast<wchar_t>(sqlwStr[i]));
90+
uint32_t cp = static_cast<uint32_t>(sqlwStr[i]);
91+
if (IsValidUnicodeScalar(cp)) {
92+
result.push_back(static_cast<wchar_t>(cp));
93+
} else {
94+
result.push_back(static_cast<wchar_t>(UNICODE_REPLACEMENT_CHAR));
95+
}
5296
}
53-
return result;
5497
}
98+
return result;
99+
}
55100

56-
inline std::vector<SQLWCHAR> WStringToSQLWCHAR(const std::wstring& str) {
57-
std::vector<SQLWCHAR> result(str.size() + 1, 0); // +1 for null terminator
58-
for (size_t i = 0; i < str.size(); ++i) {
59-
result[i] = static_cast<SQLWCHAR>(str[i]);
101+
inline std::vector<SQLWCHAR> WStringToSQLWCHAR(const std::wstring& str) {
102+
std::vector<SQLWCHAR> result;
103+
result.reserve(str.size() + 2);
104+
if constexpr (sizeof(SQLWCHAR) == 2) {
105+
// Encode UTF-32 to UTF-16
106+
for (wchar_t wc : str) {
107+
uint32_t cp = static_cast<uint32_t>(wc);
108+
if (!IsValidUnicodeScalar(cp)) {
109+
cp = UNICODE_REPLACEMENT_CHAR;
110+
}
111+
if (cp <= 0xFFFF) {
112+
// Fits in a single UTF-16 code unit
113+
result.push_back(static_cast<SQLWCHAR>(cp));
114+
} else {
115+
// Encode as surrogate pair
116+
cp -= 0x10000;
117+
SQLWCHAR high = static_cast<SQLWCHAR>((cp >> 10) + UNICODE_SURROGATE_HIGH_START);
118+
SQLWCHAR low = static_cast<SQLWCHAR>((cp & 0x3FF) + UNICODE_SURROGATE_LOW_START);
119+
result.push_back(high);
120+
result.push_back(low);
121+
}
122+
}
123+
} else {
124+
// Encode UTF-32 directly
125+
for (wchar_t wc : str) {
126+
uint32_t cp = static_cast<uint32_t>(wc);
127+
if (IsValidUnicodeScalar(cp)) {
128+
result.push_back(static_cast<SQLWCHAR>(cp));
129+
} else {
130+
result.push_back(static_cast<SQLWCHAR>(UNICODE_REPLACEMENT_CHAR));
131+
}
60132
}
61-
return result;
62133
}
134+
result.push_back(0); // null terminator
135+
return result;
136+
}
63137
#endif
64138

65139
#if defined(__APPLE__) || defined(__linux__)

tests/test_004_cursor.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5086,6 +5086,45 @@ def test_tables_cleanup(cursor, db_connection):
50865086
except Exception as e:
50875087
pytest.fail(f"Test cleanup failed: {e}")
50885088

5089+
def test_emoji_round_trip(cursor, db_connection):
5090+
"""Test round-trip of emoji and special characters"""
5091+
test_inputs = [
5092+
"Hello 😄",
5093+
"Flags 🇮🇳🇺🇸",
5094+
"Family 👨‍👩‍👧‍👦",
5095+
"Skin tone 👍🏽",
5096+
"Brain 🧠",
5097+
"Ice 🧊",
5098+
"Melting face 🫠",
5099+
"Accented éüñç",
5100+
"Chinese: 中文",
5101+
"Japanese: 日本語",
5102+
"Hello 🚀 World",
5103+
"admin🔒user",
5104+
"1🚀' OR '1'='1",
5105+
]
5106+
5107+
cursor.execute("""
5108+
CREATE TABLE #pytest_emoji_test (
5109+
id INT IDENTITY PRIMARY KEY,
5110+
content NVARCHAR(MAX)
5111+
);
5112+
""")
5113+
db_connection.commit()
5114+
5115+
for text in test_inputs:
5116+
try:
5117+
cursor.execute("INSERT INTO #pytest_emoji_test (content) OUTPUT INSERTED.id VALUES (?)", [text])
5118+
inserted_id = cursor.fetchone()[0]
5119+
cursor.execute("SELECT content FROM #pytest_emoji_test WHERE id = ?", [inserted_id])
5120+
result = cursor.fetchone()
5121+
assert result is not None, f"No row returned for ID {inserted_id}"
5122+
assert result[0] == text, f"Mismatch! Sent: {text}, Got: {result[0]}"
5123+
5124+
except Exception as e:
5125+
pytest.fail(f"Error for input {repr(text)}: {e}")
5126+
5127+
50895128
def test_close(db_connection):
50905129
"""Test closing the cursor"""
50915130
try:
@@ -5095,4 +5134,4 @@ def test_close(db_connection):
50955134
except Exception as e:
50965135
pytest.fail(f"Cursor close test failed: {e}")
50975136
finally:
5098-
cursor = db_connection.cursor()
5137+
cursor = db_connection.cursor()

0 commit comments

Comments
 (0)