FEAT: Support Emojis for data types (#166)

gargsaumya · web-flow · commit 8968a5c90bee · 2025-08-28T21:43:40.000+05:30
### Work Item / Issue Reference

> [AB#37664](https://sqlclientdrivers.visualstudio.com/c6d89619-62de-46a0-8b46-70b92a84d85e/_workitems/edit/37664)

> GitHub Issue: #<ISSUE_NUMBER>

-------------------------------------------------------------------

### Summary

This pull request improves Unicode string handling in the SQL type mapping logic and adds comprehensive tests for round-tripping emoji and special characters in the database. The main focus is on ensuring that Unicode strings, including those with emojis and special characters, are correctly mapped and stored in SQL Server.

**Unicode string mapping improvements:**

* Updated the `_map_sql_type` method in `cursor.py` to use the UTF-16 code unit length for Unicode string parameters instead of the Python string length, ensuring correct sizing for both long and short Unicode strings. [[1]](diffhunk://#diff-deceea46ae01082ce8400e14fa02f4b7585afb7b5ed9885338b66494f5f38280R326-R330) [[2]](diffhunk://#diff-deceea46ae01082ce8400e14fa02f4b7585afb7b5ed9885338b66494f5f38280R340-R344)

**Testing enhancements:**

* Added a new test `test_emoji_round_trip` in `test_004_cursor.py` to verify that various emoji, accented, and non-Latin characters can be inserted and retrieved accurately from the database, improving coverage for Unicode edge cases.
diff --git a/mssql_python/cursor.py b/mssql_python/cursor.py
@@ -332,10 +332,11 @@ def _map_sql_type(self, param, parameters_list, i):
             # TODO: revisit
             if len(param) > 4000:  # Long strings
                 if is_unicode:
+                    utf16_len = len(param.encode("utf-16-le")) // 2
                     return (
                         ddbc_sql_const.SQL_WLONGVARCHAR.value,
                         ddbc_sql_const.SQL_C_WCHAR.value,
-                        len(param),
+                        utf16_len,
                         0,
                     )
                 return (
@@ -345,10 +346,11 @@ def _map_sql_type(self, param, parameters_list, i):
                     0,
                 )
             if is_unicode:  # Short Unicode strings
+                utf16_len = len(param.encode("utf-16-le")) // 2
                 return (
                     ddbc_sql_const.SQL_WVARCHAR.value,
                     ddbc_sql_const.SQL_C_WCHAR.value,
-                    len(param),
+                    utf16_len,
                     0,
                 )
             return (
diff --git a/mssql_python/pybind/ddbc_bindings.cpp b/mssql_python/pybind/ddbc_bindings.cpp
@@ -275,15 +275,19 @@ SQLRETURN BindParameters(SQLHANDLE hStmt, const py::list& params,
                     AllocateParamBuffer<std::vector<SQLWCHAR>>(paramBuffers);
 
                 // Reserve space and convert from wstring to SQLWCHAR array
-                sqlwcharBuffer->resize(strParam->size() + 1, 0); // +1 for null terminator
-
-                // Convert each wchar_t (4 bytes on macOS) to SQLWCHAR (2 bytes)
-                for (size_t i = 0; i < strParam->size(); i++) {
-                    (*sqlwcharBuffer)[i] = static_cast<SQLWCHAR>((*strParam)[i]);
+                std::vector<SQLWCHAR> utf16 = WStringToSQLWCHAR(*strParam);
+                if (utf16.size() < strParam->size()) {
+                    LOG("Warning: UTF-16 encoding shrank string? input={} output={}",
+                        strParam->size(), utf16.size());
+                }
+                if (utf16.size() > strParam->size() * 2 + 1) {
+                    LOG("Warning: UTF-16 expansion unusually large: input={} output={}",
+                        strParam->size(), utf16.size());
                 }
+                *sqlwcharBuffer = std::move(utf16);
                 // Use the SQLWCHAR buffer instead of the wstring directly
                 dataPtr = sqlwcharBuffer->data();
-                bufferLength = (strParam->size() + 1) * sizeof(SQLWCHAR);
+                bufferLength = sqlwcharBuffer->size() * sizeof(SQLWCHAR);
                 LOG("macOS: Created SQLWCHAR buffer for parameter with size: {} bytes", bufferLength);
 #else
                 // On Windows, wchar_t and SQLWCHAR are the same size, so direct cast works
@@ -1705,7 +1709,16 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
 						if (numCharsInData < dataBuffer.size()) {
                             // SQLGetData will null-terminate the data
 #if defined(__APPLE__) || defined(__linux__)
-                            row.append(SQLWCHARToWString(dataBuffer.data(), SQL_NTS));
+                            auto raw_bytes = reinterpret_cast<const char*>(dataBuffer.data());
+                            size_t actualBufferSize = dataBuffer.size() * sizeof(SQLWCHAR);
+                            if (dataLen < 0 || static_cast<size_t>(dataLen) > actualBufferSize) {
+                                LOG("Error: py::bytes creation request exceeds buffer size. dataLen={} buffer={}",
+                                    dataLen, actualBufferSize);
+                                ThrowStdException("Invalid buffer length for py::bytes");
+                            }
+                            py::bytes py_bytes(raw_bytes, dataLen);
+                            py::str decoded = py_bytes.attr("decode")("utf-16-le");
+                            row.append(decoded);
 #else
                             row.append(std::wstring(dataBuffer.data()));
 #endif
diff --git a/mssql_python/pybind/ddbc_bindings.h b/mssql_python/pybind/ddbc_bindings.h
@@ -33,33 +33,107 @@ using namespace pybind11::literals;
 #include <sqlext.h>
 
 #if defined(__APPLE__) || defined(__linux__)
-    // macOS-specific headers
-    #include <dlfcn.h>
+#include <dlfcn.h>
+
+// Unicode constants for surrogate ranges and max scalar value
+constexpr uint32_t UNICODE_SURROGATE_HIGH_START = 0xD800;
+constexpr uint32_t UNICODE_SURROGATE_HIGH_END   = 0xDBFF;
+constexpr uint32_t UNICODE_SURROGATE_LOW_START  = 0xDC00;
+constexpr uint32_t UNICODE_SURROGATE_LOW_END    = 0xDFFF;
+constexpr uint32_t UNICODE_MAX_CODEPOINT        = 0x10FFFF;
+constexpr uint32_t UNICODE_REPLACEMENT_CHAR     = 0xFFFD;
+
+// Validate whether a code point is a legal Unicode scalar value
+// (excludes surrogate halves and values beyond U+10FFFF)
+inline bool IsValidUnicodeScalar(uint32_t cp) {
+    return cp <= UNICODE_MAX_CODEPOINT &&
+           !(cp >= UNICODE_SURROGATE_HIGH_START && cp <= UNICODE_SURROGATE_LOW_END);
+}
 
-    inline std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS) {
-        if (!sqlwStr) return std::wstring();
+inline std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS) {
+    if (!sqlwStr) return std::wstring();
 
-        if (length == SQL_NTS) {
-            size_t i = 0;
-            while (sqlwStr[i] != 0) ++i;
-            length = i;
-        }
+    if (length == SQL_NTS) {
+        size_t i = 0;
+        while (sqlwStr[i] != 0) ++i;
+        length = i;
+    }
+    std::wstring result;
+    result.reserve(length);
 
-        std::wstring result;
-        result.reserve(length);
+    if constexpr (sizeof(SQLWCHAR) == 2) {
+        // Decode UTF-16 to UTF-32 (with surrogate pair handling)
+        for (size_t i = 0; i < length; ++i) {
+            uint16_t wc = static_cast<uint16_t>(sqlwStr[i]);
+            // Check if this is a high surrogate (U+D800–U+DBFF)
+            if (wc >= UNICODE_SURROGATE_HIGH_START && wc <= UNICODE_SURROGATE_HIGH_END && i + 1 < length) {
+                uint16_t low = static_cast<uint16_t>(sqlwStr[i + 1]);
+                // Check if the next code unit is a low surrogate (U+DC00–U+DFFF)
+                if (low >= UNICODE_SURROGATE_LOW_START && low <= UNICODE_SURROGATE_LOW_END) {
+                    // Combine surrogate pair into a single code point
+                    uint32_t cp = (((wc - UNICODE_SURROGATE_HIGH_START) << 10) | (low - UNICODE_SURROGATE_LOW_START)) + 0x10000;
+                    result.push_back(static_cast<wchar_t>(cp));
+                    ++i; // Skip the low surrogate
+                    continue;
+                }
+            }
+            // If valid scalar then append, else append replacement char (U+FFFD)
+            if (IsValidUnicodeScalar(wc)) {
+                result.push_back(static_cast<wchar_t>(wc));
+            } else {
+                result.push_back(static_cast<wchar_t>(UNICODE_REPLACEMENT_CHAR));
+            }
+        }
+    } else {
+        // SQLWCHAR is UTF-32, so just copy with validation
         for (size_t i = 0; i < length; ++i) {
-            result.push_back(static_cast<wchar_t>(sqlwStr[i]));
+            uint32_t cp = static_cast<uint32_t>(sqlwStr[i]);
+            if (IsValidUnicodeScalar(cp)) {
+                result.push_back(static_cast<wchar_t>(cp));
+            } else {
+                result.push_back(static_cast<wchar_t>(UNICODE_REPLACEMENT_CHAR));
+            }
         }
-        return result;
     }
+    return result;
+}
 
-    inline std::vector<SQLWCHAR> WStringToSQLWCHAR(const std::wstring& str) {
-        std::vector<SQLWCHAR> result(str.size() + 1, 0);  // +1 for null terminator
-        for (size_t i = 0; i < str.size(); ++i) {
-            result[i] = static_cast<SQLWCHAR>(str[i]);
+inline std::vector<SQLWCHAR> WStringToSQLWCHAR(const std::wstring& str) {
+    std::vector<SQLWCHAR> result;
+    result.reserve(str.size() + 2);
+    if constexpr (sizeof(SQLWCHAR) == 2) {
+        // Encode UTF-32 to UTF-16
+        for (wchar_t wc : str) {
+            uint32_t cp = static_cast<uint32_t>(wc);
+            if (!IsValidUnicodeScalar(cp)) {
+                cp = UNICODE_REPLACEMENT_CHAR;
+            }
+            if (cp <= 0xFFFF) {
+                // Fits in a single UTF-16 code unit
+                result.push_back(static_cast<SQLWCHAR>(cp));
+            } else {
+                // Encode as surrogate pair
+                cp -= 0x10000;
+                SQLWCHAR high = static_cast<SQLWCHAR>((cp >> 10) + UNICODE_SURROGATE_HIGH_START);
+                SQLWCHAR low  = static_cast<SQLWCHAR>((cp & 0x3FF) + UNICODE_SURROGATE_LOW_START);
+                result.push_back(high);
+                result.push_back(low);
+            }
+        }
+    } else {
+        // Encode UTF-32 directly
+        for (wchar_t wc : str) {
+            uint32_t cp = static_cast<uint32_t>(wc);
+            if (IsValidUnicodeScalar(cp)) {
+                result.push_back(static_cast<SQLWCHAR>(cp));
+            } else {
+                result.push_back(static_cast<SQLWCHAR>(UNICODE_REPLACEMENT_CHAR));
+            }
         }
-        return result;
     }
+    result.push_back(0); // null terminator
+    return result;
+}
 #endif
 
 #if defined(__APPLE__) || defined(__linux__)
diff --git a/tests/test_004_cursor.py b/tests/test_004_cursor.py
@@ -5086,6 +5086,45 @@ def test_tables_cleanup(cursor, db_connection):
     except Exception as e:
         pytest.fail(f"Test cleanup failed: {e}")
 
+def test_emoji_round_trip(cursor, db_connection):
+    """Test round-trip of emoji and special characters"""
+    test_inputs = [
+        "Hello 😄",
+        "Flags 🇮🇳🇺🇸",
+        "Family 👨‍👩‍👧‍👦",
+        "Skin tone 👍🏽",
+        "Brain 🧠",
+        "Ice 🧊",
+        "Melting face 🫠",
+        "Accented éüñç",
+        "Chinese: 中文",
+        "Japanese: 日本語",
+        "Hello 🚀 World",
+        "admin🔒user",
+        "1🚀' OR '1'='1",
+    ]
+
+    cursor.execute("""
+        CREATE TABLE #pytest_emoji_test (
+            id INT IDENTITY PRIMARY KEY,
+            content NVARCHAR(MAX)
+        );
+    """)
+    db_connection.commit()
+
+    for text in test_inputs:
+        try:
+            cursor.execute("INSERT INTO #pytest_emoji_test (content) OUTPUT INSERTED.id VALUES (?)", [text])
+            inserted_id = cursor.fetchone()[0]
+            cursor.execute("SELECT content FROM #pytest_emoji_test WHERE id = ?", [inserted_id])
+            result = cursor.fetchone()
+            assert result is not None, f"No row returned for ID {inserted_id}"
+            assert result[0] == text, f"Mismatch! Sent: {text}, Got: {result[0]}"
+
+        except Exception as e:
+            pytest.fail(f"Error for input {repr(text)}: {e}")
+
+
 def test_close(db_connection):
     """Test closing the cursor"""
     try:
@@ -5095,4 +5134,4 @@ def test_close(db_connection):
     except Exception as e:
         pytest.fail(f"Cursor close test failed: {e}")
     finally:
-        cursor = db_connection.cursor()
+        cursor = db_connection.cursor()