[UNDERTOW-2655] Fix text corruption in FileUtils.readFile when reading multi-byte characters

finalchild · claude · finalchild · commit 28f3eebbcac0 · 2025-10-31T12:27:12.000+09:00
The readFile method was reading the InputStream into a fixed-size byte buffer and decoding each chunk independently. This caused multi-byte UTF-8 character sequences to be split across buffer boundaries, resulting in text corruption with replacement characters. Replaced BufferedInputStream with InputStreamReader to handle buffering and character decoding together in a streaming fashion, ensuring multi-byte character sequences are never split. This issue became more significant after UNDERTOW-2337, as large form-data field values are now processed by this function. Originally reported in Spring Framework issue #35292. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/core/src/main/java/io/undertow/util/FileUtils.java b/core/src/main/java/io/undertow/util/FileUtils.java
@@ -18,9 +18,9 @@
 
 package io.undertow.util;
 
-import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.net.URL;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
@@ -73,14 +73,14 @@ public static String readFile(InputStream file) {
      * Reads the {@link InputStream file} and converting it to {@link String} using <code>charSet</code> encoding.
      */
     public static String readFile(InputStream file, Charset charSet) {
-        try (BufferedInputStream stream = new BufferedInputStream(file)) {
-            byte[] buff = new byte[1024];
-            StringBuilder builder = new StringBuilder();
-            int read;
-            while ((read = stream.read(buff)) != -1) {
-                builder.append(new String(buff, 0, read, charSet));
+        try (InputStreamReader reader = new InputStreamReader(file, charSet)) {
+            StringBuilder result = new StringBuilder();
+            char[] cbuf = new char[8192];
+            int nread;
+            while ((nread = reader.read(cbuf, 0, cbuf.length)) != -1) {
+                result.append(cbuf, 0, nread);
             }
-            return builder.toString();
+            return result.toString();
         } catch (IOException e) {
             throw new RuntimeException(e);
         }
diff --git a/core/src/test/java/io/undertow/util/FileUtilsTestCase.java b/core/src/test/java/io/undertow/util/FileUtilsTestCase.java
@@ -0,0 +1,122 @@
+/*
+ * JBoss, Home of Professional Open Source.
+ * Copyright 2025 Red Hat, Inc., and individual contributors
+ * as indicated by the @author tags.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package io.undertow.util;
+
+import io.undertow.testutils.category.UnitTest;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * @author Park Jaeon
+ */
+@Category(UnitTest.class)
+public class FileUtilsTestCase {
+
+    @Test
+    public void testMultiByteCharactersAtBufferBoundary() {
+        StringBuilder sb = new StringBuilder();
+
+        // Create content larger than 1024 bytes (the old buffer size)
+        // Fill byte offsets 0-1022 with ASCII 'a' characters (1023 bytes in total)
+        for (int i = 0; i < 1023; i++) {
+            sb.append('a');
+        }
+
+        // Add a 3-byte UTF-8 character (Chinese character) at byte offsets 1023-1025
+        // Its first byte ends the old 1024-byte buffer; the other two fell into the next read
+        sb.append('世');  // 3-byte UTF-8 character
+
+        // Add more content to ensure we're reading beyond the first buffer
+        for (int i = 0; i < 2000; i++) {
+            sb.append('b');
+        }
+
+        // Add some more multi-byte characters
+        sb.append(" Hello 世界 Testing 🎉");
+
+        String expected = sb.toString();
+        InputStream stream = new ByteArrayInputStream(expected.getBytes(StandardCharsets.UTF_8));
+
+        String result = FileUtils.readFile(stream);
+
+        // The bug would cause a replacement character (�) to appear in place of the split character
+        Assert.assertFalse("Result should not contain replacement character (�)",
+                          result.contains("\uFFFD"));
+        Assert.assertEquals("Content should be read correctly without corruption",
+                          expected, result);
+    }
+
+    @Test
+    public void testEmojisAtBufferBoundary() {
+        StringBuilder sb = new StringBuilder();
+
+        // Fill byte offsets 0-1021 with ASCII 'x' characters (1022 bytes in total)
+        for (int i = 0; i < 1022; i++) {
+            sb.append('x');
+        }
+
+        // Add a 4-byte emoji at byte offsets 1022-1025, straddling the old 1024-byte buffer boundary
+        sb.append("🎉");  // 4-byte UTF-8 character
+
+        // Add more content
+        for (int i = 0; i < 500; i++) {
+            sb.append('y');
+        }
+
+        String expected = sb.toString();
+        InputStream stream = new ByteArrayInputStream(expected.getBytes(StandardCharsets.UTF_8));
+
+        String result = FileUtils.readFile(stream);
+
+        Assert.assertFalse("Result should not contain replacement character",
+                          result.contains("\uFFFD"));
+        Assert.assertEquals("Emoji should be preserved correctly", expected, result);
+    }
+
+    @Test
+    public void testLargeContentWithMultiByteCharacters() {
+        StringBuilder sb = new StringBuilder();
+
+        // Create content that's definitely larger than 1024 bytes and includes
+        // various multi-byte characters throughout
+        String testPattern = "Hello 世界! Testing 🎉 multi-byte encoding. ";
+
+        // Repeat pattern to create large content (each pattern is ~50 bytes)
+        for (int i = 0; i < 100; i++) {
+            sb.append(testPattern);
+            sb.append(i).append(" ");
+        }
+
+        String expected = sb.toString();
+        Assert.assertTrue("Content should be larger than 1024 bytes",
+                         expected.getBytes(StandardCharsets.UTF_8).length > 1024);
+
+        InputStream stream = new ByteArrayInputStream(expected.getBytes(StandardCharsets.UTF_8));
+        String result = FileUtils.readFile(stream);
+
+        Assert.assertEquals("Large content with multi-byte characters should be read correctly",
+                          expected, result);
+        Assert.assertFalse("No replacement characters should be present",
+                          result.contains("\uFFFD"));
+    }
+}