@@ -18,7 +18,6 @@
*/
package org.apache.parquet.avro;

import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;
@@ -27,6 +26,7 @@
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -369,7 +369,7 @@ public void testAll() throws Exception {
.set("mylong", 2L)
.set("myfloat", 3.1f)
.set("mydouble", 4.1)
.set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
.set("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
.set("mystring", "hello")
.set("mynestedrecord", nestedRecord)
.set("myenum", "a")
@@ -398,7 +398,7 @@ public void testAll() throws Exception {
assertEquals(2L, nextRecord.get("mylong"));
assertEquals(3.1f, nextRecord.get("myfloat"));
assertEquals(4.1, nextRecord.get("mydouble"));
assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
assertEquals(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)), nextRecord.get("mybytes"));
assertEquals(str("hello"), nextRecord.get("mystring"));
assertEquals(expectedEnumSymbol, nextRecord.get("myenum"));
assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
@@ -567,7 +567,7 @@ public void write(Map<String, Object> record) {
record.put("mylong", 2L);
record.put("myfloat", 3.1f);
record.put("mydouble", 4.1);
record.put("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)));
record.put("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)));
record.put("mystring", "hello");
record.put("myenum", "a");
record.put("mynestedint", 1);
@@ -615,7 +615,7 @@ public void write(Map<String, Object> record) {
assertEquals(2L, nextRecord.get("mylong"));
assertEquals(3.1f, nextRecord.get("myfloat"));
assertEquals(4.1, nextRecord.get("mydouble"));
assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
assertEquals(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)), nextRecord.get("mybytes"));
assertEquals(str("hello"), nextRecord.get("mystring"));
assertEquals(str("a"), nextRecord.get("myenum")); // enum symbols are unknown
assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
@@ -18,12 +18,12 @@
*/
package org.apache.parquet.avro;

import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;
import java.io.File;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -247,7 +247,7 @@ public void testAll() throws Exception {
.set("mylong", 2L)
.set("myfloat", 3.1f)
.set("mydouble", 4.1)
.set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
.set("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
.set("mystring", "hello")
.set("mynestedrecord", nestedRecord)
.set("myenum", "a")
@@ -276,7 +276,7 @@ public void testAll() throws Exception {
assertEquals(2L, nextRecord.get("mylong"));
assertEquals(3.1f, nextRecord.get("myfloat"));
assertEquals(4.1, nextRecord.get("mydouble"));
assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
assertEquals(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)), nextRecord.get("mybytes"));
assertEquals(str("hello"), nextRecord.get("mystring"));
assertEquals(expectedEnumSymbol, nextRecord.get("myenum"));
assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
@@ -327,7 +327,7 @@ public void testArrayWithNullValues() throws Exception {
.set("mylong", 2L)
.set("myfloat", 3.1f)
.set("mydouble", 4.1)
.set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
.set("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
.set("mystring", "hello")
.set("mynestedrecord", nestedRecord)
.set("myenum", "a")
@@ -512,7 +512,7 @@ public void write(Map<String, Object> record) {
record.put("mylong", 2L);
record.put("myfloat", 3.1f);
record.put("mydouble", 4.1);
record.put("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)));
record.put("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)));
record.put("mystring", "hello");
record.put("myenum", "a");
record.put("mynestedint", 1);
@@ -573,7 +573,7 @@ public void write(Map<String, Object> record) {
assertEquals(2L, nextRecord.get("mylong"));
assertEquals(3.1f, nextRecord.get("myfloat"));
assertEquals(4.1, nextRecord.get("mydouble"));
assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
assertEquals(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)), nextRecord.get("mybytes"));
assertEquals(str("hello"), nextRecord.get("mystring"));
assertEquals(str("a"), nextRecord.get("myenum"));
assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
@@ -20,7 +20,6 @@
package org.apache.parquet.cli;

import com.beust.jcommander.internal.Lists;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.io.CharStreams;
import com.google.common.io.Resources;
@@ -52,17 +51,14 @@
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.security.AccessController;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

public abstract class BaseCommand implements Command, Configurable {

@VisibleForTesting
static final Charset UTF8 = Charset.forName("utf8");

private static final String RESOURCE_URI_SCHEME = "resource";
private static final String STDIN_AS_SOURCE = "stdin";

@@ -103,7 +99,7 @@ public void output(String content, Logger console, String filename)
} else {
FSDataOutputStream outgoing = create(filename);
try {
outgoing.write(content.getBytes(UTF8));
outgoing.write(content.getBytes(StandardCharsets.UTF_8));
} finally {
outgoing.close();
}
30 changes: 9 additions & 21 deletions parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java
@@ -23,20 +23,16 @@
import java.io.ObjectStreamException;
import java.io.OutputStream;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.schema.PrimitiveComparator;

import static org.apache.parquet.bytes.BytesUtils.UTF8;

abstract public class Binary implements Comparable<Binary>, Serializable {

protected boolean isBackingBytesReused;
@@ -133,11 +129,10 @@ public ByteArraySliceBackedBinary(byte[] value, int offset, int length, boolean

@Override
public String toStringUsingUTF8() {
return UTF8.decode(ByteBuffer.wrap(value, offset, length)).toString();
// TODO: figure out why the following line was much slower
// rdb: new String(...) is slower because it instantiates a new Decoder,
// while Charset#decode uses a thread-local decoder cache
// return new String(value, offset, length, BytesUtils.UTF8);
// Charset#decode uses a thread-local decoder cache and is faster than
// new String(...) which instantiates a new Decoder per invocation
return StandardCharsets.UTF_8
.decode(ByteBuffer.wrap(value, offset, length)).toString();
}

@Override
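Note: the new comment above compares two UTF-8 decoding paths. A small, self-contained sketch of that comparison, illustrative only and not part of this patch (the class name DecodeComparison is made up):

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

public class DecodeComparison {
  public static void main(String[] args) {
    byte[] value = "hello world".getBytes(StandardCharsets.UTF_8);
    int offset = 0;
    int length = value.length;

    // Path kept by the patch: Charset#decode, which reuses a thread-local cached decoder.
    String viaCharset = StandardCharsets.UTF_8
        .decode(ByteBuffer.wrap(value, offset, length)).toString();

    // Alternative named in the comment: new String(...), which sets up a decoder per call
    // and is the variant the original TODO found slower.
    String viaConstructor = new String(value, offset, length, StandardCharsets.UTF_8);

    System.out.println(viaCharset.equals(viaConstructor)); // true: both produce the same text
  }
}

Both paths return identical strings; the choice discussed in the comment is purely about per-call decoder allocation.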
@@ -220,11 +215,7 @@ public String toString() {
}

private static ByteBuffer encodeUTF8(String value) {
try {
return ByteBuffer.wrap(value.getBytes("UTF-8"));
} catch (UnsupportedEncodingException e) {
throw new ParquetEncodingException("UTF-8 not supported.", e);
}
return ByteBuffer.wrap(value.getBytes(StandardCharsets.UTF_8));
}
}

@@ -284,7 +275,7 @@ public ByteArrayBackedBinary(byte[] value, boolean isBackingBytesReused) {

@Override
public String toStringUsingUTF8() {
return UTF8.decode(ByteBuffer.wrap(value)).toString();
return StandardCharsets.UTF_8.decode(ByteBuffer.wrap(value)).toString();
}

@Override
@@ -393,11 +384,8 @@ public ByteBufferBackedBinary(ByteBuffer value, int offset, int length, boolean
public String toStringUsingUTF8() {
String ret;
if (value.hasArray()) {
try {
ret = new String(value.array(), value.arrayOffset() + offset, length, "UTF-8");
} catch (UnsupportedEncodingException e) {
throw new ParquetDecodingException("UTF-8 not supported");
}
ret = new String(value.array(), value.arrayOffset() + offset, length,
StandardCharsets.UTF_8);
} else {
int limit = value.limit();
value.limit(offset+length);
@@ -406,7 +394,7 @@ public String toStringUsingUTF8() {
// no corresponding interface to read a subset of a buffer, would have to slice it
// which creates another ByteBuffer object or do what is done here to adjust the
// limit/offset and set them back after
ret = UTF8.decode(value).toString();
ret = StandardCharsets.UTF_8.decode(value).toString();
value.limit(limit);
value.position(position);
}
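Note: the comment above explains why this branch adjusts limit/position in place instead of slicing the buffer. A rough sketch of the two options for decoding a sub-range of a non-array-backed ByteBuffer, illustrative only (the class and method names are made up):

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

public class SubRangeDecode {

  // Option the comment rules out: duplicate()/slice() allocates an extra ByteBuffer view per call.
  static String decodeViaView(ByteBuffer value, int offset, int length) {
    ByteBuffer view = value.duplicate(); // new object on every invocation
    view.position(offset);
    view.limit(offset + length);
    return StandardCharsets.UTF_8.decode(view).toString();
  }

  // Option used in Binary: temporarily move limit/position, decode, then restore both.
  static String decodeInPlace(ByteBuffer value, int offset, int length) {
    int limit = value.limit();
    int position = value.position();
    value.limit(offset + length);
    value.position(offset);
    String ret = StandardCharsets.UTF_8.decode(value).toString();
    value.limit(limit);
    value.position(position);
    return ret;
  }
}

The in-place variant avoids the extra allocation at the cost of mutating and then restoring the caller's buffer state, which is the trade-off the comment describes.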
@@ -27,8 +27,8 @@
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

import org.apache.parquet.bytes.ByteBufferInputStream;
import org.junit.Assert;
@@ -627,9 +627,8 @@ private void writeRepeated(int COUNT, ValuesWriter cw, String prefix) {
}
}

private void writeRepeatedWithReuse(int COUNT, ValuesWriter cw,
String prefix) throws UnsupportedEncodingException {
Binary reused = Binary.fromReusedByteArray((prefix + "0").getBytes("UTF-8"));
private void writeRepeatedWithReuse(int COUNT, ValuesWriter cw, String prefix) {
Binary reused = Binary.fromReusedByteArray((prefix + "0").getBytes(StandardCharsets.UTF_8));
for (int i = 0; i < COUNT; i++) {
Binary content = Binary.fromString(prefix + i % 10);
System.arraycopy(content.getBytesUnsafe(), 0, reused.getBytesUnsafe(), 0, reused.length());
@@ -24,6 +24,7 @@
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -34,6 +35,8 @@
public class BytesUtils {
private static final Logger LOG = LoggerFactory.getLogger(BytesUtils.class);

/** @deprecated Use {@link StandardCharsets#UTF_8} instead */
@Deprecated
public static final Charset UTF8 = Charset.forName("UTF-8");

/**
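Note: with the BytesUtils constant above deprecated, call sites are expected to switch to the JDK constant directly. A minimal before/after sketch, illustrative and not part of the patch:

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.parquet.bytes.BytesUtils;

public class Utf8Migration {
  public static void main(String[] args) {
    byte[] before = "hello".getBytes(BytesUtils.UTF8);        // deprecated constant
    byte[] after = "hello".getBytes(StandardCharsets.UTF_8);  // preferred replacement
    System.out.println(Arrays.equals(before, after));         // true: same charset either way
  }
}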
@@ -24,7 +24,7 @@
import static org.apache.parquet.hadoop.ParquetWriter.MAX_PADDING_SIZE_DEFAULT;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -101,7 +101,7 @@ public class ParquetFileWriter {

public static final String PARQUET_METADATA_FILE = "_metadata";
public static final String MAGIC_STR = "PAR1";
public static final byte[] MAGIC = MAGIC_STR.getBytes(Charset.forName("ASCII"));
public static final byte[] MAGIC = MAGIC_STR.getBytes(StandardCharsets.US_ASCII);
public static final String PARQUET_COMMON_METADATA_FILE = "_common_metadata";
public static final int CURRENT_VERSION = 1;

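Note: the MAGIC change above swaps Charset.forName("ASCII") for StandardCharsets.US_ASCII. "ASCII" is a registered alias of US-ASCII, so the written magic bytes are unchanged; a quick check, illustrative only (the class name MagicCheck is made up):

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class MagicCheck {
  public static void main(String[] args) {
    byte[] viaAlias = "PAR1".getBytes(Charset.forName("ASCII"));     // old form
    byte[] viaStandard = "PAR1".getBytes(StandardCharsets.US_ASCII); // new form
    System.out.println(Arrays.equals(viaAlias, viaStandard));        // true
    System.out.println(Arrays.toString(viaStandard));                // [80, 65, 82, 49]
  }
}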