@@ -18,7 +18,6 @@
*/
package org.apache.parquet.avro;

import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;
@@ -27,6 +26,7 @@
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -369,7 +369,7 @@ public void testAll() throws Exception {
.set("mylong", 2L)
.set("myfloat", 3.1f)
.set("mydouble", 4.1)
.set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
.set("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
.set("mystring", "hello")
.set("mynestedrecord", nestedRecord)
.set("myenum", "a")
@@ -398,7 +398,7 @@ public void testAll() throws Exception {
assertEquals(2L, nextRecord.get("mylong"));
assertEquals(3.1f, nextRecord.get("myfloat"));
assertEquals(4.1, nextRecord.get("mydouble"));
assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
assertEquals(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)), nextRecord.get("mybytes"));
assertEquals(str("hello"), nextRecord.get("mystring"));
assertEquals(expectedEnumSymbol, nextRecord.get("myenum"));
assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
@@ -567,7 +567,7 @@ public void write(Map<String, Object> record) {
record.put("mylong", 2L);
record.put("myfloat", 3.1f);
record.put("mydouble", 4.1);
record.put("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)));
record.put("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)));
record.put("mystring", "hello");
record.put("myenum", "a");
record.put("mynestedint", 1);
@@ -615,7 +615,7 @@ public void write(Map<String, Object> record) {
assertEquals(2L, nextRecord.get("mylong"));
assertEquals(3.1f, nextRecord.get("myfloat"));
assertEquals(4.1, nextRecord.get("mydouble"));
assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
assertEquals(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)), nextRecord.get("mybytes"));
assertEquals(str("hello"), nextRecord.get("mystring"));
assertEquals(str("a"), nextRecord.get("myenum")); // enum symbols are unknown
assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
@@ -18,12 +18,12 @@
*/
package org.apache.parquet.avro;

import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;
import java.io.File;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -247,7 +247,7 @@ public void testAll() throws Exception {
.set("mylong", 2L)
.set("myfloat", 3.1f)
.set("mydouble", 4.1)
.set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
.set("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
.set("mystring", "hello")
.set("mynestedrecord", nestedRecord)
.set("myenum", "a")
@@ -276,7 +276,7 @@ public void testAll() throws Exception {
assertEquals(2L, nextRecord.get("mylong"));
assertEquals(3.1f, nextRecord.get("myfloat"));
assertEquals(4.1, nextRecord.get("mydouble"));
assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
assertEquals(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)), nextRecord.get("mybytes"));
assertEquals(str("hello"), nextRecord.get("mystring"));
assertEquals(expectedEnumSymbol, nextRecord.get("myenum"));
assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
@@ -327,7 +327,7 @@ public void testArrayWithNullValues() throws Exception {
.set("mylong", 2L)
.set("myfloat", 3.1f)
.set("mydouble", 4.1)
.set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
.set("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
.set("mystring", "hello")
.set("mynestedrecord", nestedRecord)
.set("myenum", "a")
@@ -512,7 +512,7 @@ public void write(Map<String, Object> record) {
record.put("mylong", 2L);
record.put("myfloat", 3.1f);
record.put("mydouble", 4.1);
record.put("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)));
record.put("mybytes", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)));
record.put("mystring", "hello");
record.put("myenum", "a");
record.put("mynestedint", 1);
@@ -573,7 +573,7 @@ public void write(Map<String, Object> record) {
assertEquals(2L, nextRecord.get("mylong"));
assertEquals(3.1f, nextRecord.get("myfloat"));
assertEquals(4.1, nextRecord.get("mydouble"));
assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
assertEquals(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)), nextRecord.get("mybytes"));
assertEquals(str("hello"), nextRecord.get("mystring"));
assertEquals(str("a"), nextRecord.get("myenum"));
assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
@@ -20,7 +20,6 @@
package org.apache.parquet.cli;

import com.beust.jcommander.internal.Lists;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.io.CharStreams;
import com.google.common.io.Resources;
@@ -52,17 +51,14 @@
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.security.AccessController;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

public abstract class BaseCommand implements Command, Configurable {

@VisibleForTesting
static final Charset UTF8 = Charset.forName("utf8");

private static final String RESOURCE_URI_SCHEME = "resource";
private static final String STDIN_AS_SOURCE = "stdin";

@@ -103,7 +99,7 @@ public void output(String content, Logger console, String filename)
} else {
FSDataOutputStream outgoing = create(filename);
try {
outgoing.write(content.getBytes(UTF8));
outgoing.write(content.getBytes(StandardCharsets.UTF_8));
} finally {
outgoing.close();
}
30 changes: 9 additions & 21 deletions parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java
@@ -23,20 +23,16 @@
import java.io.ObjectStreamException;
import java.io.OutputStream;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.schema.PrimitiveComparator;

import static org.apache.parquet.bytes.BytesUtils.UTF8;

abstract public class Binary implements Comparable<Binary>, Serializable {

protected boolean isBackingBytesReused;
@@ -133,11 +129,10 @@ public ByteArraySliceBackedBinary(byte[] value, int offset, int length, boolean

@Override
public String toStringUsingUTF8() {
return UTF8.decode(ByteBuffer.wrap(value, offset, length)).toString();
// TODO: figure out why the following line was much slower
// rdb: new String(...) is slower because it instantiates a new Decoder,
// while Charset#decode uses a thread-local decoder cache
// return new String(value, offset, length, BytesUtils.UTF8);
// Charset#decode uses a thread-local decoder cache and is faster than
// new String(...) which instantiates a new Decoder per invocation
return StandardCharsets.UTF_8
.decode(ByteBuffer.wrap(value, offset, length)).toString();
}

@Override
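Note: the new comment above compares two UTF-8 decoding paths. A small, self-contained sketch of that comparison, illustrative only and not part of this patch (the class name DecodeComparison is made up):

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

public class DecodeComparison {
  public static void main(String[] args) {
    byte[] value = "hello world".getBytes(StandardCharsets.UTF_8);
    int offset = 0;
    int length = value.length;

    // Path kept by the patch: Charset#decode, which reuses a thread-local cached decoder.
    String viaCharset = StandardCharsets.UTF_8
        .decode(ByteBuffer.wrap(value, offset, length)).toString();

    // Alternative named in the comment: new String(...), which sets up a decoder per call
    // and is the variant the original TODO found slower.
    String viaConstructor = new String(value, offset, length, StandardCharsets.UTF_8);

    System.out.println(viaCharset.equals(viaConstructor)); // true: both produce the same text
  }
}

Both paths return identical strings; the choice discussed in the comment is purely about per-call decoder allocation.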
@@ -220,11 +215,7 @@ public String toString() {
}

private static ByteBuffer encodeUTF8(String value) {
try {
return ByteBuffer.wrap(value.getBytes("UTF-8"));
} catch (UnsupportedEncodingException e) {
throw new ParquetEncodingException("UTF-8 not supported.", e);
}
return ByteBuffer.wrap(value.getBytes(StandardCharsets.UTF_8));
}
}

@@ -284,7 +275,7 @@ public ByteArrayBackedBinary(byte[] value, boolean isBackingBytesReused) {

@Override
public String toStringUsingUTF8() {
return UTF8.decode(ByteBuffer.wrap(value)).toString();
return StandardCharsets.UTF_8.decode(ByteBuffer.wrap(value)).toString();
}

@Override
@@ -393,11 +384,8 @@ public ByteBufferBackedBinary(ByteBuffer value, int offset, int length, boolean
public String toStringUsingUTF8() {
String ret;
if (value.hasArray()) {
try {
ret = new String(value.array(), value.arrayOffset() + offset, length, "UTF-8");
} catch (UnsupportedEncodingException e) {
throw new ParquetDecodingException("UTF-8 not supported");
}
ret = new String(value.array(), value.arrayOffset() + offset, length,
StandardCharsets.UTF_8);
} else {
int limit = value.limit();
value.limit(offset+length);
@@ -406,7 +394,7 @@ public String toStringUsingUTF8() {
// no corresponding interface to read a subset of a buffer, would have to slice it
// which creates another ByteBuffer object or do what is done here to adjust the
// limit/offset and set them back after
ret = UTF8.decode(value).toString();
ret = StandardCharsets.UTF_8.decode(value).toString();
value.limit(limit);
value.position(position);
}
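Note: the comment above explains why this branch adjusts limit/position in place instead of slicing the buffer. A rough sketch of the two options for decoding a sub-range of a non-array-backed ByteBuffer, illustrative only (the class and method names are made up):

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

public class SubRangeDecode {

  // Option the comment rules out: duplicate()/slice() allocates an extra ByteBuffer view per call.
  static String decodeViaView(ByteBuffer value, int offset, int length) {
    ByteBuffer view = value.duplicate(); // new object on every invocation
    view.position(offset);
    view.limit(offset + length);
    return StandardCharsets.UTF_8.decode(view).toString();
  }

  // Option used in Binary: temporarily move limit/position, decode, then restore both.
  static String decodeInPlace(ByteBuffer value, int offset, int length) {
    int limit = value.limit();
    int position = value.position();
    value.limit(offset + length);
    value.position(offset);
    String ret = StandardCharsets.UTF_8.decode(value).toString();
    value.limit(limit);
    value.position(position);
    return ret;
  }
}

The in-place variant avoids the extra allocation at the cost of mutating and then restoring the caller's buffer state, which is the trade-off the comment describes.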
@@ -27,8 +27,8 @@
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

import org.apache.parquet.bytes.ByteBufferInputStream;
import org.junit.Assert;
@@ -627,9 +627,8 @@ private void writeRepeated(int COUNT, ValuesWriter cw, String prefix) {
}
}

private void writeRepeatedWithReuse(int COUNT, ValuesWriter cw,
String prefix) throws UnsupportedEncodingException {
Binary reused = Binary.fromReusedByteArray((prefix + "0").getBytes("UTF-8"));
private void writeRepeatedWithReuse(int COUNT, ValuesWriter cw, String prefix) {
Binary reused = Binary.fromReusedByteArray((prefix + "0").getBytes(StandardCharsets.UTF_8));
for (int i = 0; i < COUNT; i++) {
Binary content = Binary.fromString(prefix + i % 10);
System.arraycopy(content.getBytesUnsafe(), 0, reused.getBytesUnsafe(), 0, reused.length());
@@ -24,6 +24,7 @@
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -34,6 +35,8 @@
public class BytesUtils {
private static final Logger LOG = LoggerFactory.getLogger(BytesUtils.class);

/** @deprecated Use {@link StandardCharsets#UTF_8} instead */
@Deprecated
public static final Charset UTF8 = Charset.forName("UTF-8");

/**
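Note: with the BytesUtils constant above deprecated, call sites are expected to switch to the JDK constant directly. A minimal before/after sketch, illustrative and not part of the patch:

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.parquet.bytes.BytesUtils;

public class Utf8Migration {
  public static void main(String[] args) {
    byte[] before = "hello".getBytes(BytesUtils.UTF8);        // deprecated constant
    byte[] after = "hello".getBytes(StandardCharsets.UTF_8);  // preferred replacement
    System.out.println(Arrays.equals(before, after));         // true: same charset either way
  }
}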
@@ -24,7 +24,7 @@
import static org.apache.parquet.hadoop.ParquetWriter.MAX_PADDING_SIZE_DEFAULT;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -101,7 +101,7 @@ public class ParquetFileWriter {

public static final String PARQUET_METADATA_FILE = "_metadata";
public static final String MAGIC_STR = "PAR1";
public static final byte[] MAGIC = MAGIC_STR.getBytes(Charset.forName("ASCII"));
public static final byte[] MAGIC = MAGIC_STR.getBytes(StandardCharsets.US_ASCII);
public static final String PARQUET_COMMON_METADATA_FILE = "_common_metadata";
public static final int CURRENT_VERSION = 1;

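Note: the MAGIC change above swaps Charset.forName("ASCII") for StandardCharsets.US_ASCII. "ASCII" is a registered alias of US-ASCII, so the written magic bytes are unchanged; a quick check, illustrative only (the class name MagicCheck is made up):

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class MagicCheck {
  public static void main(String[] args) {
    byte[] viaAlias = "PAR1".getBytes(Charset.forName("ASCII"));     // old form
    byte[] viaStandard = "PAR1".getBytes(StandardCharsets.US_ASCII); // new form
    System.out.println(Arrays.equals(viaAlias, viaStandard));        // true
    System.out.println(Arrays.toString(viaStandard));                // [80, 65, 82, 49]
  }
}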