From 78230e4132464f03903df3015b2c306ccc1bc33e Mon Sep 17 00:00:00 2001
From: liuxiaoyu
Date: Sat, 28 Feb 2026 14:51:12 +0800
Subject: [PATCH] fix: add Parquet UUID type support for read/write

Add UUID (OID 2950) handling to ParquetFileAccessor by mapping it to
BINARY with STRING logical type annotation, consistent with how ORC and
JDBC connectors handle UUID. UUIDARRAY is also supported automatically
through the existing array type framework.
---
Notes (below the "---" separator, so not part of the commit message):
  * ParquetFileAccessor: `case UUID:` joins the existing
    TIME/VARCHAR/BPCHAR/TEXT case group in the private method
    getTypeForColumnDescriptor().
  * ParquetFileAccessorTest: invokes getTypeForColumnDescriptor() through
    reflection (the method is private) and asserts that a UUID column
    yields a BINARY primitive with the string logical type annotation,
    and that a UUIDARRAY column yields an OPTIONAL group type.
  * ParquetResolverTest: exercises setFields() against a schema with a
    UUID column, once with a value and once with null.
  * ParquetUtilitiesTest: exercises parsePostgresArray() on a
    two-element UUID array literal.

 .../pxf/plugins/hdfs/ParquetFileAccessor.java |  1 +
 .../plugins/hdfs/ParquetFileAccessorTest.java | 39 ++++++++++
 .../pxf/plugins/hdfs/ParquetResolverTest.java | 72 +++++++++++++++++++
 .../hdfs/parquet/ParquetUtilitiesTest.java    |  9 +++
 4 files changed, 121 insertions(+)

diff --git a/server/pxf-hdfs/src/main/java/org/apache/cloudberry/pxf/plugins/hdfs/ParquetFileAccessor.java b/server/pxf-hdfs/src/main/java/org/apache/cloudberry/pxf/plugins/hdfs/ParquetFileAccessor.java
index c73b1a71..526a0096 100644
--- a/server/pxf-hdfs/src/main/java/org/apache/cloudberry/pxf/plugins/hdfs/ParquetFileAccessor.java
+++ b/server/pxf-hdfs/src/main/java/org/apache/cloudberry/pxf/plugins/hdfs/ParquetFileAccessor.java
@@ -582,6 +582,7 @@ private Type getTypeForColumnDescriptor(ColumnDescriptor columnDescriptor) {
                 logicalTypeAnnotation = LogicalTypeAnnotation.dateType();
                 break;
             case TIME:
+            case UUID:
             case VARCHAR:
             case BPCHAR:
             case TEXT:
diff --git a/server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/ParquetFileAccessorTest.java b/server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/ParquetFileAccessorTest.java
index 17ca9df3..18dedaed 100644
--- a/server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/ParquetFileAccessorTest.java
+++ b/server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/ParquetFileAccessorTest.java
@@ -1,11 +1,22 @@
 package org.apache.cloudberry.pxf.plugins.hdfs;
 
+import org.apache.parquet.schema.LogicalTypeAnnotation;
 import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.Type;
+import org.apache.cloudberry.pxf.api.io.DataType;
 import org.apache.cloudberry.pxf.api.model.RequestContext;
+import org.apache.cloudberry.pxf.api.utilities.ColumnDescriptor;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 
+import java.lang.reflect.Method;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 
 public class ParquetFileAccessorTest {
     ParquetFileAccessor accessor;
@@ -26,4 +37,32 @@ public void testInitialize() {
         accessor.setRequestContext(context);
         assertNull(context.getMetadata());
     }
+
+    @Test
+    public void testGetTypeForColumnDescriptor_UUID() throws Exception {
+        ColumnDescriptor uuidColumn = new ColumnDescriptor("id", DataType.UUID.getOID(), 0, "uuid", new Integer[]{});
+
+        Method method = ParquetFileAccessor.class.getDeclaredMethod("getTypeForColumnDescriptor", ColumnDescriptor.class);
+        method.setAccessible(true);
+        Type result = (Type) method.invoke(accessor, uuidColumn);
+
+        assertEquals("id", result.getName());
+        assertTrue(result.isPrimitive());
+        PrimitiveType primitiveType = result.asPrimitiveType();
+        assertEquals(PrimitiveType.PrimitiveTypeName.BINARY, primitiveType.getPrimitiveTypeName());
+        assertEquals(LogicalTypeAnnotation.stringType(), primitiveType.getLogicalTypeAnnotation());
+    }
+
+    @Test
+    public void testGetTypeForColumnDescriptor_UUIDArray() throws Exception {
+        ColumnDescriptor uuidArrayColumn = new ColumnDescriptor("ids", DataType.UUIDARRAY.getOID(), 0, "uuid[]", new Integer[]{});
+
+        Method method = ParquetFileAccessor.class.getDeclaredMethod("getTypeForColumnDescriptor", ColumnDescriptor.class);
+        method.setAccessible(true);
+        Type result = (Type) method.invoke(accessor, uuidArrayColumn);
+
+        assertEquals("ids", result.getName());
+        // array types are wrapped in a list group
+        assertTrue(result.asGroupType().isRepetition(Type.Repetition.OPTIONAL));
+    }
 }
diff --git a/server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/ParquetResolverTest.java b/server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/ParquetResolverTest.java
index 936978ff..a29a7f52 100644
--- a/server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/ParquetResolverTest.java
+++ b/server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/ParquetResolverTest.java
@@ -193,6 +193,78 @@ public void testSetFields_RightTrimCharNoLeftTrim() throws IOException {
         testSetFields_RightTrimCharHelper(" abcd ", " abc ", " abc");
     }
 
+    @Test
+    @SuppressWarnings("deprecation")
+    public void testSetFields_UUID() throws IOException {
+        List<Type> typeFields = new ArrayList<>();
+        typeFields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "id", org.apache.parquet.schema.OriginalType.UTF8));
+        typeFields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "value", org.apache.parquet.schema.OriginalType.UTF8));
+        schema = new MessageType("hive_schema", typeFields);
+        context.setMetadata(schema);
+
+        List<ColumnDescriptor> columnDescriptors = new ArrayList<>();
+        columnDescriptors.add(new ColumnDescriptor("id", DataType.UUID.getOID(), 0, "uuid", null));
+        columnDescriptors.add(new ColumnDescriptor("value", DataType.VARCHAR.getOID(), 1, "varchar", null));
+        context.setTupleDescription(columnDescriptors);
+
+        resolver.setRequestContext(context);
+        resolver.afterPropertiesSet();
+
+        String uuidValue = "a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11";
+        List<OneField> fields = new ArrayList<>();
+        fields.add(new OneField(DataType.TEXT.getOID(), uuidValue));
+        fields.add(new OneField(DataType.TEXT.getOID(), "test"));
+
+        OneRow row = resolver.setFields(fields);
+        assertNotNull(row);
+        Object data = row.getData();
+        assertNotNull(data);
+        assertTrue(data instanceof Group);
+        Group group = (Group) data;
+
+        // assert UUID value is stored as string in BINARY column
+        assertEquals(uuidValue, group.getString(0, 0));
+        assertEquals("test", group.getString(1, 0));
+
+        // assert value repetition count
+        for (int i = 0; i < 2; i++) {
+            assertEquals(1, group.getFieldRepetitionCount(i));
+        }
+    }
+
+    @Test
+    @SuppressWarnings("deprecation")
+    public void testSetFields_UUID_Null() throws IOException {
+        List<Type> typeFields = new ArrayList<>();
+        typeFields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "id", org.apache.parquet.schema.OriginalType.UTF8));
+        typeFields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "value", org.apache.parquet.schema.OriginalType.UTF8));
+        schema = new MessageType("hive_schema", typeFields);
+        context.setMetadata(schema);
+
+        List<ColumnDescriptor> columnDescriptors = new ArrayList<>();
+        columnDescriptors.add(new ColumnDescriptor("id", DataType.UUID.getOID(), 0, "uuid", null));
+        columnDescriptors.add(new ColumnDescriptor("value", DataType.VARCHAR.getOID(), 1, "varchar", null));
+        context.setTupleDescription(columnDescriptors);
+
+        resolver.setRequestContext(context);
+        resolver.afterPropertiesSet();
+
+        List<OneField> fields = new ArrayList<>();
+        fields.add(new OneField(DataType.TEXT.getOID(), null));
+        fields.add(new OneField(DataType.TEXT.getOID(), "test"));
+
+        OneRow row = resolver.setFields(fields);
+        assertNotNull(row);
+        Object data = row.getData();
+        assertNotNull(data);
+        assertTrue(data instanceof Group);
+        Group group = (Group) data;
+
+        // assert null UUID is not written (repetition count 0)
+        assertEquals(0, group.getFieldRepetitionCount(0));
+        assertEquals(1, group.getFieldRepetitionCount(1));
+    }
+
     @Test
     public void testSetFields_Primitive_Nulls() throws IOException {
         schema = getParquetSchemaForPrimitiveTypes(Type.Repetition.OPTIONAL, false);
diff --git a/server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/parquet/ParquetUtilitiesTest.java b/server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/parquet/ParquetUtilitiesTest.java
index c590b4ca..7503bd41 100644
--- a/server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/parquet/ParquetUtilitiesTest.java
+++ b/server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/parquet/ParquetUtilitiesTest.java
@@ -67,6 +67,15 @@ public void testParsePostgresArrayStringArray() {
         assertIterableEquals(Arrays.asList("fizz", "buzz", "fizzbuzz"), result);
     }
 
+    @Test
+    public void testParsePostgresArrayUuidArray() {
+        // GPDB UUID is a parquet BINARY primitive type with String annotation (same as text)
+        String value = "{a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11,b1ffcd00-0d1c-5f09-cc7e-7ccace491b22}";
+
+        List<Object> result = parquetUtilities.parsePostgresArray(value, PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.StringLogicalTypeAnnotation.stringType());
+        assertIterableEquals(Arrays.asList("a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11", "b1ffcd00-0d1c-5f09-cc7e-7ccace491b22"), result);
+    }
+
     @Test
     public void testParsePostgresArrayDateArray() {
         // GPDB Date is an parquet INT64 primitive type with String annotation