diff --git a/docs/source/python/api/formats.rst b/docs/source/python/api/formats.rst index a4f02084c4a6..57a5e824fab1 100644 --- a/docs/source/python/api/formats.rst +++ b/docs/source/python/api/formats.rst @@ -119,6 +119,8 @@ Encrypted Parquet Files KmsConnectionConfig EncryptionConfiguration DecryptionConfiguration + create_encryption_properties + create_decryption_properties .. _api.orc: diff --git a/python/pyarrow/_parquet_encryption.pxd b/python/pyarrow/_parquet_encryption.pxd index 48939fe277fe..1a12a6d6785e 100644 --- a/python/pyarrow/_parquet_encryption.pxd +++ b/python/pyarrow/_parquet_encryption.pxd @@ -20,6 +20,11 @@ from pyarrow.includes.common cimport * from pyarrow.includes.libparquet_encryption cimport * +from pyarrow.includes.libparquet cimport ( + CSecureString, + CFileDecryptionPropertiesBuilder, + CFileEncryptionPropertiesBuilder, +) from pyarrow._parquet cimport (ParquetCipher, CFileEncryptionProperties, CFileDecryptionProperties, diff --git a/python/pyarrow/_parquet_encryption.pyx b/python/pyarrow/_parquet_encryption.pyx index db6a6b56ac4c..729ce874217f 100644 --- a/python/pyarrow/_parquet_encryption.pyx +++ b/python/pyarrow/_parquet_encryption.pyx @@ -430,13 +430,13 @@ cdef class CryptoFactory(_Weakrefable): parquet_file_path : str, pathlib.Path, or None, default None Path to the parquet file to be encrypted. Only required when the internal_key_material attribute of EncryptionConfiguration is set - to False. Used to derive the path for storing key material + to False. Used to derive the path for storing key material specific to this parquet file. filesystem : FileSystem or None, default None - Used only when internal_key_material is set to False on + Used only when internal_key_material is set to False on EncryptionConfiguration. If None, the file system will be inferred - based on parquet_file_path. + based on parquet_file_path. 
Returns ------- @@ -491,7 +491,7 @@ cdef class CryptoFactory(_Weakrefable): filesystem : FileSystem or None, default None Used only when the parquet file uses external key material. If - None, the file system will be inferred based on parquet_file_path. + None, the file system will be inferred based on parquet_file_path. Returns ------- @@ -552,7 +552,7 @@ cdef class CryptoFactory(_Weakrefable): filesystem : FileSystem or None, default None Used only when the parquet file uses external key material. If - None, the file system will be inferred based on parquet_file_path. + None, the file system will be inferred based on parquet_file_path. double_wrapping : bool, default True In the single wrapping mode, encrypts data encryption keys with @@ -665,7 +665,7 @@ cdef class FileSystemKeyMaterialStore(_Weakrefable): filesystem : FileSystem, default None FileSystem where the parquet file is located. If None, - will be inferred based on parquet_file_path. + will be inferred based on parquet_file_path. Returns ------- @@ -711,3 +711,186 @@ cdef shared_ptr[CDecryptionConfiguration] pyarrow_unwrap_decryptionconfig(object if isinstance(decryptionconfig, DecryptionConfiguration): return ( decryptionconfig).unwrap() raise TypeError("Expected DecryptionConfiguration, got %s" % type(decryptionconfig)) + + +def create_decryption_properties( + footer_key, + *, + aad_prefix=None, + bint check_footer_integrity=True, + bint allow_plaintext_files=False, +): + """ + Create FileDecryptionProperties using a direct footer key. + + This bypasses the KMS-based :class:`CryptoFactory` API and directly + constructs decryption properties from a plaintext key. This is useful + when the caller manages key wrapping externally (e.g. via an + application-level envelope encryption scheme). + + For most use cases, prefer the higher-level :class:`CryptoFactory` + with :class:`DecryptionConfiguration`, which handles envelope + encryption and key rotation automatically. 
+ + Parameters + ---------- + footer_key : bytes + The decryption key for the file footer (and all columns if + uniform encryption was used). Must be 16, 24, or 32 bytes + for AES-128, AES-192, or AES-256 respectively. + aad_prefix : bytes, optional + Additional Authenticated Data prefix. Must match the AAD prefix + that was used during encryption. Required if the file was written + with ``store_aad_prefix=False``. + check_footer_integrity : bool, default True + Whether to verify footer integrity using the signature stored + in the file. Set to False only for debugging. + allow_plaintext_files : bool, default False + Whether to allow reading plaintext (unencrypted) files with + these decryption properties without raising an error. + + Returns + ------- + FileDecryptionProperties + Properties that can be passed to :func:`read_table`, + :class:`ParquetFile`, or + :class:`~pyarrow.dataset.ParquetFragmentScanOptions`. + + Examples + -------- + >>> import pyarrow.parquet as pq + >>> import pyarrow.parquet.encryption as pe + >>> props = pe.create_decryption_properties( + ... footer_key=b'0123456789abcdef', + ... aad_prefix=b'table_id' + ... 
) + >>> table = pq.read_table('encrypted.parquet', decryption_properties=props) + """ + cdef: + CSecureString c_footer_key + c_string c_aad_prefix + CFileDecryptionPropertiesBuilder* builder + shared_ptr[CFileDecryptionProperties] props + + footer_key_bytes = tobytes(footer_key) + if len(footer_key_bytes) not in (16, 24, 32): + raise ValueError( + f"footer_key must be 16, 24, or 32 bytes, got {len(footer_key_bytes)}" + ) + + c_footer_key = CSecureString(footer_key_bytes) + builder = new CFileDecryptionPropertiesBuilder() + + try: + builder.footer_key(c_footer_key) + + if aad_prefix is not None: + c_aad_prefix = tobytes(aad_prefix) + builder.aad_prefix(c_aad_prefix) + + if not check_footer_integrity: + builder.disable_footer_signature_verification() + + if allow_plaintext_files: + builder.plaintext_files_allowed() + + props = builder.build() + finally: + del builder + + return FileDecryptionProperties.wrap(props) + + +def create_encryption_properties( + footer_key, + *, + aad_prefix=None, + bint store_aad_prefix=True, + encryption_algorithm="AES_GCM_V1", + bint plaintext_footer=False, +): + """ + Create FileEncryptionProperties using a direct footer key. + + This bypasses the KMS-based :class:`CryptoFactory` API and directly + constructs encryption properties from a plaintext key. This is useful + when the caller manages key wrapping externally (e.g. via an + application-level envelope encryption scheme). + + For most use cases, prefer the higher-level :class:`CryptoFactory` + with :class:`EncryptionConfiguration`, which handles envelope + encryption, key rotation, and unique-per-file data keys + automatically. + + Parameters + ---------- + footer_key : bytes + The encryption key for the file footer (and all columns unless + per-column keys are specified). Must be 16, 24, or 32 bytes + for AES-128, AES-192, or AES-256 respectively. + aad_prefix : bytes, optional + Additional Authenticated Data prefix for cryptographic binding. 
+ store_aad_prefix : bool, default True + Whether to store the AAD prefix in the Parquet file metadata. + Set to False when the AAD prefix will be supplied externally + at read time. + Only meaningful when *aad_prefix* is provided. + encryption_algorithm : str, default "AES_GCM_V1" + Encryption algorithm. Either ``"AES_GCM_V1"`` or + ``"AES_GCM_CTR_V1"``. + plaintext_footer : bool, default False + Whether to leave the file footer unencrypted. When True, file + schema and column statistics are readable without a key. + + Returns + ------- + FileEncryptionProperties + Properties that can be passed to :func:`write_table` or + :class:`ParquetWriter`. + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.parquet as pq + >>> import pyarrow.parquet.encryption as pe + >>> props = pe.create_encryption_properties( + ... footer_key=b'0123456789abcdef', + ... aad_prefix=b'table_id', + ... store_aad_prefix=False + ... ) + >>> pq.write_table(table, 'encrypted.parquet', encryption_properties=props) + """ + cdef: + CSecureString c_footer_key + c_string c_aad_prefix + CFileEncryptionPropertiesBuilder* builder + shared_ptr[CFileEncryptionProperties] props + ParquetCipher cipher + + footer_key_bytes = tobytes(footer_key) + if len(footer_key_bytes) not in (16, 24, 32): + raise ValueError( + f"footer_key must be 16, 24, or 32 bytes, got {len(footer_key_bytes)}" + ) + + cipher = cipher_from_name(encryption_algorithm) + c_footer_key = CSecureString(footer_key_bytes) + builder = new CFileEncryptionPropertiesBuilder(c_footer_key) + + try: + builder.algorithm(cipher) + + if aad_prefix is not None: + c_aad_prefix = tobytes(aad_prefix) + builder.aad_prefix(c_aad_prefix) + if not store_aad_prefix: + builder.disable_aad_prefix_storage() + + if plaintext_footer: + builder.set_plaintext_footer() + + props = builder.build() + finally: + del builder + + return FileEncryptionProperties.wrap(props) diff --git a/python/pyarrow/includes/libparquet.pxd 
b/python/pyarrow/includes/libparquet.pxd
index bbbac67c02c0..36faa0114820 100644
--- a/python/pyarrow/includes/libparquet.pxd
+++ b/python/pyarrow/includes/libparquet.pxd
@@ -22,7 +22,8 @@ from pyarrow.includes.libarrow cimport (Type, CChunkedArray, CScalar, CSchema,
                                         CStatus, CTable, CMemoryPool, CBuffer,
                                         CKeyValueMetadata, CRandomAccessFile,
                                         COutputStream, CCacheOptions,
-                                        TimeUnit, CRecordBatchReader)
+                                        TimeUnit, CRecordBatchReader,
+                                        CSecureString)
 
 
 cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil:
@@ -633,6 +634,28 @@ cdef extern from "parquet/encryption/encryption.h" namespace "parquet" nogil:
             " parquet::FileDecryptionProperties":
         pass
 
+    cdef cppclass CFileDecryptionPropertiesBuilder\
+            " parquet::FileDecryptionProperties::Builder":  # Cython name for the nested C++ Builder class
+        CFileDecryptionPropertiesBuilder() except +  # "except +": a C++ exception is raised as a Python exception
+        CFileDecryptionPropertiesBuilder* footer_key(
+            CSecureString footer_key) except +  # each setter returns a builder pointer
+        CFileDecryptionPropertiesBuilder* aad_prefix(
+            c_string aad_prefix) except +
+        CFileDecryptionPropertiesBuilder* disable_footer_signature_verification() except +
+        CFileDecryptionPropertiesBuilder* plaintext_files_allowed() except +
+        shared_ptr[CFileDecryptionProperties] build() except +  # returns the shared_ptr to the built properties
+
     cdef cppclass CFileEncryptionProperties\
             " parquet::FileEncryptionProperties":
         pass
+
+    cdef cppclass CFileEncryptionPropertiesBuilder\
+            " parquet::FileEncryptionProperties::Builder":  # Cython name for the nested C++ Builder class
+        CFileEncryptionPropertiesBuilder(CSecureString footer_key) except +  # the footer key is a constructor argument here
+        CFileEncryptionPropertiesBuilder* set_plaintext_footer() except +  # each setter returns a builder pointer
+        CFileEncryptionPropertiesBuilder* algorithm(
+            ParquetCipher parquet_cipher) except +
+        CFileEncryptionPropertiesBuilder* aad_prefix(
+            c_string aad_prefix) except +
+        CFileEncryptionPropertiesBuilder* disable_aad_prefix_storage() except +
+        shared_ptr[CFileEncryptionProperties] build() except +  # returns the shared_ptr to the built properties
diff --git a/python/pyarrow/parquet/encryption.py b/python/pyarrow/parquet/encryption.py
index df6eed913fa5..ce95e5d45075 100644
--- a/python/pyarrow/parquet/encryption.py
+++
b/python/pyarrow/parquet/encryption.py
@@ -20,4 +20,6 @@
                                          EncryptionConfiguration,
                                          DecryptionConfiguration,
                                          KmsConnectionConfig,
-                                         KmsClient)
+                                         KmsClient,
+                                         create_encryption_properties,
+                                         create_decryption_properties)
diff --git a/python/pyarrow/tests/parquet/test_encryption.py b/python/pyarrow/tests/parquet/test_encryption.py
index 4e2fb069bd06..62127088ed7d 100644
--- a/python/pyarrow/tests/parquet/test_encryption.py
+++ b/python/pyarrow/tests/parquet/test_encryption.py
@@ -722,3 +722,181 @@ def test_encrypted_parquet_read_table(tempdir, data_table, basic_encryption_conf
     result_table = pq.read_table(
         tempdir, decryption_properties=file_decryption_properties)
     assert data_table.equals(result_table)
+
+
+class TestDirectKeyEncryption:
+    """Tests for create_encryption_properties / create_decryption_properties."""
+
+    KEY_128 = b"0123456789abcdef"
+    KEY_192 = b"0123456789abcdef01234567"
+    KEY_256 = b"0123456789abcdef0123456789abcdef"
+    AAD_PREFIX = b"test_aad_prefix"
+
+    def test_roundtrip_aes128(self, tempdir, data_table):
+        """A 16-byte footer key round-trips a table."""
+        path = tempdir / "direct_aes128.parquet"
+
+        enc_props = pe.create_encryption_properties(footer_key=self.KEY_128)
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        dec_props = pe.create_decryption_properties(footer_key=self.KEY_128)
+        result = pq.read_table(path, decryption_properties=dec_props)
+        assert data_table.equals(result)
+
+    def test_roundtrip_aes192(self, tempdir, data_table):
+        """A 24-byte footer key round-trips a table."""
+        path = tempdir / "direct_aes192.parquet"
+
+        enc_props = pe.create_encryption_properties(footer_key=self.KEY_192)
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        dec_props = pe.create_decryption_properties(footer_key=self.KEY_192)
+        result = pq.read_table(path, decryption_properties=dec_props)
+        assert data_table.equals(result)
+
+    def test_roundtrip_aes256(self, tempdir, data_table):
+        """A 32-byte footer key round-trips a table."""
+        path = tempdir / "direct_aes256.parquet"
+
+        enc_props = pe.create_encryption_properties(footer_key=self.KEY_256)
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        dec_props = pe.create_decryption_properties(footer_key=self.KEY_256)
+        result = pq.read_table(path, decryption_properties=dec_props)
+        assert data_table.equals(result)
+
+    def test_roundtrip_with_aad_prefix(self, tempdir, data_table):
+        """A matching AAD prefix on both sides round-trips a table."""
+        path = tempdir / "direct_aad.parquet"
+
+        enc_props = pe.create_encryption_properties(
+            footer_key=self.KEY_128,
+            aad_prefix=self.AAD_PREFIX,
+        )
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        dec_props = pe.create_decryption_properties(
+            footer_key=self.KEY_128,
+            aad_prefix=self.AAD_PREFIX,
+        )
+        result = pq.read_table(path, decryption_properties=dec_props)
+        assert data_table.equals(result)
+
+    def test_roundtrip_aad_prefix_not_stored(self, tempdir, data_table):
+        """When store_aad_prefix=False, reader must supply aad_prefix."""
+        path = tempdir / "direct_aad_not_stored.parquet"
+
+        enc_props = pe.create_encryption_properties(
+            footer_key=self.KEY_128,
+            aad_prefix=self.AAD_PREFIX,
+            store_aad_prefix=False,
+        )
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        # Reading without aad_prefix should fail
+        dec_props_no_aad = pe.create_decryption_properties(
+            footer_key=self.KEY_128,
+        )
+        with pytest.raises(IOError):
+            pq.read_table(path, decryption_properties=dec_props_no_aad)
+
+        # Reading with correct aad_prefix should succeed
+        dec_props = pe.create_decryption_properties(
+            footer_key=self.KEY_128,
+            aad_prefix=self.AAD_PREFIX,
+        )
+        result = pq.read_table(path, decryption_properties=dec_props)
+        assert data_table.equals(result)
+
+    def test_encrypted_file_has_pare_magic(self, tempdir, data_table):
+        """Files written with an encrypted footer start with b"PARE"."""
+        path = tempdir / "direct_magic.parquet"
+
+        enc_props = pe.create_encryption_properties(footer_key=self.KEY_128)
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        with open(path, "rb") as f:
+            magic = f.read(4)
+        assert magic == b"PARE"
+
+    def test_plaintext_footer(self, tempdir, data_table):
+        """plaintext_footer=True keeps the footer readable and round-trips."""
+        path = tempdir / "direct_plaintext_footer.parquet"
+
+        enc_props = pe.create_encryption_properties(
+            footer_key=self.KEY_128,
+            plaintext_footer=True,
+        )
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        # Plaintext-footer files keep the regular Parquet magic bytes.
+        with open(path, "rb") as f:
+            assert f.read(4) == b"PAR1"
+
+        dec_props = pe.create_decryption_properties(footer_key=self.KEY_128)
+        result = pq.read_table(path, decryption_properties=dec_props)
+        assert data_table.equals(result)
+
+    def test_aes_gcm_ctr_v1_algorithm(self, tempdir, data_table):
+        """The AES_GCM_CTR_V1 algorithm round-trips a table."""
+        path = tempdir / "direct_ctr.parquet"
+
+        enc_props = pe.create_encryption_properties(
+            footer_key=self.KEY_128,
+            encryption_algorithm="AES_GCM_CTR_V1",
+        )
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        dec_props = pe.create_decryption_properties(footer_key=self.KEY_128)
+        result = pq.read_table(path, decryption_properties=dec_props)
+        assert data_table.equals(result)
+
+    def test_wrong_key_fails(self, tempdir, data_table):
+        """Reading with a different key of the same length raises IOError."""
+        path = tempdir / "direct_wrong_key.parquet"
+
+        enc_props = pe.create_encryption_properties(footer_key=self.KEY_128)
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        wrong_key = b"fedcba9876543210"
+        dec_props = pe.create_decryption_properties(footer_key=wrong_key)
+        with pytest.raises(IOError):
+            pq.read_table(path, decryption_properties=dec_props)
+
+    def test_reading_without_decryption_fails(self, tempdir, data_table):
+        """Reading an encrypted file without properties raises IOError."""
+        path = tempdir / "direct_no_decrypt.parquet"
+
+        enc_props = pe.create_encryption_properties(footer_key=self.KEY_128)
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        with pytest.raises(IOError):
+            pq.read_table(path)
+
+    def test_allow_plaintext_files(self, tempdir, data_table):
+        """Plaintext file reads should work when allow_plaintext_files=True."""
+        path = tempdir / "plaintext.parquet"
+        pq.write_table(data_table, path)
+
+        dec_props = pe.create_decryption_properties(
+            footer_key=self.KEY_128,
+            allow_plaintext_files=True,
+        )
+        result = pq.read_table(path, decryption_properties=dec_props)
+        assert data_table.equals(result)
+
+    def test_invalid_key_length_raises(self):
+        """Keys that are not 16, 24, or 32 bytes are rejected up front."""
+        with pytest.raises(ValueError, match="16, 24, or 32 bytes"):
+            pe.create_encryption_properties(footer_key=b"short")
+
+        with pytest.raises(ValueError, match="16, 24, or 32 bytes"):
+            pe.create_decryption_properties(footer_key=b"short")
+
+    def test_invalid_algorithm_raises(self):
+        """An unknown encryption_algorithm name raises ValueError."""
+        with pytest.raises(ValueError):
+            pe.create_encryption_properties(
+                footer_key=self.KEY_128,
+                encryption_algorithm="INVALID",
+            )