Skip to content

Commit ae16b3d

Browse files
committed
Add ADLS support for PyArrow
1 parent e8e2c91 commit ae16b3d

File tree

4 files changed

+329
-197
lines changed

4 files changed

+329
-197
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,8 @@
8383
)
8484
from pyiceberg.expressions.visitors import visit as boolean_expression_visit
8585
from pyiceberg.io import (
86+
ADLS_ACCOUNT_KEY,
87+
ADLS_ACCOUNT_NAME,
8688
AWS_ACCESS_KEY_ID,
8789
AWS_REGION,
8890
AWS_ROLE_ARN,
@@ -366,6 +368,9 @@ def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSyste
366368
elif scheme in {"file"}:
367369
return self._initialize_local_fs()
368370

371+
elif scheme in {"abfs", "abfss"}:
372+
return self._initialize_adls_fs()
373+
369374
else:
370375
raise ValueError(f"Unrecognized filesystem type in URI: {scheme}")
371376

@@ -476,6 +481,14 @@ def _initialize_gcs_fs(self) -> FileSystem:
476481

477482
return GcsFileSystem(**gcs_kwargs)
478483

484+
def _initialize_adls_fs(self) -> FileSystem:
485+
from pyarrow.fs import AzureFileSystem
486+
487+
return AzureFileSystem(
488+
account_name=self.properties.get(ADLS_ACCOUNT_NAME),
489+
account_key=self.properties.get(ADLS_ACCOUNT_KEY),
490+
)
491+
479492
def _initialize_local_fs(self) -> FileSystem:
480493
return PyArrowLocalFileSystem()
481494

tests/conftest.py

Lines changed: 46 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@
4242
Generator,
4343
List,
4444
Optional,
45+
Tuple,
4546
)
4647

4748
import boto3
@@ -53,10 +54,13 @@
5354
from pyiceberg.catalog.noop import NoopCatalog
5455
from pyiceberg.expressions import BoundReference
5556
from pyiceberg.io import (
57+
ADLS_ACCOUNT_NAME,
58+
ADLS_CONNECTION_STRING,
5659
GCS_PROJECT_ID,
5760
GCS_SERVICE_HOST,
5861
GCS_TOKEN,
5962
GCS_TOKEN_EXPIRES_AT_MS,
63+
FileIO,
6064
fsspec,
6165
load_file_io,
6266
)
@@ -90,6 +94,7 @@
9094

9195
if TYPE_CHECKING:
9296
import pyarrow as pa
97+
from azure.storage.blob import BlobServiceClient
9398
from moto.server import ThreadedMotoServer # type: ignore
9499
from pyspark.sql import SparkSession
95100

@@ -2077,24 +2082,54 @@ def fixture_dynamodb(_aws_credentials: None) -> Generator[boto3.client, None, No
20772082
yield boto3.client("dynamodb", region_name="us-east-1")
20782083

20792084

2080-
@pytest.fixture
2081-
def adls_fsspec_fileio(request: pytest.FixtureRequest) -> Generator[FsspecFileIO, None, None]:
2082-
from azure.storage.blob import BlobServiceClient
2083-
2085+
def _get_account_name_and_connection_string(request: pytest.FixtureRequest) -> Tuple[str, str]:
20842086
azurite_url = request.config.getoption("--adls.endpoint")
20852087
azurite_account_name = request.config.getoption("--adls.account-name")
20862088
azurite_account_key = request.config.getoption("--adls.account-key")
20872089
azurite_connection_string = f"DefaultEndpointsProtocol=http;AccountName={azurite_account_name};AccountKey={azurite_account_key};BlobEndpoint={azurite_url}/{azurite_account_name};"
2088-
properties = {
2089-
"adls.connection-string": azurite_connection_string,
2090-
"adls.account-name": azurite_account_name,
2091-
}
2090+
2091+
return azurite_account_name, azurite_connection_string
2092+
2093+
2094+
def _setup_blob(azurite_connection_string: str) -> "BlobServiceClient":
2095+
from azure.storage.blob import BlobServiceClient
20922096

20932097
bbs = BlobServiceClient.from_connection_string(conn_str=azurite_connection_string)
2098+
2099+
# Recreate container if needed
2100+
if list(bbs.list_containers(name_starts_with="tests")):
2101+
bbs.delete_container("tests")
20942102
bbs.create_container("tests")
2095-
yield fsspec.FsspecFileIO(properties=properties)
2096-
bbs.delete_container("tests")
2097-
bbs.close()
2103+
2104+
return bbs
2105+
2106+
2107+
@pytest.fixture
2108+
def adls_fsspec_fileio(request: pytest.FixtureRequest) -> Generator[FileIO, None, None]:
2109+
azurite_account_name, azurite_connection_string = _get_account_name_and_connection_string(request)
2110+
2111+
with _setup_blob(azurite_connection_string):
2112+
yield FsspecFileIO(
2113+
properties={
2114+
ADLS_CONNECTION_STRING: azurite_connection_string,
2115+
ADLS_ACCOUNT_NAME: azurite_account_name,
2116+
}
2117+
)
2118+
2119+
2120+
@pytest.fixture
2121+
def adls_pyarrow_fileio(request: pytest.FixtureRequest) -> Generator[FileIO, None, None]:
2122+
from pyiceberg.io.pyarrow import PyArrowFileIO
2123+
2124+
azurite_account_name, azurite_connection_string = _get_account_name_and_connection_string(request)
2125+
2126+
with _setup_blob(azurite_connection_string):
2127+
yield PyArrowFileIO(
2128+
properties={
2129+
ADLS_CONNECTION_STRING: azurite_connection_string,
2130+
ADLS_ACCOUNT_NAME: azurite_account_name,
2131+
}
2132+
)
20982133

20992134

21002135
@pytest.fixture(scope="session")

0 commit comments

Comments
 (0)