From 5b24172e8eef51caad90866dc65c3544c788c856 Mon Sep 17 00:00:00 2001 From: helmiazizm Date: Mon, 2 Dec 2024 10:24:58 +0700 Subject: [PATCH 1/7] Added force virtual addressing configuration for S3. Also added oss and r2 protocol. --- mkdocs/docs/configuration.md | 1 + pyiceberg/io/__init__.py | 2 ++ pyiceberg/io/pyarrow.py | 6 +++++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 133f02060a..47bb144ff1 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -115,6 +115,7 @@ For the FileIO there are several configuration options available: | s3.region | us-west-2 | Sets the region of the bucket | | s3.proxy-uri | | Configure the proxy server to be used by the FileIO. | | s3.connect-timeout | 60.0 | Configure socket connection timeout, in seconds. | +| s3.force-virtual-addressing | False | Configure the style of requests. Set `False` to use path-style request and `True` for virtual-hosted-style request. | diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index 3769c31947..40186069d4 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -74,6 +74,7 @@ S3_SIGNER_ENDPOINT_DEFAULT = "v1/aws/s3/sign" S3_ROLE_ARN = "s3.role-arn" S3_ROLE_SESSION_NAME = "s3.role-session-name" +S3_FORCE_VIRTUAL_ADDRESSING = "s3.force-virtual-addressing" HDFS_HOST = "hdfs.host" HDFS_PORT = "hdfs.port" HDFS_USER = "hdfs.user" @@ -304,6 +305,7 @@ def delete(self, location: Union[str, InputFile, OutputFile]) -> None: "s3": [ARROW_FILE_IO, FSSPEC_FILE_IO], "s3a": [ARROW_FILE_IO, FSSPEC_FILE_IO], "s3n": [ARROW_FILE_IO, FSSPEC_FILE_IO], + "oss": [ARROW_FILE_IO], "gs": [ARROW_FILE_IO], "file": [ARROW_FILE_IO, FSSPEC_FILE_IO], "hdfs": [ARROW_FILE_IO], diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index bd4e969df4..3b014b86d8 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -108,6 +108,7 @@ S3_ROLE_SESSION_NAME, S3_SECRET_ACCESS_KEY, S3_SESSION_TOKEN, + S3_FORCE_VIRTUAL_ADDRESSING, FileIO, InputFile, InputStream, @@ -350,7 +351,7 @@ def parse_location(location: str) -> Tuple[str, str, str]: return uri.scheme, uri.netloc, f"{uri.netloc}{uri.path}" def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSystem: - if scheme in {"s3", "s3a", "s3n"}: + if scheme in {"s3", "s3a", "s3n", "oss", "r2"}: from pyarrow.fs import S3FileSystem client_kwargs: Dict[str, Any] = { @@ -373,6 +374,9 @@ def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSyste if session_name := get_first_property_value(self.properties, S3_ROLE_SESSION_NAME, AWS_ROLE_SESSION_NAME): client_kwargs["session_name"] = session_name + if force_virtual_addressing := self.properties.get(S3_FORCE_VIRTUAL_ADDRESSING): + client_kwargs["force_virtual_addressing"] = property_as_bool(self.properties, force_virtual_addressing, False) + return S3FileSystem(**client_kwargs) elif scheme in ("hdfs", "viewfs"): from pyarrow.fs import HadoopFileSystem From 360e305919c275ab0fd97c741aa75f5f3b0f2999 Mon Sep 17 00:00:00 2001 From: helmiazizm Date: Tue, 3 Dec 2024 09:50:54 +0700 Subject: [PATCH 2/7] Rewrote force virtual addressing as written in PyArrow documentation --- mkdocs/docs/configuration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 47bb144ff1..20a43466a8 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -115,7 +115,7 @@ For the FileIO there are several configuration options available: | s3.region | us-west-2 | Sets the region of the bucket | | s3.proxy-uri | | Configure the proxy server to be used by the FileIO. | | s3.connect-timeout | 60.0 | Configure socket connection timeout, in seconds. | -| s3.force-virtual-addressing | False | Configure the style of requests. Set `False` to use path-style request and `True` for virtual-hosted-style request. | +| s3.force-virtual-addressing | False | Whether to use virtual addressing of buckets. If true, then virtual addressing is always enabled. If false, then virtual addressing is only enabled if endpoint_override is empty. This can be used for non-AWS backends that only support virtual hosted-style access. | From 31291f44caccffb6798af1586faa11695780f5e6 Mon Sep 17 00:00:00 2001 From: helmiazizm Date: Tue, 3 Dec 2024 09:52:15 +0700 Subject: [PATCH 3/7] Added the missing r2 key value in schema_to_file_io --- pyiceberg/io/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index 40186069d4..95046bcf90 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -306,6 +306,7 @@ def delete(self, location: Union[str, InputFile, OutputFile]) -> None: "s3a": [ARROW_FILE_IO, FSSPEC_FILE_IO], "s3n": [ARROW_FILE_IO, FSSPEC_FILE_IO], "oss": [ARROW_FILE_IO], + "r2": [ARROW_FILE_IO], "gs": [ARROW_FILE_IO], "file": [ARROW_FILE_IO, FSSPEC_FILE_IO], "hdfs": [ARROW_FILE_IO], From 657555286a1d27e84b98c069bd08d459284bc446 Mon Sep 17 00:00:00 2001 From: helmiazizm Date: Tue, 3 Dec 2024 15:09:45 +0700 Subject: [PATCH 4/7] Removed R2 protocol for now --- pyiceberg/io/__init__.py | 1 - pyiceberg/io/pyarrow.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index 95046bcf90..40186069d4 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -306,7 +306,6 @@ def delete(self, location: Union[str, InputFile, OutputFile]) -> None: "s3a": [ARROW_FILE_IO, FSSPEC_FILE_IO], "s3n": [ARROW_FILE_IO, FSSPEC_FILE_IO], "oss": [ARROW_FILE_IO], - "r2": [ARROW_FILE_IO], "gs": [ARROW_FILE_IO], "file": [ARROW_FILE_IO, FSSPEC_FILE_IO], "hdfs": [ARROW_FILE_IO], diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 3b014b86d8..c6331faa51 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -351,7 +351,7 @@ def parse_location(location: str) -> Tuple[str, str, str]: return uri.scheme, uri.netloc, f"{uri.netloc}{uri.path}" def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSystem: - if scheme in {"s3", "s3a", "s3n", "oss", "r2"}: + if scheme in {"s3", "s3a", "s3n", "oss"}: from pyarrow.fs import S3FileSystem client_kwargs: Dict[str, Any] = { From 282bd47a229887d35590a1dc5136ff399315b4c5 Mon Sep 17 00:00:00 2001 From: helmiazizm Date: Wed, 4 Dec 2024 09:44:26 +0700 Subject: [PATCH 5/7] Linter fix --- pyiceberg/io/pyarrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index c6331faa51..7956a83242 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -102,13 +102,13 @@ S3_ACCESS_KEY_ID, S3_CONNECT_TIMEOUT, S3_ENDPOINT, + S3_FORCE_VIRTUAL_ADDRESSING, S3_PROXY_URI, S3_REGION, S3_ROLE_ARN, S3_ROLE_SESSION_NAME, S3_SECRET_ACCESS_KEY, S3_SESSION_TOKEN, - S3_FORCE_VIRTUAL_ADDRESSING, FileIO, InputFile, InputStream, From 7315525e8a29094722da15dff9d8c536930f95ac Mon Sep 17 00:00:00 2001 From: helmiazizm Date: Wed, 4 Dec 2024 10:57:05 +0700 Subject: [PATCH 6/7] Updated documentation for OSS support --- mkdocs/docs/configuration.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 20a43466a8..6a3b4746b4 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -88,6 +88,7 @@ Iceberg works with the concept of a FileIO which is a pluggable module for readi - **file**: `PyArrowFileIO` - **hdfs**: `PyArrowFileIO` - **abfs**, **abfss**: `FsspecFileIO` +- **oss**: `PyArrowFileIO` You can also set the FileIO explicitly: @@ -166,6 +167,23 @@ For the FileIO there are several configuration options available: | gcs.default-location | US | Configure the default location where buckets are created, like 'US' or 'EUROPE-WEST3'. | | gcs.version-aware | False | Configure whether to support object versioning on the GCS bucket. | + + +### Alibaba Cloud Object Storage Service (OSS) + + + +PyIceberg uses [S3FileSystem](https://arrow.apache.org/docs/python/generated/pyarrow.fs.S3FileSystem.html) class to connect to OSS bucket as the service is [compatible with S3 SDK](https://www.alibabacloud.com/help/en/oss/developer-reference/use-amazon-s3-sdks-to-access-oss) as long as the endpoint is addressed with virtual hosted style. + +| Key | Example | Description | +| -------------------- | ------------------- | ------------------------------------------------ | +| s3.endpoint | | Configure an endpoint of the OSS service for the FileIO to access. Be sure to use S3 compatible endpoint as given in the example. | +| s3.access-key-id | admin | Configure the static access key id used to access the FileIO. | +| s3.secret-access-key | password | Configure the static secret access key used to access the FileIO. | +| s3.session-token | AQoDYXdzEJr... | Configure the static session token used to access the FileIO. | +| s3.force-virtual-addressing | True | Whether to use virtual addressing of buckets. This must be set to True as OSS can only be accessed with virtual hosted style address. | + + ### PyArrow From 727666d967657cbed3f720ee09276a81f691f75b Mon Sep 17 00:00:00 2001 From: helmiazizm Date: Wed, 4 Dec 2024 11:26:38 +0700 Subject: [PATCH 7/7] Another linter fix --- mkdocs/docs/configuration.md | 1 - 1 file changed, 1 deletion(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 6a3b4746b4..1c88c7cb3b 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -183,7 +183,6 @@ PyIceberg uses [S3FileSystem](https://arrow.apache.org/docs/python/generated/pya | s3.session-token | AQoDYXdzEJr... | Configure the static session token used to access the FileIO. | | s3.force-virtual-addressing | True | Whether to use virtual addressing of buckets. This must be set to True as OSS can only be accessed with virtual hosted style address. | - ### PyArrow