From 523d9d4df0f07e86789df42dc0360e75d9a1d9a8 Mon Sep 17 00:00:00 2001 From: adrianqin Date: Thu, 14 Dec 2023 14:37:22 -0500 Subject: [PATCH 1/6] add timeout config --- pyiceberg/io/fsspec.py | 3 +++ pyiceberg/io/pyarrow.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index 97a01f238a..c2339f308e 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -127,6 +127,9 @@ def _s3(properties: Properties) -> AbstractFileSystem: if proxy_uri := properties.get(S3_PROXY_URI): config_kwargs["proxies"] = {"http": proxy_uri, "https": proxy_uri} + if connect_timeout := properties.get("connect_timeout"): + config_kwargs["connect_timeout"] = connect_timeout + fs = S3FileSystem(client_kwargs=client_kwargs, config_kwargs=config_kwargs) for event_name, event_function in register_events.items(): diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 67a16ebefb..be0fd24273 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -330,6 +330,9 @@ def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSyste if proxy_uri := self.properties.get(S3_PROXY_URI): client_kwargs["proxy_options"] = proxy_uri + if connect_timeout := self.properties.get("connect_timeout"): + client_kwargs["connect_timeout"] = connect_timeout + return S3FileSystem(**client_kwargs) elif scheme == "hdfs": from pyarrow.fs import HadoopFileSystem From 2509d8623dbe98f186e245ea297e23e5766b129a Mon Sep 17 00:00:00 2001 From: adrianqin Date: Thu, 14 Dec 2023 16:40:45 -0500 Subject: [PATCH 2/6] wrap into a constant --- pyiceberg/io/__init__.py | 1 + pyiceberg/io/fsspec.py | 3 ++- pyiceberg/io/pyarrow.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index b55a896284..409e8d5190 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -51,6 +51,7 @@ S3_SESSION_TOKEN = "s3.session-token" S3_REGION = "s3.region" S3_PROXY_URI = "s3.proxy-uri" +S3_CONNECT_TIMEOUT = "s3.connect_timeout" HDFS_HOST = "hdfs.host" HDFS_PORT = "hdfs.port" HDFS_USER = "hdfs.user" diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index c2339f308e..42bdcf2f9e 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -54,6 +54,7 @@ S3_REGION, S3_SECRET_ACCESS_KEY, S3_SESSION_TOKEN, + S3_CONNECT_TIMEOUT, FileIO, InputFile, InputStream, @@ -127,7 +128,7 @@ def _s3(properties: Properties) -> AbstractFileSystem: if proxy_uri := properties.get(S3_PROXY_URI): config_kwargs["proxies"] = {"http": proxy_uri, "https": proxy_uri} - if connect_timeout := properties.get("connect_timeout"): + if connect_timeout := properties.get(S3_CONNECT_TIMEOUT): config_kwargs["connect_timeout"] = connect_timeout fs = S3FileSystem(client_kwargs=client_kwargs, config_kwargs=config_kwargs) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index be0fd24273..aea099813a 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -97,6 +97,7 @@ S3_REGION, S3_SECRET_ACCESS_KEY, S3_SESSION_TOKEN, + S3_CONNECT_TIMEOUT, FileIO, InputFile, InputStream, @@ -330,7 +331,7 @@ def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSyste if proxy_uri := self.properties.get(S3_PROXY_URI): client_kwargs["proxy_options"] = proxy_uri - if connect_timeout := self.properties.get("connect_timeout"): + if connect_timeout := self.properties.get(S3_CONNECT_TIMEOUT): client_kwargs["connect_timeout"] = connect_timeout return S3FileSystem(**client_kwargs) From 388232a53df5a197b515389649336b8916e52d7a Mon Sep 17 00:00:00 2001 From: adrianqin Date: Thu, 14 Dec 2023 16:47:55 -0500 Subject: [PATCH 3/6] property format consistency --- pyiceberg/io/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index 409e8d5190..dc94dc8a4e 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -51,7 +51,7 @@ S3_SESSION_TOKEN = "s3.session-token" S3_REGION = "s3.region" S3_PROXY_URI = "s3.proxy-uri" -S3_CONNECT_TIMEOUT = "s3.connect_timeout" +S3_CONNECT_TIMEOUT = "s3.connect-timeout" HDFS_HOST = "hdfs.host" HDFS_PORT = "hdfs.port" HDFS_USER = "hdfs.user" From b2de3364e0c5dd505712b95741d8f5bade9da0f0 Mon Sep 17 00:00:00 2001 From: adrianqin Date: Mon, 18 Dec 2023 14:39:14 -0500 Subject: [PATCH 4/6] linting --- pyiceberg/io/fsspec.py | 2 +- pyiceberg/io/pyarrow.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index 42bdcf2f9e..18b1f5b885 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -49,12 +49,12 @@ GCS_TOKEN, GCS_VERSION_AWARE, S3_ACCESS_KEY_ID, + S3_CONNECT_TIMEOUT, S3_ENDPOINT, S3_PROXY_URI, S3_REGION, S3_SECRET_ACCESS_KEY, S3_SESSION_TOKEN, - S3_CONNECT_TIMEOUT, FileIO, InputFile, InputStream, diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index aea099813a..a537cf7a30 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -92,12 +92,12 @@ HDFS_PORT, HDFS_USER, S3_ACCESS_KEY_ID, + S3_CONNECT_TIMEOUT, S3_ENDPOINT, S3_PROXY_URI, S3_REGION, S3_SECRET_ACCESS_KEY, S3_SESSION_TOKEN, - S3_CONNECT_TIMEOUT, FileIO, InputFile, InputStream, From 0178597da31d00ede7de8c38ae0240ca6f795f43 Mon Sep 17 00:00:00 2001 From: jqin61 Date: Mon, 18 Dec 2023 15:22:53 -0500 Subject: [PATCH 5/6] update doc --- mkdocs/docs/configuration.md | 1 + 1 file changed, 1 insertion(+) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index fa861908f9..927ef5a333 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -74,6 +74,7 @@ For the FileIO there are several configuration options available: | s3.signer | bearer | Configure the signature version of the FileIO. | | s3.region | us-west-2 | Sets the region of the bucket | | s3.proxy-uri | http://my.proxy.com:8080 | Configure the proxy server to be used by the FileIO. | +| s3.connect-timeout | 60 | Configure socket connection timeout, in seconds. | ### HDFS From 16c128c5d3cd3ea79d4fd95850febc0deb6ceff4 Mon Sep 17 00:00:00 2001 From: jqin61 Date: Mon, 18 Dec 2023 18:34:17 -0500 Subject: [PATCH 6/6] make config sample more informative --- mkdocs/docs/configuration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 927ef5a333..801bb8fc87 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -74,7 +74,7 @@ For the FileIO there are several configuration options available: | s3.signer | bearer | Configure the signature version of the FileIO. | | s3.region | us-west-2 | Sets the region of the bucket | | s3.proxy-uri | http://my.proxy.com:8080 | Configure the proxy server to be used by the FileIO. | -| s3.connect-timeout | 60 | Configure socket connection timeout, in seconds. | +| s3.connect-timeout | 60.0 | Configure socket connection timeout, in seconds. | ### HDFS