diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index fa861908f9..801bb8fc87 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -74,6 +74,7 @@ For the FileIO there are several configuration options available: | s3.signer | bearer | Configure the signature version of the FileIO. | | s3.region | us-west-2 | Sets the region of the bucket | | s3.proxy-uri | http://my.proxy.com:8080 | Configure the proxy server to be used by the FileIO. | +| s3.connect-timeout | 60.0 | Configure the socket connection timeout, in seconds. | ### HDFS diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index b55a896284..dc94dc8a4e 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -51,6 +51,7 @@ S3_SESSION_TOKEN = "s3.session-token" S3_REGION = "s3.region" S3_PROXY_URI = "s3.proxy-uri" +S3_CONNECT_TIMEOUT = "s3.connect-timeout" HDFS_HOST = "hdfs.host" HDFS_PORT = "hdfs.port" HDFS_USER = "hdfs.user" diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index 97a01f238a..18b1f5b885 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -49,6 +49,7 @@ GCS_TOKEN, GCS_VERSION_AWARE, S3_ACCESS_KEY_ID, + S3_CONNECT_TIMEOUT, S3_ENDPOINT, S3_PROXY_URI, S3_REGION, @@ -127,6 +128,9 @@ def _s3(properties: Properties) -> AbstractFileSystem: if proxy_uri := properties.get(S3_PROXY_URI): config_kwargs["proxies"] = {"http": proxy_uri, "https": proxy_uri} + if connect_timeout := properties.get(S3_CONNECT_TIMEOUT): + config_kwargs["connect_timeout"] = connect_timeout + fs = S3FileSystem(client_kwargs=client_kwargs, config_kwargs=config_kwargs) for event_name, event_function in register_events.items(): diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 67a16ebefb..a537cf7a30 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -92,6 +92,7 @@ HDFS_PORT, HDFS_USER, S3_ACCESS_KEY_ID, + S3_CONNECT_TIMEOUT, S3_ENDPOINT, S3_PROXY_URI, S3_REGION, @@ -330,6 +331,9 @@ def 
_initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSyste if proxy_uri := self.properties.get(S3_PROXY_URI): client_kwargs["proxy_options"] = proxy_uri + if connect_timeout := self.properties.get(S3_CONNECT_TIMEOUT): + client_kwargs["connect_timeout"] = connect_timeout + return S3FileSystem(**client_kwargs) elif scheme == "hdfs": from pyarrow.fs import HadoopFileSystem