From e4c214ba374221e19bc25d257b38c852f85f2db1 Mon Sep 17 00:00:00 2001 From: toppyy <43851547+toppyy@users.noreply.github.com> Date: Fri, 13 Mar 2026 21:31:59 +0200 Subject: [PATCH 1/4] Add a working, more complete example of using a catalog --- docs/source/user-guide/data-sources.rst | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/docs/source/user-guide/data-sources.rst b/docs/source/user-guide/data-sources.rst index 26f1303c4..7d6e9d216 100644 --- a/docs/source/user-guide/data-sources.rst +++ b/docs/source/user-guide/data-sources.rst @@ -227,22 +227,30 @@ a :py:class:`~datafusion.context.SessionContext` comes with a single Catalog and with the names ``datafusion`` and ``default``, respectively. The default implementation uses an in-memory approach to the catalog and schema. We have support -for adding additional in-memory catalogs and schemas. This can be done like in the following +for adding additional in-memory catalogs and schemas. You can access tables registered in a schema +either through the Dataframe API or vial sql commands. This can be done like in the following example: .. code-block:: python from datafusion.catalog import Catalog, Schema + from datafusion import SessionContext + + ctx = SessionContext() - my_catalog = Catalog.memory_catalog() - my_schema = Schema.memory_schema() + my_catalog = Catalog.memory_catalog() + my_schema = Schema.memory_schema() my_catalog.register_schema("my_schema_name", my_schema) + ctx.register_catalog_provider("my_catalog_name", my_catalog) + + df = ctx.read_csv("pokemon.csv") + + my_schema.register_table('pokemon',df) - ctx.register_catalog("my_catalog_name", my_catalog) + pokemon = ctx.sql("SELECT * FROM my_catalog_name.my_schema_name.pokemon") -You could then register tables in ``my_schema`` and access them either through the DataFrame -API or via sql commands such as ``"SELECT * from my_catalog_name.my_schema_name.my_table"``. 
+ pokemon.show() User Defined Catalog and Schema ------------------------------- From b3123fc7adbb5e2af0e2f9f204b5809d8f2840d7 Mon Sep 17 00:00:00 2001 From: toppyy <43851547+toppyy@users.noreply.github.com> Date: Sat, 14 Mar 2026 19:48:51 +0200 Subject: [PATCH 2/4] the default schema is 'public', not 'default' --- docs/source/user-guide/data-sources.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user-guide/data-sources.rst b/docs/source/user-guide/data-sources.rst index 7d6e9d216..0ddf6a17e 100644 --- a/docs/source/user-guide/data-sources.rst +++ b/docs/source/user-guide/data-sources.rst @@ -224,7 +224,7 @@ A common technique for organizing tables is using a three level hierarchical app supports this form of organizing using the :py:class:`~datafusion.catalog.Catalog`, :py:class:`~datafusion.catalog.Schema`, and :py:class:`~datafusion.catalog.Table`. By default, a :py:class:`~datafusion.context.SessionContext` comes with a single Catalog and a single Schema -with the names ``datafusion`` and ``default``, respectively. +with the names ``datafusion`` and ``public``, respectively. The default implementation uses an in-memory approach to the catalog and schema. We have support for adding additional in-memory catalogs and schemas. You can access tables registered in a schema From 366696a50a9cdc7dca82fd0fe5f1d7f12a5258c9 Mon Sep 17 00:00:00 2001 From: toppyy <43851547+toppyy@users.noreply.github.com> Date: Thu, 19 Mar 2026 19:04:24 +0200 Subject: [PATCH 3/4] in-memory table instead of imaginary csv for standalone example --- docs/source/user-guide/data-sources.rst | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/docs/source/user-guide/data-sources.rst b/docs/source/user-guide/data-sources.rst index 0ddf6a17e..ce314baa5 100644 --- a/docs/source/user-guide/data-sources.rst +++ b/docs/source/user-guide/data-sources.rst @@ -233,24 +233,28 @@ example: .. 
code-block:: python + import pyarrow as pa from datafusion.catalog import Catalog, Schema from datafusion import SessionContext ctx = SessionContext() - my_catalog = Catalog.memory_catalog() - my_schema = Schema.memory_schema() - - my_catalog.register_schema("my_schema_name", my_schema) - ctx.register_catalog_provider("my_catalog_name", my_catalog) - - df = ctx.read_csv("pokemon.csv") + my_catalog = Catalog.memory_catalog() + my_schema = Schema.memory_schema() + my_catalog.register_schema('my_schema_name', my_schema) + ctx.register_catalog_provider('my_catalog_name', my_catalog) - my_schema.register_table('pokemon',df) + # Create an in-memory table + table = pa.table({ + 'name': ['Bulbasaur', 'Charmander', 'Squirtle'], + 'type': ['Grass', 'Fire', 'Water'], + 'hp': [45, 39, 44], + }) + df = ctx.create_dataframe([table.to_batches()], name='pokemon') - pokemon = ctx.sql("SELECT * FROM my_catalog_name.my_schema_name.pokemon") + my_schema.register_table('pokemon', df) - pokemon.show() + ctx.sql('SELECT * FROM my_catalog_name.my_schema_name.pokemon').show() User Defined Catalog and Schema ------------------------------- From 482490565a44bc68ab618b213127c982c2997618 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Topias=20Pyykk=C3=B6nen?= <43851547+toppyy@users.noreply.github.com> Date: Thu, 19 Mar 2026 19:04:49 +0200 Subject: [PATCH 4/4] typo fix Co-authored-by: Kevin Liu --- docs/source/user-guide/data-sources.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user-guide/data-sources.rst b/docs/source/user-guide/data-sources.rst index ce314baa5..48ff4c014 100644 --- a/docs/source/user-guide/data-sources.rst +++ b/docs/source/user-guide/data-sources.rst @@ -228,7 +228,7 @@ with the names ``datafusion`` and ``public``, respectively. The default implementation uses an in-memory approach to the catalog and schema. We have support for adding additional in-memory catalogs and schemas. 
You can access tables registered in a schema -either through the Dataframe API or vial sql commands. This can be done like in the following +either through the DataFrame API or via SQL commands. This can be done like in the following example: .. code-block:: python