Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ANALYZE TABLE #105

Merged
merged 16 commits into from
Jan 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dask_sql/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def __init__(self):
RelConverter.add_plugin_class(logical.LogicalTableScanPlugin, replace=False)
RelConverter.add_plugin_class(logical.LogicalUnionPlugin, replace=False)
RelConverter.add_plugin_class(logical.LogicalValuesPlugin, replace=False)
RelConverter.add_plugin_class(custom.AnalyzeTablePlugin, replace=False)
RelConverter.add_plugin_class(custom.CreateModelPlugin, replace=False)
RelConverter.add_plugin_class(custom.CreateTableAsPlugin, replace=False)
RelConverter.add_plugin_class(custom.CreateTablePlugin, replace=False)
Expand Down
8 changes: 8 additions & 0 deletions dask_sql/datacontainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,14 @@ def get_backend_by_frontend_index(self, index: int) -> str:
backend_column = self._frontend_backend_mapping[frontend_column]
return backend_column

def get_backend_by_frontend_name(self, column: str) -> str:
    """
    Return the dask (backend) column that the given
    frontend (SQL) column name refers to.
    """
    # Direct lookup in the frontend -> backend mapping; raises KeyError
    # for unknown frontend names, same as a plain dict access.
    return self._frontend_backend_mapping[column]

def make_unique(self, prefix="col"):
"""
Make sure we have unique column names by calling each column
Expand Down
2 changes: 2 additions & 0 deletions dask_sql/physical/rel/custom/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .analyze import AnalyzeTablePlugin
from .columns import ShowColumnsPlugin
from .create_model import CreateModelPlugin
from .create_table import CreateTablePlugin
Expand All @@ -9,6 +10,7 @@
from .tables import ShowTablesPlugin

__all__ = [
AnalyzeTablePlugin,
CreateModelPlugin,
CreateTableAsPlugin,
CreateTablePlugin,
Expand Down
64 changes: 64 additions & 0 deletions dask_sql/physical/rel/custom/analyze.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import pandas as pd
import dask.dataframe as dd

from dask_sql.physical.rel.base import BaseRelPlugin
from dask_sql.datacontainer import DataContainer, ColumnContainer
from dask_sql.mappings import python_to_sql_type
from dask_sql.utils import get_table_from_compound_identifier


class AnalyzeTablePlugin(BaseRelPlugin):
    """
    Show information on the table (like mean, max etc.)
    on all or a subset of the columns.
    The SQL is:

        ANALYZE TABLE <table> COMPUTE STATISTICS [FOR ALL COLUMNS | FOR COLUMNS a, b, ...]

    The result is also a table, although it is created on the fly.

    Please note: even though the syntax is very similar to e.g.
    [the spark version](https://spark.apache.org/docs/3.0.0/sql-ref-syntax-aux-analyze-table.html),
    this call does not help with query optimization (as the spark call would do),
    as this is currently not implemented in dask-sql.
    """

    class_name = "com.dask.sql.parser.SqlAnalyzeTable"

    def convert(
        self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
    ) -> DataContainer:
        """
        Compute describe()-style statistics (plus a data-type and a
        column-name row) for the requested columns of the given table
        and return them as a new, on-the-fly DataContainer.
        """
        components = list(map(str, sql.getTableName().names))
        dc = get_table_from_compound_identifier(context, components)
        columns = list(map(str, sql.getColumnList()))

        # An empty column list means "FOR ALL COLUMNS"
        if not columns:
            columns = dc.column_container.columns

        # Shortcut: translate a frontend (SQL) column name into the dask column
        mapping = dc.column_container.get_backend_by_frontend_name
        df = dc.df

        # Start from an empty frame carrying the frontend column names,
        # so the result always exposes the SQL-visible names.
        statistics = dd.from_pandas(
            pd.DataFrame({col: [] for col in columns}), npartitions=1
        )
        # describe() keeps the backend column names; rename them back to the
        # frontend names, otherwise the append would misalign the columns
        # (producing a union of backend and frontend names filled with NaNs)
        # whenever the two naming schemes differ.
        backend_columns = [mapping(col) for col in columns]
        statistics = statistics.append(
            df[backend_columns]
            .describe()
            .rename(columns=dict(zip(backend_columns, columns)))
        )

        # Add additional information: the SQL data type of each column ...
        statistics = statistics.append(
            pd.Series(
                {
                    col: str(python_to_sql_type(df[mapping(col)].dtype)).lower()
                    for col in columns
                },
                name="data_type",
            )
        )
        # ... and the column name itself (presto-compatible output format)
        statistics = statistics.append(
            pd.Series({col: col for col in columns}, name="col_name")
        )

        cc = ColumnContainer(statistics.columns)
        dc = DataContainer(statistics, cc)
        return dc
16 changes: 2 additions & 14 deletions dask_sql/physical/rel/custom/columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from dask_sql.physical.rel.base import BaseRelPlugin
from dask_sql.datacontainer import DataContainer, ColumnContainer
from dask_sql.mappings import python_to_sql_type
from dask_sql.utils import get_table_from_compound_identifier


class ShowColumnsPlugin(BaseRelPlugin):
Expand All @@ -22,20 +23,7 @@ def convert(
self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
components = list(map(str, sql.getTable().names))

# some queries might also include the database
# as we do not have such a concept, we just get rid of it
components = components[-2:]
tableName = components[-1]

if len(components) == 2:
if components[0] != context.schema_name:
raise AttributeError(f"Schema {components[0]} is not defined.")

try:
dc = context.tables[tableName]
except KeyError:
raise AttributeError(f"Table {tableName} is not defined.")
dc = get_table_from_compound_identifier(context, components)

cols = dc.column_container.columns
dtypes = list(map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))
Expand Down
24 changes: 24 additions & 0 deletions dask_sql/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from typing import List
import importlib
from typing import Any, Dict
from collections import defaultdict
Expand Down Expand Up @@ -185,6 +186,29 @@ def __str__(self):
return f"Literal: {df}"


def get_table_from_compound_identifier(
    context: "dask_sql.Context", components: List[str]
) -> DataContainer:
    """
    Helper function to return the correct table
    from the stored tables in the context
    with the given name (and maybe also schema name)
    """
    # A fully qualified identifier may also carry a database name; dask-sql
    # has no database concept, so only the last two parts (schema, table)
    # are relevant. schema_part is empty when only a table name was given.
    *schema_part, table_name = components[-2:]

    if schema_part and schema_part[0] != context.schema_name:
        raise AttributeError(f"Schema {schema_part[0]} is not defined.")

    if table_name not in context.tables:
        raise AttributeError(f"Table {table_name} is not defined.")

    return context.tables[table_name]


def convert_sql_kwargs(
sql_kwargs: "java.util.HashMap[org.apache.calcite.sql.SqlNode, org.apache.calcite.sql.SqlNode]",
) -> Dict[str, Any]:
Expand Down
2 changes: 1 addition & 1 deletion docs/pages/sql.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
SQL Syntax
==========

``dask-sql`` understands SQL in (mostly) postgreSQL syntax.
``dask-sql`` understands SQL in (mostly) Presto SQL syntax.
So far, not every valid SQL operator and keyword is already
implemented in ``dask-sql``, but a large fraction of it.
Have a look into our `issue tracker <https://github.com/nils-braun/dask-sql/issues>`_
Expand Down
118 changes: 90 additions & 28 deletions docs/pages/sql/describe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,19 @@ Metadata Information
====================

With these operations, it is possible to get information on the currently registered tables
and their columns:
and their columns.
The output format is mostly compatible with the presto format.

.. code-block:: sql
.. raw:: html

SHOW SCHEMAS
SHOW TABLES FROM "schema-name"
SHOW COLUMNS FROM "table-name"
DESCRIBE "table-name"
<div class="highlight"><pre>
<span class="k">SHOW SCHEMAS</span>
<span class="k">SHOW TABLES FROM</span> <span class="ss">&lt;schema-name&gt;</span>
<span class="k">SHOW COLUMNS FROM</span> <span class="ss">&lt;table-name&gt;</span>
<span class="k">DESCRIBE</span> <span class="ss">&lt;table-name&gt;</span>
<span class="k">ANALYZE TABLE</span> <span class="ss">&lt;table-name&gt;</span> <span class="k">COMPUTE STATISTICS</span>
[ <span class="k">FOR ALL COLUMNS</span> | <span class="k">FOR COLUMNS</span> <span class="ss">&lt;column&gt;</span> [, ...] ]
</pre></div>

See :ref:`sql` for information on how to reference schemas and tables correctly.

Expand All @@ -23,17 +28,21 @@ which is needed by some BI tools (which is empty).

Example:

.. code-block:: sql
.. raw:: html

SHOW SCHEMAS
<div class="highlight"><pre>
<span class="k">SHOW SCHEMAS</span>
</pre></div>

Result:

.. code-block:: none

Schema
0 schema
1 information_schema
+------------------------+
| Schema |
+========================+
| schema |
+------------------------+
| information_schema |
+------------------------+

``SHOW TABLES``
---------------
Expand All @@ -42,17 +51,19 @@ Show the registered tables in a given schema.

Example:

.. code-block:: sql
.. raw:: html

SHOW TABLES FROM "schema"
<div class="highlight"><pre>
<span class="k">SHOW TABLES FROM</span> <span class="ss">"schema"</span>
</pre></div>

Result:

.. code-block:: none

Table
0 timeseries

+------------+
| Table |
+============+
| timeseries |
+------------+

``SHOW COLUMNS`` and ``DESCRIBE``
---------------------------------
Expand All @@ -61,16 +72,67 @@ Show column information on a specific table.

Example:

.. code-block:: sql
.. raw:: html

SHOW COLUMNS FROM "timeseries"
<div class="highlight"><pre>
<span class="k">SHOW COLUMNS FROM</span> <span class="ss">"timeseries"</span>
</pre></div>

Result:

.. code-block:: none
+--------+---------+---------------+
| Column | Type | Extra Comment |
+========+=========+===============+
| id | bigint | |
+--------+---------+---------------+
| name | varchar | |
+--------+---------+---------------+
| x | double | |
+--------+---------+---------------+
| y | double | |
+--------+---------+---------------+

The column "Extra Comment" is shown for compatibility with Presto.


``ANALYZE TABLE``
-----------------

Calculate statistics on a given table (and the given columns or all columns)
and return it as a query result.
Please note that this process can be time-consuming on large tables.
Even though this statement is very similar to the ``ANALYZE TABLE`` statement in e.g. `Apache Spark <https://spark.apache.org/docs/3.0.0/sql-ref-syntax-aux-analyze-table.html>`_, it does not optimize subsequent queries (as its counterpart in Spark does).

Example:

.. raw:: html

<div class="highlight"><pre>
<span class="k">ANALYZE TABLE</span> <span class="ss">"timeseries"</span> <span class="k">COMPUTE STATISTICS</span> <span class="k">FOR COLUMNS</span> <span class="ss">x</span>, <span class="ss">y</span>
</pre></div>

Result:

Column Type Extra Comment
0 id bigint
1 name varchar
2 x double
3 y double
+-----------+-----------+-----------+
| | x | y |
+===========+===========+===========+
| count | 30 | 30 |
+-----------+-----------+-----------+
| mean | 0.140374 | -0.107481 |
+-----------+-----------+-----------+
| std | 0.568248 | 0.573106 |
+-----------+-----------+-----------+
| min | -0.795112 | -0.966043 |
+-----------+-----------+-----------+
| 25% | -0.379635 | -0.561234 |
+-----------+-----------+-----------+
| 50% | 0.0104101 | -0.237795 |
+-----------+-----------+-----------+
| 75% | 0.70208 | 0.263459 |
+-----------+-----------+-----------+
| max | 0.990747 | 0.947069 |
+-----------+-----------+-----------+
| data_type | double | double |
+-----------+-----------+-----------+
| col_name | x | y |
+-----------+-----------+-----------+
8 changes: 8 additions & 0 deletions planner/src/main/codegen/config.fmpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ data: {
"org.apache.calcite.sql.SqlCreate",
"org.apache.calcite.sql.SqlDrop",
"java.util.*",
"com.dask.sql.parser.SqlAnalyzeTable",
"com.dask.sql.parser.SqlCreateModel",
"com.dask.sql.parser.SqlCreateTable",
"com.dask.sql.parser.SqlCreateTableAs",
Expand All @@ -39,27 +40,34 @@ data: {

# List of keywords.
keywords: [
"ANALYZE"
"COLUMNS"
"COMPUTE"
"IF"
"MODEL"
"PREDICT"
"SCHEMAS"
"STATISTICS"
"TABLES"
]

# The keywords can only be used in a specific context,
# so we can keep them non-reserved
nonReservedKeywordsToAdd: [
"ANALYZE"
"COLUMNS"
"COMPUTE"
"IF"
"MODEL"
"PREDICT"
"SCHEMAS"
"STATISTICS"
"TABLES"
]

# List of methods for parsing custom SQL statements
statementParserMethods: [
"SqlAnalyzeTable()"
"SqlDescribeTable()"
"SqlShowColumns()"
"SqlShowSchemas()"
Expand Down
Loading