Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ANALYZE TABLE #105

Merged
merged 16 commits into from
Jan 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dask_sql/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def __init__(self):
RelConverter.add_plugin_class(logical.LogicalTableScanPlugin, replace=False)
RelConverter.add_plugin_class(logical.LogicalUnionPlugin, replace=False)
RelConverter.add_plugin_class(logical.LogicalValuesPlugin, replace=False)
RelConverter.add_plugin_class(custom.AnalyzeTablePlugin, replace=False)
RelConverter.add_plugin_class(custom.CreateModelPlugin, replace=False)
RelConverter.add_plugin_class(custom.CreateTableAsPlugin, replace=False)
RelConverter.add_plugin_class(custom.CreateTablePlugin, replace=False)
Expand Down
8 changes: 8 additions & 0 deletions dask_sql/datacontainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,14 @@ def get_backend_by_frontend_index(self, index: int) -> str:
backend_column = self._frontend_backend_mapping[frontend_column]
return backend_column

def get_backend_by_frontend_name(self, column: str) -> str:
    """
    Return the dask (backend) column that the given
    frontend (SQL) column name refers to.
    """
    # Direct lookup in the frontend -> backend mapping; raises KeyError
    # for unknown frontend names, same as a plain dict access.
    return self._frontend_backend_mapping[column]

def make_unique(self, prefix="col"):
"""
Make sure we have unique column names by calling each column
Expand Down
2 changes: 2 additions & 0 deletions dask_sql/physical/rel/custom/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .analyze import AnalyzeTablePlugin
from .columns import ShowColumnsPlugin
from .create_model import CreateModelPlugin
from .create_table import CreateTablePlugin
Expand All @@ -9,6 +10,7 @@
from .tables import ShowTablesPlugin

__all__ = [
AnalyzeTablePlugin,
CreateModelPlugin,
CreateTableAsPlugin,
CreateTablePlugin,
Expand Down
64 changes: 64 additions & 0 deletions dask_sql/physical/rel/custom/analyze.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import pandas as pd
import dask.dataframe as dd

from dask_sql.physical.rel.base import BaseRelPlugin
from dask_sql.datacontainer import DataContainer, ColumnContainer
from dask_sql.mappings import python_to_sql_type
from dask_sql.utils import get_table_from_compound_identifier


class AnalyzeTablePlugin(BaseRelPlugin):
    """
    Show information on the table (like mean, max etc.)
    on all or a subset of the columns.
    The SQL is:

        ANALYZE TABLE <table> COMPUTE STATISTICS [FOR ALL COLUMNS | FOR COLUMNS a, b, ...]

    The result is also a table, although it is created on the fly.

    Please note: even though the syntax is very similar to e.g.
    [the spark version](https://spark.apache.org/docs/3.0.0/sql-ref-syntax-aux-analyze-table.html),
    this call does not help with query optimization (as the spark call would do),
    as this is currently not implemented in dask-sql.
    """

    class_name = "com.dask.sql.parser.SqlAnalyzeTable"

    def convert(
        self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
    ) -> DataContainer:
        """
        Compute describe()-style statistics (plus a data-type and a
        column-name row) for the requested columns of the given table
        and return them as a new, on-the-fly DataContainer.
        """
        components = list(map(str, sql.getTableName().names))
        dc = get_table_from_compound_identifier(context, components)
        columns = list(map(str, sql.getColumnList()))

        # An empty column list means "FOR ALL COLUMNS"
        if not columns:
            columns = dc.column_container.columns

        # Shortcut: translate a frontend (SQL) column name into the dask column
        mapping = dc.column_container.get_backend_by_frontend_name
        df = dc.df

        # Start from an empty frame carrying the frontend column names,
        # so the result always exposes the SQL-visible names.
        statistics = dd.from_pandas(
            pd.DataFrame({col: [] for col in columns}), npartitions=1
        )
        # describe() keeps the backend column names; rename them back to the
        # frontend names, otherwise the append would misalign the columns
        # (producing a union of backend and frontend names filled with NaNs)
        # whenever the two naming schemes differ.
        backend_columns = [mapping(col) for col in columns]
        statistics = statistics.append(
            df[backend_columns]
            .describe()
            .rename(columns=dict(zip(backend_columns, columns)))
        )

        # Add additional information: the SQL data type of each column ...
        statistics = statistics.append(
            pd.Series(
                {
                    col: str(python_to_sql_type(df[mapping(col)].dtype)).lower()
                    for col in columns
                },
                name="data_type",
            )
        )
        # ... and the column name itself (presto-compatible output format)
        statistics = statistics.append(
            pd.Series({col: col for col in columns}, name="col_name")
        )

        cc = ColumnContainer(statistics.columns)
        dc = DataContainer(statistics, cc)
        return dc
16 changes: 2 additions & 14 deletions dask_sql/physical/rel/custom/columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from dask_sql.physical.rel.base import BaseRelPlugin
from dask_sql.datacontainer import DataContainer, ColumnContainer
from dask_sql.mappings import python_to_sql_type
from dask_sql.utils import get_table_from_compound_identifier


class ShowColumnsPlugin(BaseRelPlugin):
Expand All @@ -22,20 +23,7 @@ def convert(
self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
components = list(map(str, sql.getTable().names))

# some queries might also include the database
# as we do not have such a concept, we just get rid of it
components = components[-2:]
tableName = components[-1]

if len(components) == 2:
if components[0] != context.schema_name:
raise AttributeError(f"Schema {components[0]} is not defined.")

try:
dc = context.tables[tableName]
except KeyError:
raise AttributeError(f"Table {tableName} is not defined.")
dc = get_table_from_compound_identifier(context, components)

cols = dc.column_container.columns
dtypes = list(map(lambda x: str(python_to_sql_type(x)).lower(), dc.df.dtypes))
Expand Down
24 changes: 24 additions & 0 deletions dask_sql/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from typing import List
import importlib
from typing import Any, Dict
from collections import defaultdict
Expand Down Expand Up @@ -185,6 +186,29 @@ def __str__(self):
return f"Literal: {df}"


def get_table_from_compound_identifier(
    context: "dask_sql.Context", components: List[str]
) -> DataContainer:
    """
    Helper function to return the correct table
    from the stored tables in the context
    with the given name (and maybe also schema name)
    """
    # A fully qualified identifier may also carry a database name; dask-sql
    # has no database concept, so only the last two parts (schema, table)
    # are relevant. schema_part is empty when only a table name was given.
    *schema_part, table_name = components[-2:]

    if schema_part and schema_part[0] != context.schema_name:
        raise AttributeError(f"Schema {schema_part[0]} is not defined.")

    if table_name not in context.tables:
        raise AttributeError(f"Table {table_name} is not defined.")

    return context.tables[table_name]


def convert_sql_kwargs(
sql_kwargs: "java.util.HashMap[org.apache.calcite.sql.SqlNode, org.apache.calcite.sql.SqlNode]",
) -> Dict[str, Any]:
Expand Down
2 changes: 1 addition & 1 deletion docs/pages/sql.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
SQL Syntax
==========

``dask-sql`` understands SQL in (mostly) postgreSQL syntax.
``dask-sql`` understands SQL in (mostly) Presto SQL syntax.
So far, not every valid SQL operator and keyword is already
implemented in ``dask-sql``, but a large fraction of it.
Have a look into our `issue tracker <https://github.com/nils-braun/dask-sql/issues>`_
Expand Down
118 changes: 90 additions & 28 deletions docs/pages/sql/describe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,19 @@ Metadata Information
====================

With these operations, it is possible to get information on the currently registered tables
and their columns:
and their columns.
The output format is mostly compatible with the presto format.

.. code-block:: sql
.. raw:: html

SHOW SCHEMAS
SHOW TABLES FROM "schema-name"
SHOW COLUMNS FROM "table-name"
DESCRIBE "table-name"
<div class="highlight"><pre>
<span class="k">SHOW SCHEMAS</span>
<span class="k">SHOW TABLES FROM</span> <span class="ss">&lt;schema-name&gt;</span>
<span class="k">SHOW COLUMNS FROM</span> <span class="ss">&lt;table-name&gt;</span>
<span class="k">DESCRIBE</span> <span class="ss">&lt;table-name&gt;</span>
<span class="k">ANALYZE TABLE</span> <span class="ss">&lt;table-name&gt;</span> <span class="k">COMPUTE STATISTICS</span>
[ <span class="k">FOR ALL COLUMNS</span> | <span class="k">FOR COLUMNS</span> <span class="ss">&lt;column&gt;</span> [, ...] ]
</pre></div>

See :ref:`sql` for information on how to reference schemas and tables correctly.

Expand All @@ -23,17 +28,21 @@ which is needed by some BI tools (which is empty).

Example:

.. code-block:: sql
.. raw:: html

SHOW SCHEMAS
<div class="highlight"><pre>
<span class="k">SHOW SCHEMAS</span>
</pre></div>

Result:

.. code-block:: none

Schema
0 schema
1 information_schema
+------------------------+
| Schema |
+========================+
| schema |
+------------------------+
| information_schema |
+------------------------+

``SHOW TABLES``
---------------
Expand All @@ -42,17 +51,19 @@ Show the registered tables in a given schema.

Example:

.. code-block:: sql
.. raw:: html

SHOW TABLES FROM "schema"
<div class="highlight"><pre>
<span class="k">SHOW TABLES FROM</span> <span class="ss">"schema"</span>
</pre></div>

Result:

.. code-block:: none

Table
0 timeseries

+------------+
| Table |
+============+
| timeseries |
+------------+

``SHOW COLUMNS`` and ``DESCRIBE``
---------------------------------
Expand All @@ -61,16 +72,67 @@ Show column information on a specific table.

Example:

.. code-block:: sql
.. raw:: html

SHOW COLUMNS FROM "timeseries"
<div class="highlight"><pre>
<span class="k">SHOW COLUMNS FROM</span> <span class="ss">"timeseries"</span>
</pre></div>

Result:

.. code-block:: none
+--------+---------+---------------+
| Column | Type | Extra Comment |
+========+=========+===============+
| id | bigint | |
+--------+---------+---------------+
| name | varchar | |
+--------+---------+---------------+
| x | double | |
+--------+---------+---------------+
| y | double | |
+--------+---------+---------------+

The column "Extra Comment" is shown for compatibility with Presto.


``ANALYZE TABLE``
-----------------

Calculate statistics on a given table (and the given columns or all columns)
and return it as a query result.
Please note that this process can be time-consuming on large tables.
Even though this statement is very similar to the ``ANALYZE TABLE`` statement in e.g. `Apache Spark <https://spark.apache.org/docs/3.0.0/sql-ref-syntax-aux-analyze-table.html>`_, it does not optimize subsequent queries (as its counterpart in Spark does).

Example:

.. raw:: html

<div class="highlight"><pre>
<span class="k">ANALYZE TABLE</span> <span class="ss">"timeseries"</span> <span class="k">COMPUTE STATISTICS</span> <span class="k">FOR COLUMNS</span> <span class="ss">x</span>, <span class="ss">y</span>
</pre></div>

Result:

Column Type Extra Comment
0 id bigint
1 name varchar
2 x double
3 y double
+-----------+-----------+-----------+
| | x | y |
+===========+===========+===========+
| count | 30 | 30 |
+-----------+-----------+-----------+
| mean | 0.140374 | -0.107481 |
+-----------+-----------+-----------+
| std | 0.568248 | 0.573106 |
+-----------+-----------+-----------+
| min | -0.795112 | -0.966043 |
+-----------+-----------+-----------+
| 25% | -0.379635 | -0.561234 |
+-----------+-----------+-----------+
| 50% | 0.0104101 | -0.237795 |
+-----------+-----------+-----------+
| 75% | 0.70208 | 0.263459 |
+-----------+-----------+-----------+
| max | 0.990747 | 0.947069 |
+-----------+-----------+-----------+
| data_type | double | double |
+-----------+-----------+-----------+
| col_name | x | y |
+-----------+-----------+-----------+
8 changes: 8 additions & 0 deletions planner/src/main/codegen/config.fmpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ data: {
"org.apache.calcite.sql.SqlCreate",
"org.apache.calcite.sql.SqlDrop",
"java.util.*",
"com.dask.sql.parser.SqlAnalyzeTable",
"com.dask.sql.parser.SqlCreateModel",
"com.dask.sql.parser.SqlCreateTable",
"com.dask.sql.parser.SqlCreateTableAs",
Expand All @@ -39,27 +40,34 @@ data: {

# List of keywords.
keywords: [
"ANALYZE"
"COLUMNS"
"COMPUTE"
"IF"
"MODEL"
"PREDICT"
"SCHEMAS"
"STATISTICS"
"TABLES"
]

# The keywords can only be used in a specific context,
# so we can keep them non-reserved
nonReservedKeywordsToAdd: [
"ANALYZE"
"COLUMNS"
"COMPUTE"
"IF"
"MODEL"
"PREDICT"
"SCHEMAS"
"STATISTICS"
"TABLES"
]

# List of methods for parsing custom SQL statements
statementParserMethods: [
"SqlAnalyzeTable()"
"SqlDescribeTable()"
"SqlShowColumns()"
"SqlShowSchemas()"
Expand Down
Loading