Collect and use multi-column dependency stats

Follow on patch in the multi-variate statistics patch series. CREATE STATISTICS s1 WITH (dependencies) ON (a, b) FROM t; ANALYZE; will collect dependency stats on (a, b) and then use the measured dependency in subsequent query planning. Commit 7b504eb added CREATE STATISTICS with n-distinct coefficients. These are now specified using the mutually exclusive option WITH (ndistinct). Author: Tomas Vondra, David Rowley Reviewed-by: Kyotaro HORIGUCHI, Álvaro Herrera, Dean Rasheed, Robert Haas and many other comments and contributions Discussion: https://postgr.es/m/[email protected]
sunchao · Apr 5, 2017 · 2686ee1 · 2686ee1
1 parent 00b6b6f
commit 2686ee1
Show file tree

Hide file tree

Showing 31 changed files with 2,035 additions and 79 deletions.
diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c
@@ -1013,6 +1013,7 @@ estimate_size(PlannerInfo *root, RelOptInfo *baserel,
  baserel->baserestrictinfo,
  0,
  JOIN_INNER,
+ NULL,
  NULL);
 
  nrows = clamp_row_est(nrows);

diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
@@ -591,6 +591,7 @@ postgresGetForeignRelSize(PlannerInfo *root,
  fpinfo->local_conds,
  baserel->relid,
  JOIN_INNER,
+ NULL,
  NULL);
 
  cost_qual_eval(&fpinfo->local_conds_cost, fpinfo->local_conds, root);
@@ -2572,6 +2573,7 @@ estimate_path_cost_size(PlannerInfo *root,
  local_param_join_conds,
  foreignrel->relid,
  JOIN_INNER,
+ NULL,
  NULL);
  local_sel *= fpinfo->local_conds_sel;
 
@@ -4455,6 +4457,7 @@ postgresGetForeignJoinPaths(PlannerInfo *root,
  fpinfo->local_conds,
  0,
  JOIN_INNER,
+ NULL,
  NULL);
  cost_qual_eval(&fpinfo->local_conds_cost, fpinfo->local_conds, root);
 
@@ -4465,7 +4468,7 @@ postgresGetForeignJoinPaths(PlannerInfo *root,
  if (!fpinfo->use_remote_estimate)
  fpinfo->joinclause_sel = clauselist_selectivity(root, fpinfo->joinclauses,
  0, fpinfo->jointype,
- extra->sjinfo);
+ extra->sjinfo, NULL);
 
  /* Estimate costs for bare join relation */
  estimate_path_cost_size(root, joinrel, NIL, NIL, &rows,

diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
@@ -4339,6 +4339,15 @@
  </entry>
  </row>
 
+ <row>
+ <entry><structfield>stadependencies</structfield></entry>
+ <entry><type>pg_dependencies</type></entry>
+ <entry></entry>
+ <entry>
+ Functional dependencies, serialized as <structname>pg_dependencies</> type.
+ </entry>
+ </row>
+
  </tbody>
  </tgroup>
  </table>

diff --git a/doc/src/sgml/planstats.sgml b/doc/src/sgml/planstats.sgml
@@ -446,6 +446,160 @@ rows = (outer_cardinality * inner_cardinality) * selectivity
  in <filename>src/backend/utils/adt/selfuncs.c</filename>.
  </para>
 
+ <sect2 id="functional-dependencies">
+ <title>Functional Dependencies</title>
+
+ <para>
+ The simplest type of extended statistics are functional dependencies,
+ used in definitions of database normal forms. When simplified, saying that
+ <literal>b</> is functionally dependent on <literal>a</> means that
+ knowledge of value of <literal>a</> is sufficient to determine value of
+ <literal>b</>.
+ </para>
+
+ <para>
+ In normalized databases, only functional dependencies on primary keys
+ and superkeys are allowed. However, in practice, many data sets are not
+ fully normalized, for example, due to intentional denormalization for
+ performance reasons.
+ </para>
+
+ <para>
+ Functional dependencies directly affect accuracy of the estimates, as
+ conditions on the dependent column(s) do not restrict the result set,
+ resulting in underestimates.
+ </para>
+
+ <para>
+ To inform the planner about the functional dependencies, we collect
+ measurements of dependency during <command>ANALYZE</>. Assessing
+ dependency between all sets of columns would be prohibitively
+ expensive, so we limit our search to potential dependencies defined
+ using the <command>CREATE STATISTICS</> command.
+
+<programlisting>
+CREATE TABLE t (a INT, b INT);
+INSERT INTO t SELECT i/100, i/100 FROM generate_series(1,10000) s(i);
+CREATE STATISTICS s1 WITH (dependencies) ON (a, b) FROM t;
+ANALYZE t;
+EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 1;
+ QUERY PLAN 
+-------------------------------------------------------------------------------------------------
+ Seq Scan on t (cost=0.00..195.00 rows=100 width=8) (actual time=0.095..3.118 rows=100 loops=1)
+ Filter: ((a = 1) AND (b = 1))
+ Rows Removed by Filter: 9900
+ Planning time: 0.367 ms
+ Execution time: 3.380 ms
+(5 rows)
+</programlisting>
+
+ The planner is now aware of the functional dependencies and considers
+ them when computing the selectivity of the second condition. Running
+ the query without the statistics would lead to quite different estimates.
+
+<programlisting>
+DROP STATISTICS s1;
+EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 1;
+ QUERY PLAN 
+-----------------------------------------------------------------------------------------------
+ Seq Scan on t (cost=0.00..195.00 rows=1 width=8) (actual time=0.000..6.379 rows=100 loops=1)
+ Filter: ((a = 1) AND (b = 1))
+ Rows Removed by Filter: 9900
+ Planning time: 0.000 ms
+ Execution time: 6.379 ms
+(5 rows)
+</programlisting>
+ </para>
+
+ <para>
+ If no dependency exists, the collected statistics do not influence the
+ query plan. The only effect is to slow down <command>ANALYZE</>. Should
+ partial dependencies exist these will also be stored and applied
+ during planning.
+ </para>
+
+ <para>
+ Similarly to per-column statistics, extended statistics are stored in
+ a system catalog called <structname>pg_statistic_ext</structname>, but
+ there is also a more convenient view <structname>pg_stats_ext</structname>.
+ To inspect the statistics <literal>s1</literal> defined above,
+ you may do this:
+
+<programlisting>
+SELECT tablename, staname, attnums, depsbytes
+ FROM pg_stats_ext WHERE staname = 's1';
+ tablename | staname | attnums | depsbytes 
+-----------+---------+---------+-----------
+ t | s1 | 1 2 | 40
+(1 row)
+</programlisting>
+
+ This shows that the statistics are defined on table <structname>t</>,
+ <structfield>attnums</structfield> lists attribute numbers of columns
+ (references <structname>pg_attribute</structname>). It also shows
+ the length in bytes of the functional dependencies, as found by
+ <command>ANALYZE</> when serialized into a <literal>bytea</> column.
+ </para>
+
+ <para>
+ When computing the selectivity, the planner inspects all conditions and
+ attempts to identify which conditions are already implied by other
+ conditions. The selectivity estimates from any redundant conditions are
+ ignored from a selectivity point of view. In the example query above,
+ the selectivity estimates for either of the conditions may be eliminated,
+ thus improving the overall estimate.
+ </para>
+
+ <sect3 id="functional-dependencies-limitations">
+ <title>Limitations of functional dependencies</title>
+
+ <para>
+ Functional dependencies are a very simple type of statistics, and
+ as such have several limitations. The first limitation is that they
+ only work with simple equality conditions, comparing columns and constant
+ values. It's not possible to use them to eliminate equality conditions
+ comparing two columns or a column to an expression, range clauses,
+ <literal>LIKE</> or any other type of conditions.
+ </para>
+
+ <para>
+ When eliminating the implied conditions, the planner assumes that the
+ conditions are compatible. Consider the following example, violating
+ this assumption:
+
+<programlisting>
+EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 10;
+ QUERY PLAN
+-----------------------------------------------------------------------------------------------
+ Seq Scan on t (cost=0.00..195.00 rows=100 width=8) (actual time=2.992..2.992 rows=0 loops=1)
+ Filter: ((a = 1) AND (b = 10))
+ Rows Removed by Filter: 10000
+ Planning time: 0.232 ms
+ Execution time: 3.033 ms
+(5 rows)
+</programlisting>
+
+ While there are no rows with such combination of values, the planner
+ is unable to verify whether the values match - it only knows that
+ the columns are functionally dependent.
+ </para>
+
+ <para>
+ This assumption is more about queries executed on the database - in many
+ cases, it's actually satisfied (e.g. when the GUI only allows selecting
+ compatible values). But if that's not the case, functional dependencies
+ may not be a viable option.
+ </para>
+
+ <para>
+ For additional information about functional dependencies, see
+ <filename>src/backend/statistics/README.dependencies</>.
+ </para>
+
+ </sect3>
+
+ </sect2>
+
  </sect1>
 
 </chapter>
diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml
@@ -21,8 +21,9 @@ PostgreSQL documentation
 
  <refsynopsisdiv>
 <synopsis>
-CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_name</replaceable> ON (
- <replaceable class="PARAMETER">column_name</replaceable>, <replaceable class="PARAMETER">column_name</replaceable> [, ...])
+CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_name</replaceable>
+ WITH ( <replaceable class="PARAMETER">option</replaceable> [= <replaceable class="PARAMETER">value</replaceable>] [, ... ] )
+ ON ( <replaceable class="PARAMETER">column_name</replaceable>, <replaceable class="PARAMETER">column_name</replaceable> [, ...])
  FROM <replaceable class="PARAMETER">table_name</replaceable>
 </synopsis>
 
@@ -94,6 +95,41 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
 
  </variablelist>
 
+ <refsect2 id="SQL-CREATESTATISTICS-parameters">
+ <title id="SQL-CREATESTATISTICS-parameters-title">Parameters</title>
+
+ <indexterm zone="sql-createstatistics-parameters">
+ <primary>statistics parameters</primary>
+ </indexterm>
+
+ <para>
+ The <literal>WITH</> clause can specify <firstterm>options</>
+ for the statistics. Available options are listed below.
+ </para>
+
+ <variablelist>
+
+ <varlistentry>
+ <term><literal>dependencies</> (<type>boolean</>)</term>
+ <listitem>
+ <para>
+ Enables functional dependencies for the statistics.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>ndistinct</> (<type>boolean</>)</term>
+ <listitem>
+ <para>
+ Enables ndistinct coefficients for the statistics.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ </variablelist>
+
+ </refsect2>
  </refsect1>
 
  <refsect1>
@@ -122,7 +158,7 @@ CREATE TABLE t1 (
 INSERT INTO t1 SELECT i/100, i/500
  FROM generate_series(1,1000000) s(i);
 
-CREATE STATISTICS s1 ON (a, b) FROM t1;
+CREATE STATISTICS s1 WITH (dependencies) ON (a, b) FROM t1;
 
 ANALYZE t1;
 

diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
@@ -192,7 +192,8 @@ CREATE VIEW pg_stats_ext AS
  C.relname AS tablename,
  S.staname AS staname,
  S.stakeys AS attnums,
- length(s.standistinct) AS ndistbytes
+ length(s.standistinct::bytea) AS ndistbytes,
+ length(S.stadependencies::bytea) AS depsbytes
  FROM (pg_statistic_ext S JOIN pg_class C ON (C.oid = S.starelid))
  LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace);
 

diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c
@@ -62,10 +62,11 @@ CreateStatistics(CreateStatsStmt *stmt)
  Oid relid;
  ObjectAddress parentobject,
  childobject;
- Datum types[1]; /* only ndistinct defined now */
+ Datum types[2]; /* one for each possible type of statistics */
  int ntypes;
  ArrayType *staenabled;
  bool build_ndistinct;
+ bool build_dependencies;
  bool requested_type = false;
 
  Assert(IsA(stmt, CreateStatsStmt));
@@ -159,7 +160,7 @@ CreateStatistics(CreateStatsStmt *stmt)
  errmsg("statistics require at least 2 columns")));
 
  /*
- * Sort the attnums, which makes detecting duplicies somewhat easier, and
+ * Sort the attnums, which makes detecting duplicities somewhat easier, and
  * it does not hurt (it does not affect the efficiency, unlike for
  * indexes, for example).
  */
@@ -182,6 +183,7 @@ CreateStatistics(CreateStatsStmt *stmt)
  * recognized.
  */
  build_ndistinct = false;
+ build_dependencies = false;
  foreach(l, stmt->options)
  {
  DefElem *opt = (DefElem *) lfirst(l);
@@ -191,6 +193,11 @@ CreateStatistics(CreateStatsStmt *stmt)
  build_ndistinct = defGetBoolean(opt);
  requested_type = true;
  }
+ else if (strcmp(opt->defname, "dependencies") == 0)
+ {
+ build_dependencies = defGetBoolean(opt);
+ requested_type = true;
+ }
  else
  ereport(ERROR,
  (errcode(ERRCODE_SYNTAX_ERROR),
@@ -199,12 +206,17 @@ CreateStatistics(CreateStatsStmt *stmt)
  }
  /* If no statistic type was specified, build them all. */
  if (!requested_type)
+ {
  build_ndistinct = true;
+ build_dependencies = true;
+ }
 
  /* construct the char array of enabled statistic types */
  ntypes = 0;
  if (build_ndistinct)
  types[ntypes++] = CharGetDatum(STATS_EXT_NDISTINCT);
+ if (build_dependencies)
+ types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES);
  Assert(ntypes > 0);
  staenabled = construct_array(types, ntypes, CHAROID, 1, true, 'c');
 
@@ -222,6 +234,7 @@ CreateStatistics(CreateStatsStmt *stmt)
 
  /* no statistics build yet */
  nulls[Anum_pg_statistic_ext_standistinct - 1] = true;
+ nulls[Anum_pg_statistic_ext_stadependencies - 1] = true;
 
  /* insert it into pg_statistic_ext */
  statrel = heap_open(StatisticExtRelationId, RowExclusiveLock);