From e093dcdd2853911ca1ad710581182dfcb6c78ea3 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Fri, 25 Aug 2006 04:06:58 +0000
Subject: Add the ability to create indexes 'concurrently', that is, without
 blocking concurrent writes to the table.  Greg Stark, with a little help from
 Tom Lane.

---
 doc/src/sgml/catalogs.sgml         |  13 ++++-
 doc/src/sgml/indexam.sgml          |  12 ++++-
 doc/src/sgml/indices.sgml          |  13 ++++-
 doc/src/sgml/mvcc.sgml             |   8 +--
 doc/src/sgml/ref/create_index.sgml | 104 +++++++++++++++++++++++++++++++++++--
 doc/src/sgml/ref/reindex.sgml      |  16 +++++-
 6 files changed, 155 insertions(+), 11 deletions(-)

(limited to 'doc/src')
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 80ed7d829bc..20604d73e48 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.129 2006/07/31 20:08:55 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.130 2006/08/25 04:06:44 tgl Exp $ -->
 <!--
  Documentation of the system catalogs, directed toward PostgreSQL developers
  -->
@@ -2420,6 +2420,17 @@
       <entry>If true, the table was last clustered on this index</entry>
      </row>
 
+     <row>
+      <entry><structfield>indisvalid</structfield></entry>
+      <entry><type>bool</type></entry>
+      <entry></entry>
+      <entry>If true, the index is currently valid for queries.
+       False means the index is possibly incomplete: it must still be
+       inserted into by INSERT/UPDATE operations, but it cannot safely be
+       used for queries, and if it is unique, the uniqueness shouldn't be
+       relied on either.</entry>
+     </row>
+
      <row>
       <entry><structfield>indkey</structfield></entry>
       <entry><type>int2vector</type></entry>
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index 1afa120766e..10ab84278d0 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/indexam.sgml,v 2.16 2006/07/31 20:08:59 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/indexam.sgml,v 2.17 2006/08/25 04:06:44 tgl Exp $ -->
 
 <chapter id="indexam">
  <title>Index Access Method Interface Definition</title>
@@ -648,6 +648,16 @@ amrestrpos (IndexScanDesc scan);
     </itemizedlist>
   </para>
 
+  <para>
+   Furthermore, immediately before raising a uniqueness violation
+   according to the above rules, the access method must recheck the
+   liveness of the row being inserted.  If it is committed dead then
+   no error should be raised.  (This case cannot occur during the
+   ordinary scenario of inserting a row that's just been created by
+   the current transaction.  It can happen during
+   <command>CREATE UNIQUE INDEX CONCURRENTLY</>, however.) 
+  </para>
+
   <para>
    We require the index access method to apply these tests itself, which
    means that it must reach into the heap to check the commit status of
diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml
index 10669c01557..223bb81cfc8 100644
--- a/doc/src/sgml/indices.sgml
+++ b/doc/src/sgml/indices.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/indices.sgml,v 1.57 2006/05/24 11:01:39 teodor Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/indices.sgml,v 1.58 2006/08/25 04:06:44 tgl Exp $ -->
 
 <chapter id="indexes">
  <title id="indexes-title">Indexes</title>
@@ -90,6 +90,17 @@ CREATE INDEX test1_id_index ON test1 (id);
    significantly speed up queries with joins.
   </para>
 
+  <para>
+   Creating an index on a large table can take a long time.  By default,
+   <productname>PostgreSQL</productname> allows reads (selects) to occur
+   on the table in parallel with index creation, but writes (inserts,
+   updates, deletes) are blocked until the index build is finished.
+   It is possible to allow writes to occur in parallel with index
+   creation, but there are several caveats to be aware of &mdash;
+   for more information see <xref linkend="SQL-CREATEINDEX-CONCURRENTLY"
+   endterm="SQL-CREATEINDEX-CONCURRENTLY-title">.
+  </para>
+
   <para>
    After an index is created, the system has to keep it synchronized with the
    table.  This adds overhead to data manipulation operations.
diff --git a/doc/src/sgml/mvcc.sgml b/doc/src/sgml/mvcc.sgml
index 8ebb8205192..1a1e95db501 100644
--- a/doc/src/sgml/mvcc.sgml
+++ b/doc/src/sgml/mvcc.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/mvcc.sgml,v 2.56 2006/04/23 03:39:52 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/mvcc.sgml,v 2.57 2006/08/25 04:06:45 tgl Exp $ -->
 
  <chapter id="mvcc">
   <title>Concurrency Control</title>
@@ -622,7 +622,8 @@ SELECT SUM(value) FROM mytab WHERE class = 2;
 	</para>
 
 	<para>
-	 Acquired by <command>VACUUM</command> (without <option>FULL</option>).
+	 Acquired by <command>VACUUM</command> (without <option>FULL</option>)
+	 and by <command>CREATE INDEX CONCURRENTLY</>.
 	</para>
        </listitem>
       </varlistentry>
@@ -641,7 +642,8 @@ SELECT SUM(value) FROM mytab WHERE class = 2;
 	</para>
 
 	<para>
-	 Acquired by <command>CREATE INDEX</command>.
+	 Acquired by <command>CREATE INDEX</command>
+	 (without <option>CONCURRENTLY</option>).
 	</para>
        </listitem>
       </varlistentry>
diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml
index 20a89713bf4..be0ca63f2c5 100644
--- a/doc/src/sgml/ref/create_index.sgml
+++ b/doc/src/sgml/ref/create_index.sgml
@@ -1,5 +1,5 @@
 <!--
-$PostgreSQL: pgsql/doc/src/sgml/ref/create_index.sgml,v 1.55 2006/07/11 21:05:57 tgl Exp $
+$PostgreSQL: pgsql/doc/src/sgml/ref/create_index.sgml,v 1.56 2006/08/25 04:06:45 tgl Exp $
 PostgreSQL documentation
 -->
 
@@ -20,7 +20,7 @@ PostgreSQL documentation
 
  <refsynopsisdiv>
 <synopsis>
-CREATE [ UNIQUE ] INDEX <replaceable class="parameter">name</replaceable> ON <replaceable class="parameter">table</replaceable> [ USING <replaceable class="parameter">method</replaceable> ]
+CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] <replaceable class="parameter">name</replaceable> ON <replaceable class="parameter">table</replaceable> [ USING <replaceable class="parameter">method</replaceable> ]
     ( { <replaceable class="parameter">column</replaceable> | ( <replaceable class="parameter">expression</replaceable> ) } [ <replaceable class="parameter">opclass</replaceable> ] [, ...] )
     [ WITH ( <replaceable class="PARAMETER">storage_parameter</replaceable> = <replaceable class="PARAMETER">value</replaceable> [, ... ] ) ]
     [ TABLESPACE <replaceable class="parameter">tablespace</replaceable> ]
@@ -110,6 +110,21 @@ CREATE [ UNIQUE ] INDEX <replaceable class="parameter">name</replaceable> ON <re
       </listitem>
      </varlistentry>
 
+     <varlistentry>
+      <term><literal>CONCURRENTLY</literal></term>
+      <listitem>
+       <para>
+        When this option is used, <productname>PostgreSQL</> will build the
+        index without taking any locks that prevent concurrent inserts,
+        updates, or deletes on the table; whereas a standard index build
+        locks out writes (but not reads) on the table until it's done.
+        There are several caveats to be aware of when using this option
+        &mdash; see <xref linkend="SQL-CREATEINDEX-CONCURRENTLY"
+        endterm="SQL-CREATEINDEX-CONCURRENTLY-title">.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry>
       <term><replaceable class="parameter">name</replaceable></term>
       <listitem>
@@ -239,6 +254,82 @@ CREATE [ UNIQUE ] INDEX <replaceable class="parameter">name</replaceable> ON <re
    </variablelist>
 
   </refsect2>
+
+  <refsect2 id="SQL-CREATEINDEX-CONCURRENTLY">
+   <title id="SQL-CREATEINDEX-CONCURRENTLY-title">Building Indexes Concurrently</title>
+
+   <indexterm zone="SQL-CREATEINDEX-CONCURRENTLY">
+   <primary>index</primary>
+   <secondary>building concurrently</secondary>
+   </indexterm>
+
+   <para>
+    Creating an index for a large table can be a long operation. In large data
+    warehousing applications it can easily take hours or even days to build
+    indexes. It's important to understand the impact creating indexes has on a
+    system.
+   </para>
+
+   <para>
+    Normally <productname>PostgreSQL</> locks the table to be indexed against
+    writes and performs the entire index build with a single scan of the
+    table. Other transactions can still read the table, but if they try to
+    insert, update, or delete rows in the table they will block until the
+    index build is finished.
+   </para>
+
+   <para>
+    <productname>PostgreSQL</> also supports building indexes without locking
+    out writes.  This method is invoked by specifying the
+    <literal>CONCURRENTLY</> option of <command>CREATE INDEX</>.
+    When this option is used,
+    <productname>PostgreSQL</> must perform two scans of the table, and in
+    addition it must wait for all existing transactions to terminate.  Thus
+    this method requires more total work than a standard index build and takes
+    significantly longer to complete.  However, since it allows normal
+    operations to continue while the index is built, this method is useful for
+    adding new indexes in a production environment.  Of course, the extra CPU
+    and I/O load imposed by the index creation may slow other operations.
+   </para>
+
+   <para>
+    If a problem arises during the second scan of the table, such as a
+    uniqueness violation in a unique index, the <command>CREATE INDEX</>
+    command will fail but leave behind an <quote>invalid</> index. This index
+    will be ignored for querying purposes because it may be incomplete;
+    however it will still consume update overhead.  The recommended recovery
+    method in such cases is to drop the index and try again to perform
+    <command>CREATE INDEX CONCURRENTLY</>.  (Another possibility is to rebuild
+    the index with <command>REINDEX</>.  However, since <command>REINDEX</>
+    does not support concurrent builds, this option is unlikely to seem
+    attractive.)
+   </para>
+
+   <para>
+    Another caveat when building a unique index concurrently is that the
+    uniqueness constraint is already being enforced against other transactions
+    when the second table scan begins.  This means that constraint violations
+    could be reported in other queries prior to the index becoming available
+    for use, or even in cases where the index build eventually fails.  Also,
+    if a failure does occur in the second scan, the <quote>invalid</> index
+    continues to enforce its uniqueness constraint afterwards.
+   </para>
+
+   <para>
+    Concurrent builds of expression indexes and partial indexes are supported.
+    Errors occurring in the evaluation of these expressions could cause
+    behavior similar to that described above for unique constraint violations.
+   </para>
+
+   <para>
+    Regular index builds permit other regular index builds on the
+    same table to occur in parallel, but only one concurrent index build
+    can occur on a table at a time.  In both cases, no other types of schema
+    modification on the table are allowed meanwhile.  Another difference
+    is that a regular <command>CREATE INDEX</> command can be performed within
+    a transaction block, but <command>CREATE INDEX CONCURRENTLY</> cannot.
+   </para>
+  </refsect2>
  </refsect1>
 
  <refsect1>
@@ -339,15 +430,22 @@ Is this example correct?
    To create a GiST index on a point attribute so that we
    can efficiently use box operators on the result of the
    conversion function:
-  </para>
   <programlisting>
 CREATE INDEX pointloc
     ON points USING GIST (point2box(location) box_ops);
 SELECT * FROM points
     WHERE point2box(points.pointloc) = boxes.box;
   </programlisting>
+  </para>
 -->
 
+  <para>
+   To create an index without locking out writes to the table:
+<programlisting>
+CREATE INDEX CONCURRENTLY sales_quantity_index ON sales_table (quantity);
+</programlisting>
+  </para>
+
  </refsect1>
 
  <refsect1>
diff --git a/doc/src/sgml/ref/reindex.sgml b/doc/src/sgml/ref/reindex.sgml
index dbe10ca7629..9502a0daf57 100644
--- a/doc/src/sgml/ref/reindex.sgml
+++ b/doc/src/sgml/ref/reindex.sgml
@@ -1,5 +1,5 @@
 <!--
-$PostgreSQL: pgsql/doc/src/sgml/ref/reindex.sgml,v 1.31 2006/07/04 18:07:24 tgl Exp $
+$PostgreSQL: pgsql/doc/src/sgml/ref/reindex.sgml,v 1.32 2006/08/25 04:06:45 tgl Exp $
 PostgreSQL documentation
 -->
 
@@ -30,7 +30,7 @@ REINDEX { INDEX | TABLE | DATABASE | SYSTEM } <replaceable class="PARAMETER">nam
   <para>
    <command>REINDEX</command> rebuilds an index using the data
    stored in the index's table, replacing the old copy of the index. There are
-   three main reasons to use <command>REINDEX</command>:
+   several scenarios in which to use <command>REINDEX</command>:
 
    <itemizedlist>
     <listitem>
@@ -61,6 +61,18 @@ REINDEX { INDEX | TABLE | DATABASE | SYSTEM } <replaceable class="PARAMETER">nam
       for an index, and wish to ensure that the change has taken full effect.
      </para>
     </listitem>
+
+    <listitem>
+     <para>
+      An index build with the <literal>CONCURRENTLY</> option failed, leaving
+      an <quote>invalid</> index. Such indexes are useless but it can be
+      convenient to use <command>REINDEX</> to rebuild them. Note that
+      <command>REINDEX</> will not perform a concurrent build. To build the
+      index without interfering with production you should drop the index and
+      reissue the <command>CREATE INDEX CONCURRENTLY</> command.
+     </para>
+    </listitem>
+
    </itemizedlist>
   </para>
  </refsect1>
-- 
cgit v1.2.3