[GEOT-6900] Shapefile quadtree build performance improvements (geotoo…

…ls#3528) * [GEOT-6900] ShapeFileIndexerStressTest New `ShapeFileIndexerStressTest` to assess the performance of `ShapeFileIndexer.index()` over different shapefile sizes. * [GEOT-6900] Shapefile quadtree build performance improvements Use a strategy object (`BoundsReader`) to assist `ShapeFileIndexer` in speeding up the `QuadTree` optimization phase, providing quick access to each shapefile record envelope and potentially avoiding an immense number of random disk I/O calls through `ShapefileReader` as the quadtree's internal nodes get split/shrunk. Since the `QuadTree` leaf nodes hold only the shapefile record ids, and not their bounds, the tree layout optimization phase may incur too many random disk reads on the `.shp` file. The impact grows with the size of the shapefile, and depends more on the size of the geometries than on the number of records itself. The `BoundsReader` strategy object is meant to avoid that to the extent possible. Up to a certain point, record bounds will be stored in heap memory (up to 1MiB, accounting for 32K records, or 64K records if it's a points shapefile). For a bigger number of shapefile records, the strategy is to store the bounds in a temporary file (named `GeoTools_shp_qix_bounds_<random number>.tmp` under `${java.io.tmpdir}`), which is memory-mapped and deleted at `BoundsReader.close()`. This leverages the operating system's native paging and, due to the reduced size of the bounds file compared to the actual `.shp` and to avoiding the parsing performed by `ShapefileReader.nextRecord()`, results in dramatically less random I/O and computation. Note, however, that if there's not enough temporary space in the file system where the `java.io.tmpdir` directory resides, a fallback strategy that reads directly from the `ShapefileReader` will be used. This should be a rare edge case though, since with a bounds record size of 32 bytes, the required temporary storage is about 30.5MiB per million features.
vickdw · Jun 7, 2021 · fa66664 · fa66664
1 parent 82b3054
commit fa66664
Show file tree

Hide file tree

Showing 4 changed files with 599 additions and 79 deletions.
diff --git a/modules/plugin/shapefile/src/main/java/org/geotools/data/shapefile/ShapeFileIndexer.java b/modules/plugin/shapefile/src/main/java/org/geotools/data/shapefile/ShapeFileIndexer.java
@@ -2,7 +2,7 @@
  * GeoTools - The Open Source Java GIS Toolkit
  * http://geotools.org
  *
- * (C) 2002-2008, Open Source Geospatial Foundation (OSGeo)
+ * (C) 2002-2021, Open Source Geospatial Foundation (OSGeo)
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -16,6 +16,8 @@
  */
 package org.geotools.data.shapefile;
 
+import static org.geotools.data.shapefile.ShapefileIndexerBoundsHelper.createBoundsReader;
+
 import java.io.File;
 import java.io.IOException;
 import java.net.MalformedURLException;
@@ -26,6 +28,7 @@
 import java.util.Map;
 import java.util.logging.Level;
 import java.util.logging.Logger;
+import org.geotools.data.shapefile.ShapefileIndexerBoundsHelper.BoundsReader;
 import org.geotools.data.shapefile.files.FileWriter;
 import org.geotools.data.shapefile.files.ShpFileType;
 import org.geotools.data.shapefile.files.ShpFiles;
@@ -56,7 +59,7 @@
 class ShapeFileIndexer implements FileWriter {
  private static final Logger LOGGER = Logging.getLogger(ShapeFileIndexer.class);
 
- private int max = -1;
+ private int maxDepth = -1;
  private int leafSize = 16;
 
  private String byteOrder;
@@ -148,25 +151,8 @@ public int index(boolean verbose, ProgressListener listener)
  // Temporary file for building...
  StorageFile storage = shpFiles.getStorageFile(ShpFileType.QIX);
  File treeFile = storage.getFile();
-
- if (max == -1) {
- try (ShapefileReader reader =
- new ShapefileReader(shpFiles, true, false, new GeometryFactory())) {
-
- // compute a reasonable index max depth, considering a fully developed
- // 10 levels one already contains 200k index nodes, good for indexing up
- // to 3M features without consuming too much memory
- int features = reader.getCount(0);
- max = 1;
- int nodes = 1;
- while (nodes * leafSize < features) {
- max++;
- nodes *= 4;
- }
- if (max < 10) {
- max = 10;
- }
- }
+ if (maxDepth == -1) {
+ maxDepth = computeMaxDepth();
  }
 
  try (ShapefileReader reader =
@@ -181,69 +167,106 @@ public int index(boolean verbose, ProgressListener listener)
  return cnt;
  }
 
+ /**
+ * Computes a reasonable maximum depth for the quadtree index from the feature count.
+ *
+ * <p>Starting at depth 1 with a single node, the depth is increased (quadrupling the node
+ * count each time) until {@code nodes * leafSize} is no longer smaller than the count
+ * returned by {@code reader.getCount(0)}. The result is never lower than 10: a fully
+ * developed 10 levels tree already contains 200k index nodes, good for indexing up to 3M
+ * features without consuming too much memory.
+ *
+ * <p>A dedicated {@code ShapefileReader} is opened for the computation and closed (by the
+ * try-with-resources statement) before this method returns.
+ *
+ * <p>NOTE(review): {@code nodes * leafSize} is plain {@code int} arithmetic; confirm it
+ * cannot overflow for the largest feature counts and leaf sizes expected here.
+ *
+ * @return the computed depth, at least 10
+ * @throws IOException declared by this method; presumably raised when the shapefile cannot
+ *     be opened or read (confirm against ShapefileReader)
+ */
+ private int computeMaxDepth() throws IOException {
+ int maxDepth; // local result; shadows the maxDepth field of this class
+ try (ShapefileReader reader =
+ new ShapefileReader(shpFiles, true, false, new GeometryFactory())) {
+ int features = reader.getCount(0); // presumably the total record count; confirm getCount(0) semantics
+ maxDepth = 1;
+ int nodes = 1; // node count tracked alongside the depth; multiplied by 4 per extra level
+ while (nodes * leafSize < features) {
+ maxDepth++;
+ nodes *= 4;
+ }
+ if (maxDepth < 10) { // enforce the lower bound of 10 levels
+ maxDepth = 10;
+ }
+ return maxDepth;
+ }
+ }
+
  private int buildQuadTree(ShapefileReader reader, File file, boolean verbose)
  throws IOException, StoreException {
  LOGGER.fine(
  "Building quadtree spatial index with depth "
- + max
+ + maxDepth
  + " for file "
  + file.getAbsolutePath());
 
- byte order = 0;
-
- if ((this.byteOrder == null) || this.byteOrder.equalsIgnoreCase("NM")) {
- order = IndexHeader.NEW_MSB_ORDER;
- } else if (this.byteOrder.equalsIgnoreCase("NL")) {
- order = IndexHeader.NEW_LSB_ORDER;
- } else {
- throw new StoreException(
- "Asked byte order '" + this.byteOrder + "' must be 'NL' or 'NM'!");
- }
+ final byte fileByteOrder = resolveStorageByteOrder();
 
- IndexFile shpIndex = new IndexFile(shpFiles, false);
  int cnt = 0;
- int numRecs = shpIndex.getRecordCount();
- ShapefileHeader header = reader.getHeader();
- Envelope bounds = new Envelope(header.minX(), header.maxX(), header.minY(), header.maxY());
-
- try (QuadTree tree = new QuadTree(numRecs, max, bounds, shpIndex)) {
- Record rec = null;
 
+ try (IndexFile shpIndex = new IndexFile(shpFiles, false);
+ // strategy to speed up optimizeTree()
+ BoundsReader boundsHelper = createBoundsReader(reader, shpIndex);
+ QuadTree tree =
+ new QuadTree(
+ shpIndex.getRecordCount(), maxDepth, getBounds(reader), shpIndex)) {
+ Envelope env = new Envelope();
  while (reader.hasNext()) {
- rec = reader.nextRecord();
- tree.insert(cnt++, new Envelope(rec.minX, rec.maxX, rec.minY, rec.maxY));
+ Record rec = reader.nextRecord();
+ env.init(rec.minX, rec.maxX, rec.minY, rec.maxY);
+ int recno = cnt++;
+ tree.insert(recno, env);
+ boundsHelper.insert(recno, env);
 
- if (verbose && ((cnt % 1000) == 0)) {
+ if (verbose && ((cnt % 1_000) == 0)) {
  System.out.print('.');
  }
- if (cnt % 100000 == 0) System.out.print('\n');
+ if (cnt % 100_000 == 0) System.out.print('\n');
  }
- if (verbose) System.out.println("done");
- FileSystemIndexStore store = new FileSystemIndexStore(file, order);
+ if (verbose) System.out.println("done building quadtree");
 
- if (leafSize > 0) {
- if (LOGGER.isLoggable(Level.FINE)) {
- LOGGER.fine("Optimizing the tree (this might take some time)");
- }
- optimizeTree(tree, tree.getRoot(), 0, reader, shpIndex);
- if (LOGGER.isLoggable(Level.FINE)) {
- LOGGER.fine("Tree optimized");
- }
+ if (LOGGER.isLoggable(Level.FINE)) {
+ LOGGER.fine("Optimizing the tree (this might take some time)");
+ }
+ if (verbose) System.out.println("Optimizing the tree (this might take some time)");
+ optimizeTree(tree, tree.getRoot(), 0, boundsHelper);
+ if (LOGGER.isLoggable(Level.FINE)) {
+ LOGGER.fine("Tree optimized");
  }
 
  if (LOGGER.isLoggable(Level.FINE)) {
  printStats(tree);
  }
+ if (verbose) System.out.println("Storing the tree...");
+ FileSystemIndexStore store = new FileSystemIndexStore(file, fileByteOrder);
  store.store(tree);
+ if (verbose) System.out.println("done");
  }
  return cnt;
  }
 
- private Node optimizeTree(
- QuadTree tree, Node node, int level, ShapefileReader reader, IndexFile index)
+ private Envelope getBounds(ShapefileReader reader) { // builds an Envelope from the min/max X/Y values of the reader's header
+ ShapefileHeader header = reader.getHeader();
+ Envelope bounds = new Envelope(header.minX(), header.maxX(), header.minY(), header.maxY()); // argument order: minX, maxX, minY, maxY
+ return bounds;
+ }
+
+ private byte resolveStorageByteOrder() throws StoreException { // maps the byteOrder string field to an IndexHeader byte-order constant
+ if ((this.byteOrder == null) || this.byteOrder.equalsIgnoreCase("NM")) { // an unset byteOrder is treated like "NM"; comparison is case-insensitive
+ return IndexHeader.NEW_MSB_ORDER;
+ }
+ if (this.byteOrder.equalsIgnoreCase("NL")) {
+ return IndexHeader.NEW_LSB_ORDER;
+ }
+ throw new StoreException("Asked byte order '" + this.byteOrder + "' must be 'NL' or 'NM'!"); // any other value is rejected
+ }
+
+ private Node optimizeTree(QuadTree tree, Node node, int level, BoundsReader reader)
  throws StoreException, IOException {
  // recurse, with a check to avoid too deep recursion due to odd data that has a
- if (node.getNumShapeIds() > leafSize && node.getNumSubNodes() == 0 && level < max * 2) {
+ final boolean isLeafNode = node.getNumSubNodes() == 0;
+ final boolean isOverFlown = node.getNumShapeIds() > leafSize;
+ final int hardMaxDepth = maxDepth * 2;
+ final boolean canBeSplit = level < hardMaxDepth;
+ if (isLeafNode && isOverFlown && canBeSplit) {
  // ok, we need to split this baby further
  int[] shapeIds = node.getShapesId();
  int numShapesId = node.getNumShapeIds();
@@ -257,13 +280,11 @@ private Node optimizeTree(
  nodes *= 4;
  }
 
+ Envelope env = new Envelope();
  for (int i = 0; i < numShapesId; i++) {
- final int shapeId = shapeIds[i];
- int offset = index.getOffsetInBytes(shapeId);
- reader.goTo(offset);
- Record rec = reader.nextRecord();
- Envelope env = new Envelope(rec.minX, rec.maxX, rec.minY, rec.maxY);
- tree.insert(node, shapeId, env, extraLevels);
+ final int recNumber = shapeIds[i];
+ reader.read(recNumber, env);
+ tree.insert(node, recNumber, env, extraLevels);
  }
  }
 
@@ -272,7 +293,7 @@ private Node optimizeTree(
 
  // recurse
  for (int i = 0; i < node.getNumSubNodes(); i++) {
- optimizeTree(tree, node.getSubNode(i), level + 1, reader, index);
+ optimizeTree(tree, node.getSubNode(i), level + 1, reader);
  }
 
  // prune empty subnodes
@@ -301,12 +322,8 @@ private Node optimizeTree(
  Envelope bounds = new Envelope();
  if (node.getNumShapeIds() > 0) {
  int[] shapeIds = node.getShapesId();
- for (final int shapeId : shapeIds) {
- int offset = index.getOffsetInBytes(shapeId);
- reader.goTo(offset);
- Record rec = reader.nextRecord();
- Envelope env = new Envelope(rec.minX, rec.maxX, rec.minY, rec.maxY);
- bounds.expandToInclude(env);
+ for (final int recNumber : shapeIds) {
+ reader.expandEnvelope(recNumber, bounds);
  }
  }
  if (node.getNumSubNodes() > 0) {
@@ -369,7 +386,7 @@ void gatherStats(Node node, Map<Integer, Integer> stats) throws StoreException {
 
  /** For quad tree this is the max depth. I don't know what it is for RTree */
  public void setMax(int i) {
- max = i;
+ maxDepth = i;
  }
 
  /** @param shpFiles */
@@ -392,6 +409,9 @@ public int getLeafSize() {
  }
 
 public void setLeafSize(int leafSize) { // sets the leafSize field after validating the argument
+ if (leafSize < 1) { // zero and negative values are rejected
+ throw new IllegalArgumentException("Maximum node leaf size must be a positive integer");
+ }
 this.leafSize = leafSize;
 }
 }