Skip to content

Commit

Permalink
[GEOT-6900] Shapefile quadtree build performance improvements (geotoo…
Browse files Browse the repository at this point in the history
…ls#3528)

* [GEOT-6900] ShapeFileIndexerStressTest

New `ShapeFileIndexerStressTest` to assess the performance of
`ShapeFileIndexer.index()` over different shapefile sizes.

* [GEOT-6900] Shapefile quadtree build performance improvements

Use a strategy object (`BoundsReader`) to assist `ShapeFileIndexer` in speeding up the
`QuadTree` optimization phase, providing quick access to each shapefile
record envelope, potentially avoiding an immense amount of random disk I/O calls through
`ShapefileReader`, as the quad tree internal nodes get split/shrunk.

Since the `QuadTree` leaf nodes hold only the shapefile record ids, and not their
bounds, the tree layout optimization phase may incur too many random disk reads on the
`.shp` file. The impact grows with the size of the shapefile, and depends more on the size of the
geometries than on the number of records itself.

The `BoundsReader` strategy object is meant to avoid that to the extent possible.

Up to a given point, record bounds will be stored in heap memory (up to 1MiB, accounting for 32K
records, or 64K records if it's a points shapefile).

For a bigger number of shapefile records, the strategy is
 to store the bounds in a temporary file (named
 `GeoTools_shp_qix_bounds_<random number>.tmp`  under `${java.io.tmpdir}`),
 which is memory mapped and deleted at `BoundsReader.close()`. This leverages the Operating
 System's native paging, and due to the reduced size of the bounds file compared to the actual
 `.shp` and avoiding the parsing performed by `ShapefileReader.nextRecord()`, results in
 dramatically less random I/O and computing.

Note, however, that if there's not enough temporary space in the file system where the
`java.io.tmpdir` directory resides, a fallback strategy that reads directly from the
`ShapefileReader` will be used. This should be a rare edge case though, since with a bounds record
size of 32 bytes, the required temporary storage is about 30.5MiB per million features.
  • Loading branch information
groldan committed Jun 7, 2021
1 parent 82b3054 commit fa66664
Show file tree
Hide file tree
Showing 4 changed files with 599 additions and 79 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* GeoTools - The Open Source Java GIS Toolkit
* http://geotools.org
*
* (C) 2002-2008, Open Source Geospatial Foundation (OSGeo)
* (C) 2002-2021, Open Source Geospatial Foundation (OSGeo)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
Expand All @@ -16,6 +16,8 @@
*/
package org.geotools.data.shapefile;

import static org.geotools.data.shapefile.ShapefileIndexerBoundsHelper.createBoundsReader;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
Expand All @@ -26,6 +28,7 @@
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.geotools.data.shapefile.ShapefileIndexerBoundsHelper.BoundsReader;
import org.geotools.data.shapefile.files.FileWriter;
import org.geotools.data.shapefile.files.ShpFileType;
import org.geotools.data.shapefile.files.ShpFiles;
Expand Down Expand Up @@ -56,7 +59,7 @@
class ShapeFileIndexer implements FileWriter {
private static final Logger LOGGER = Logging.getLogger(ShapeFileIndexer.class);

private int max = -1;
private int maxDepth = -1;
private int leafSize = 16;

private String byteOrder;
Expand Down Expand Up @@ -148,25 +151,8 @@ public int index(boolean verbose, ProgressListener listener)
// Temporary file for building...
StorageFile storage = shpFiles.getStorageFile(ShpFileType.QIX);
File treeFile = storage.getFile();

if (max == -1) {
try (ShapefileReader reader =
new ShapefileReader(shpFiles, true, false, new GeometryFactory())) {

// compute a reasonable index max depth, considering a fully developed
// 10 levels one already contains 200k index nodes, good for indexing up
// to 3M features without consuming too much memory
int features = reader.getCount(0);
max = 1;
int nodes = 1;
while (nodes * leafSize < features) {
max++;
nodes *= 4;
}
if (max < 10) {
max = 10;
}
}
if (maxDepth == -1) {
maxDepth = computeMaxDepth();
}

try (ShapefileReader reader =
Expand All @@ -181,69 +167,106 @@ public int index(boolean verbose, ProgressListener listener)
return cnt;
}

/**
 * Computes a reasonable maximum depth for the quadtree index from the number of records in the
 * shapefile.
 *
 * <p>Starting with a single node at depth 1, the depth is increased one level at a time (each
 * level quadrupling the node count) until {@code nodes * leafSize} is no longer smaller than the
 * feature count. The result is never lower than 10, considering that a fully developed 10
 * levels tree already contains about 200k index nodes, good for indexing up to 3M features
 * without consuming too much memory.
 *
 * @return the computed maximum depth, always at least 10
 * @throws IOException if opening, querying or closing the shapefile reader fails
 */
private int computeMaxDepth() throws IOException {
    final int features;
    try (ShapefileReader reader =
            new ShapefileReader(shpFiles, true, false, new GeometryFactory())) {
        features = reader.getCount(0);
    }

    // Use long arithmetic: with int operands, nodes * leafSize wraps around to 0 for very
    // large feature counts, nodes itself then wraps to 0, and the loop never terminates.
    int depth = 1;
    long nodes = 1;
    while (nodes * leafSize < features) {
        depth++;
        nodes *= 4;
    }
    return Math.max(depth, 10);
}

private int buildQuadTree(ShapefileReader reader, File file, boolean verbose)
throws IOException, StoreException {
LOGGER.fine(
"Building quadtree spatial index with depth "
+ max
+ maxDepth
+ " for file "
+ file.getAbsolutePath());

byte order = 0;

if ((this.byteOrder == null) || this.byteOrder.equalsIgnoreCase("NM")) {
order = IndexHeader.NEW_MSB_ORDER;
} else if (this.byteOrder.equalsIgnoreCase("NL")) {
order = IndexHeader.NEW_LSB_ORDER;
} else {
throw new StoreException(
"Asked byte order '" + this.byteOrder + "' must be 'NL' or 'NM'!");
}
final byte fileByteOrder = resolveStorageByteOrder();

IndexFile shpIndex = new IndexFile(shpFiles, false);
int cnt = 0;
int numRecs = shpIndex.getRecordCount();
ShapefileHeader header = reader.getHeader();
Envelope bounds = new Envelope(header.minX(), header.maxX(), header.minY(), header.maxY());

try (QuadTree tree = new QuadTree(numRecs, max, bounds, shpIndex)) {
Record rec = null;

try (IndexFile shpIndex = new IndexFile(shpFiles, false);
// strategy to speed up optimizeTree()
BoundsReader boundsHelper = createBoundsReader(reader, shpIndex);
QuadTree tree =
new QuadTree(
shpIndex.getRecordCount(), maxDepth, getBounds(reader), shpIndex)) {
Envelope env = new Envelope();
while (reader.hasNext()) {
rec = reader.nextRecord();
tree.insert(cnt++, new Envelope(rec.minX, rec.maxX, rec.minY, rec.maxY));
Record rec = reader.nextRecord();
env.init(rec.minX, rec.maxX, rec.minY, rec.maxY);
int recno = cnt++;
tree.insert(recno, env);
boundsHelper.insert(recno, env);

if (verbose && ((cnt % 1000) == 0)) {
if (verbose && ((cnt % 1_000) == 0)) {
System.out.print('.');
}
if (cnt % 100000 == 0) System.out.print('\n');
if (cnt % 100_000 == 0) System.out.print('\n');
}
if (verbose) System.out.println("done");
FileSystemIndexStore store = new FileSystemIndexStore(file, order);
if (verbose) System.out.println("done building quadtree");

if (leafSize > 0) {
if (LOGGER.isLoggable(Level.FINE)) {
LOGGER.fine("Optimizing the tree (this might take some time)");
}
optimizeTree(tree, tree.getRoot(), 0, reader, shpIndex);
if (LOGGER.isLoggable(Level.FINE)) {
LOGGER.fine("Tree optimized");
}
if (LOGGER.isLoggable(Level.FINE)) {
LOGGER.fine("Optimizing the tree (this might take some time)");
}
if (verbose) System.out.println("Optimizing the tree (this might take some time)");
optimizeTree(tree, tree.getRoot(), 0, boundsHelper);
if (LOGGER.isLoggable(Level.FINE)) {
LOGGER.fine("Tree optimized");
}

if (LOGGER.isLoggable(Level.FINE)) {
printStats(tree);
}
if (verbose) System.out.println("Storing the tree...");
FileSystemIndexStore store = new FileSystemIndexStore(file, fileByteOrder);
store.store(tree);
if (verbose) System.out.println("done");
}
return cnt;
}

private Node optimizeTree(
QuadTree tree, Node node, int level, ShapefileReader reader, IndexFile index)
/**
 * Builds the envelope described by the min/max X and Y values of the reader's shapefile header.
 *
 * @param reader the reader whose header provides the extent
 * @return a new envelope built from the header's minX, maxX, minY and maxY
 */
private Envelope getBounds(ShapefileReader reader) {
    final ShapefileHeader shpHeader = reader.getHeader();
    return new Envelope(
            shpHeader.minX(), shpHeader.maxX(), shpHeader.minY(), shpHeader.maxY());
}

/**
 * Translates the configured byte order name into the matching {@code IndexHeader} constant.
 *
 * <p>An unset byte order and {@code "NM"} both map to {@code IndexHeader.NEW_MSB_ORDER};
 * {@code "NL"} maps to {@code IndexHeader.NEW_LSB_ORDER}. Names are compared ignoring case.
 *
 * @return the byte order constant to store the index with
 * @throws StoreException if the configured byte order is neither {@code "NM"} nor {@code "NL"}
 */
private byte resolveStorageByteOrder() throws StoreException {
    final String requested = this.byteOrder;
    final boolean mostSignificantFirst = requested == null || "NM".equalsIgnoreCase(requested);
    if (!mostSignificantFirst && !"NL".equalsIgnoreCase(requested)) {
        throw new StoreException("Asked byte order '" + requested + "' must be 'NL' or 'NM'!");
    }
    return mostSignificantFirst ? IndexHeader.NEW_MSB_ORDER : IndexHeader.NEW_LSB_ORDER;
}

private Node optimizeTree(QuadTree tree, Node node, int level, BoundsReader reader)
throws StoreException, IOException {
// recurse, with a check to avoid too deep recursion due to odd data that has a
if (node.getNumShapeIds() > leafSize && node.getNumSubNodes() == 0 && level < max * 2) {
final boolean isLeafNode = node.getNumSubNodes() == 0;
final boolean isOverFlown = node.getNumShapeIds() > leafSize;
final int hardMaxDepth = maxDepth * 2;
final boolean canBeSplit = level < hardMaxDepth;
if (isLeafNode && isOverFlown && canBeSplit) {
// ok, we need to split this baby further
int[] shapeIds = node.getShapesId();
int numShapesId = node.getNumShapeIds();
Expand All @@ -257,13 +280,11 @@ private Node optimizeTree(
nodes *= 4;
}

Envelope env = new Envelope();
for (int i = 0; i < numShapesId; i++) {
final int shapeId = shapeIds[i];
int offset = index.getOffsetInBytes(shapeId);
reader.goTo(offset);
Record rec = reader.nextRecord();
Envelope env = new Envelope(rec.minX, rec.maxX, rec.minY, rec.maxY);
tree.insert(node, shapeId, env, extraLevels);
final int recNumber = shapeIds[i];
reader.read(recNumber, env);
tree.insert(node, recNumber, env, extraLevels);
}
}

Expand All @@ -272,7 +293,7 @@ private Node optimizeTree(

// recurse
for (int i = 0; i < node.getNumSubNodes(); i++) {
optimizeTree(tree, node.getSubNode(i), level + 1, reader, index);
optimizeTree(tree, node.getSubNode(i), level + 1, reader);
}

// prune empty subnodes
Expand Down Expand Up @@ -301,12 +322,8 @@ private Node optimizeTree(
Envelope bounds = new Envelope();
if (node.getNumShapeIds() > 0) {
int[] shapeIds = node.getShapesId();
for (final int shapeId : shapeIds) {
int offset = index.getOffsetInBytes(shapeId);
reader.goTo(offset);
Record rec = reader.nextRecord();
Envelope env = new Envelope(rec.minX, rec.maxX, rec.minY, rec.maxY);
bounds.expandToInclude(env);
for (final int recNumber : shapeIds) {
reader.expandEnvelope(recNumber, bounds);
}
}
if (node.getNumSubNodes() > 0) {
Expand Down Expand Up @@ -369,7 +386,7 @@ void gatherStats(Node node, Map<Integer, Integer> stats) throws StoreException {

/** For quad tree this is the max depth. I don't know what it is for RTree */
public void setMax(int i) {
max = i;
maxDepth = i;
}

/** @param shpFiles */
Expand All @@ -392,6 +409,9 @@ public int getLeafSize() {
}

/**
 * Sets the leaf size used by this indexer.
 *
 * @param leafSize the new leaf size, must be at least 1
 * @throws IllegalArgumentException if {@code leafSize} is lower than 1
 */
public void setLeafSize(int leafSize) {
    final boolean positive = leafSize >= 1;
    if (!positive) {
        throw new IllegalArgumentException("Maximum node leaf size must be a positive integer");
    }
    this.leafSize = leafSize;
}
}
Loading

0 comments on commit fa66664

Please sign in to comment.