Skip to content

Commit

Permalink
Add HydroRIVERS index feature (#1776)
Browse files Browse the repository at this point in the history
* finished GeoGenerator and have a start on GeoJsonCollection

* removed GeoJsonCollection

* removed files no longer used

* reverted old GeoIndexerTestBase

* added test for Json collection on Rivers

* updated to pass tests now

* removed debug code in GeoIndexerTestBase

* pedantic changes

* Update comment syntax

* fixed comment grammar
  • Loading branch information
d1shs0ap committed Feb 24, 2022
1 parent 9fafc60 commit 7472d86
Show file tree
Hide file tree
Showing 4 changed files with 340 additions and 0 deletions.
85 changes: 85 additions & 0 deletions src/main/java/io/anserini/index/generator/GeoGenerator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package io.anserini.index.generator;

import io.anserini.collection.JsonCollection;
import io.anserini.index.IndexArgs;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.document.*;
import org.apache.lucene.geo.Line;
import org.apache.lucene.geo.Polygon;
import org.apache.lucene.geo.SimpleWKTShapeParser;

import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Builds a Lucene {@link Document} from a {@link JsonCollection.Document} that carries a WKT
 * string in its {@code geometry} field.
 *
 * <p>The geometry is parsed with {@link SimpleWKTShapeParser} and indexed through
 * {@link LatLonShape}. Every other field is indexed as a {@link LongPoint} if its value parses as
 * a long, otherwise as a {@link DoublePoint} if it parses as a double, otherwise as a stored
 * {@link StringField}. Numeric fields are additionally stored under the same name.
 */
public class GeoGenerator implements LuceneDocumentGenerator<JsonCollection.Document> {
  private static final Logger LOG = LogManager.getLogger(GeoGenerator.class);

  /** Name of the source field holding the WKT string, and of the indexed shape fields. */
  private static final String GEOMETRY_FIELD = "geometry";

  protected IndexArgs args;

  public GeoGenerator(IndexArgs args) {
    this.args = args;
  }

  /**
   * Converts one source document into a Lucene document.
   *
   * @param geoDoc the source document whose fields are indexed
   * @return the Lucene document; a geometry that cannot be parsed or has an unsupported type is
   *         logged and left out rather than failing the whole document
   */
  @Override
  public Document createDocument(JsonCollection.Document geoDoc) {
    Document doc = new Document();

    // Store the raw JSON
    if (args.storeRaw) {
      doc.add(new StoredField(IndexArgs.RAW, geoDoc.raw()));
    }

    geoDoc.fields().forEach((k, v) -> {
      if (GEOMETRY_FIELD.equals(k)) {
        addGeometryFields(doc, v);
      } else {
        addAttributeFields(doc, k, v);
      }
    });

    return doc;
  }

  /** Parses {@code wkt} and adds the resulting shape fields to {@code doc}; logs on failure. */
  private static void addGeometryFields(Document doc, String wkt) {
    try {
      Object shape = SimpleWKTShapeParser.parse(wkt);

      // Collect first and add afterwards, so a failure part-way through a multi-shape never
      // leaves a partially indexed geometry in the document.
      List<Field> shapeFields = new ArrayList<>();
      if (shape instanceof Line) {
        Collections.addAll(shapeFields,
            LatLonShape.createIndexableFields(GEOMETRY_FIELD, (Line) shape));
      } else if (shape instanceof Polygon) {
        Collections.addAll(shapeFields,
            LatLonShape.createIndexableFields(GEOMETRY_FIELD, (Polygon) shape));
      } else if (shape instanceof Line[]) {
        // Accumulate the fields of every member line, not just the last one.
        for (Line line : (Line[]) shape) {
          Collections.addAll(shapeFields,
              LatLonShape.createIndexableFields(GEOMETRY_FIELD, line));
        }
      } else if (shape instanceof Polygon[]) {
        // Accumulate the fields of every member polygon, not just the last one.
        for (Polygon polygon : (Polygon[]) shape) {
          Collections.addAll(shapeFields,
              LatLonShape.createIndexableFields(GEOMETRY_FIELD, polygon));
        }
      } else {
        throw new IllegalArgumentException("unknown shape");
      }

      for (Field f : shapeFields) {
        doc.add(f);
      }
    } catch (ParseException | IOException e) {
      LOG.error("Error parsing unknown shape using SimpleWKTShapeParser: " + wkt, e);
    } catch (IllegalArgumentException e) {
      LOG.error("Error casting shape to any of the types Line, Line[], Polygon, Polygon[]: " + wkt, e);
    }
  }

  /**
   * Indexes a non-geometry field as long, then double, then string, using the first
   * representation that the value parses as.
   */
  private static void addAttributeFields(Document doc, String name, String value) {
    try {
      long asLong = Long.parseLong(value);
      doc.add(new LongPoint(name, asLong));
      doc.add(new StoredField(name, value));
    } catch (NumberFormatException notLong) {
      try {
        double asDouble = Double.parseDouble(value);
        doc.add(new DoublePoint(name, asDouble));
        doc.add(new StoredField(name, value));
      } catch (NumberFormatException notDouble) {
        // Neither integral nor floating point: keep the value as an exact-match string.
        doc.add(new StringField(name, value, Field.Store.YES));
      }
    }
  }
}
104 changes: 104 additions & 0 deletions src/test/java/io/anserini/collection/JsonCollectionGeoRiverTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
package io.anserini.collection;

import org.junit.Before;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Map;

/**
 * Runs the shared {@link JsonCollectionTest} checks against the three-record HydroRIVERS sample
 * in {@code collection_geo/rivers.json}.
 */
public class JsonCollectionGeoRiverTest extends JsonCollectionTest {
  /** River attributes compared by {@link #checkDocument}, in comparison order. */
  private static final String[] RIVER_FIELDS = {
      "HYRIV_ID", "NEXT_DOWN", "MAIN_RIV", "LENGTH_KM", "DIST_DN_KM", "DIST_UP_KM", "CATCH_SKM",
      "UPLAND_SKM", "ENDORHEIC", "DIS_AV_CMS", "ORD_STRA", "ORD_CLAS", "ORD_FLOW", "HYBAS_L12",
      "geometry"
  };

  @Before
  public void setUp() throws Exception {
    super.setUp();

    collectionPath = Paths.get("src/test/resources/sample_docs/json/collection_geo");
    collection = new JsonCollection(collectionPath);

    Path riversSegment = Paths.get("src/test/resources/sample_docs/json/collection_geo/rivers.json");
    segmentPaths.add(riversSegment);
    segmentDocCounts.put(riversSegment, 3);

    totalSegments = 1;
    totalDocs = 3;

    expected.put("90000001", riverRecord("90000001", "1.16", "8.4", "15.06", "15.0", "0.089",
        "8", "9120016560",
        "LINESTRING (-32.235416666667334 83.57916666666631, -32.235416666667334 83.589583333333)"));

    expected.put("90000002", riverRecord("90000002", "1.16", "69.1", "10.08", "10.1", "0.0",
        "10", "9120016500",
        "LINESTRING (-36.03541666666723 83.54583333333295, -36.03541666666723 83.55624999999961)"));

    expected.put("90000003", riverRecord("90000003", "3.02", "35.3", "12.24", "12.2", "0.03",
        "8", "9120016580",
        "LINESTRING (-29.737500000000722 83.54583333333295, -29.731250000000642 83.55208333333294, -29.731250000000642 83.57291666666629)"));
  }

  /**
   * Builds the expected field map for one sample river. The attributes that are identical in all
   * three sample records (NEXT_DOWN, DIST_DN_KM, ENDORHEIC, ORD_STRA, ORD_CLAS) are filled in
   * here; {@code id} is used for HYRIV_ID, MAIN_RIV and the document id.
   */
  private static Map<String, String> riverRecord(String id, String lengthKm, String distUpKm,
      String catchSkm, String uplandSkm, String disAvCms, String ordFlow, String hybasL12,
      String geometry) {
    return Map.ofEntries(
        Map.entry("HYRIV_ID", id),
        Map.entry("NEXT_DOWN", "0"),
        Map.entry("MAIN_RIV", id),
        Map.entry("LENGTH_KM", lengthKm),
        Map.entry("DIST_DN_KM", "0.0"),
        Map.entry("DIST_UP_KM", distUpKm),
        Map.entry("CATCH_SKM", catchSkm),
        Map.entry("UPLAND_SKM", uplandSkm),
        Map.entry("ENDORHEIC", "0"),
        Map.entry("DIS_AV_CMS", disAvCms),
        Map.entry("ORD_STRA", "1"),
        Map.entry("ORD_CLAS", "1"),
        Map.entry("ORD_FLOW", ordFlow),
        Map.entry("HYBAS_L12", hybasL12),
        Map.entry("geometry", geometry),
        Map.entry("id", id)
    );
  }

  @Override
  void checkDocument(SourceDocument doc, Map<String, String> expected) {
    // Note that we need an id in addition to HYRIV_ID to distinguish between different docs
    assertTrue(doc.indexable());

    JsonCollection.Document riverDoc = (JsonCollection.Document) doc;
    for (String fieldName : RIVER_FIELDS) {
      assertEquals(expected.get(fieldName), riverDoc.fields().get(fieldName));
    }

    assertEquals(expected.get("id"), doc.id());
  }
}
97 changes: 97 additions & 0 deletions src/test/java/io/anserini/index/generator/GeoGeneratorTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
package io.anserini.index.generator;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.databind.node.TextNode;
import io.anserini.collection.JsonCollection;
import io.anserini.index.IndexArgs;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.ShapeField;
import org.apache.lucene.index.IndexableField;
import org.junit.Before;
import org.junit.Test;
import static org.junit.Assert.assertEquals;

public class GeoGeneratorTest {
private JsonCollection.Document geoDoc;
private Document doc;

@Before
public void riverSetUp() {
ObjectMapper mapper = new ObjectMapper();
ObjectNode jsonObj = mapper.createObjectNode();
jsonObj.set("HYRIV_ID", TextNode.valueOf("90000003"));
jsonObj.set("NEXT_DOWN", TextNode.valueOf("0"));
jsonObj.set("MAIN_RIV", TextNode.valueOf("90000003"));
jsonObj.set("LENGTH_KM", TextNode.valueOf("3.02"));
jsonObj.set("DIST_DN_KM", TextNode.valueOf("0.0"));
jsonObj.set("DIST_UP_KM", TextNode.valueOf("35.3"));
jsonObj.set("CATCH_SKM", TextNode.valueOf("12.24"));
jsonObj.set("UPLAND_SKM", TextNode.valueOf("12.2"));
jsonObj.set("ENDORHEIC", TextNode.valueOf("0"));
jsonObj.set("DIS_AV_CMS", TextNode.valueOf("0.03"));
jsonObj.set("ORD_STRA", TextNode.valueOf("1"));
jsonObj.set("ORD_CLAS", TextNode.valueOf("1"));
jsonObj.set("ORD_FLOW", TextNode.valueOf("8"));
jsonObj.set("HYBAS_L12", TextNode.valueOf("9120016580"));
jsonObj.set("geometry", TextNode.valueOf("LINESTRING (-29.737500000000722 83.54583333333295, -29.731250000000642 83.55208333333294, -29.731250000000642 83.57291666666629)"));
jsonObj.set("id", TextNode.valueOf("90000003"));

geoDoc = new JsonCollection.Document(jsonObj);

GeoGenerator generator = new GeoGenerator(new IndexArgs());
doc = generator.createDocument(geoDoc);
}

@Test
public void testRiverDocumentFields() {
// Check if the field types were inferred correctly, id field is omitted since it's a repeat of HYRIV_ID
assertEquals(LongPoint.class, doc.getField("HYRIV_ID").getClass());
assertEquals(90000003L, doc.getField("HYRIV_ID").numericValue());

assertEquals(LongPoint.class, doc.getField("NEXT_DOWN").getClass());
assertEquals(0L, doc.getField("NEXT_DOWN").numericValue());

assertEquals(LongPoint.class, doc.getField("MAIN_RIV").getClass());
assertEquals(90000003L, doc.getField("MAIN_RIV").numericValue());

assertEquals(DoublePoint.class, doc.getField("LENGTH_KM").getClass());
assertEquals(3.02, doc.getField("LENGTH_KM").numericValue());

assertEquals(DoublePoint.class, doc.getField("DIST_DN_KM").getClass());
assertEquals(0.0, doc.getField("DIST_DN_KM").numericValue());

assertEquals(DoublePoint.class, doc.getField("DIST_UP_KM").getClass());
assertEquals(35.3, doc.getField("DIST_UP_KM").numericValue());

assertEquals(DoublePoint.class, doc.getField("CATCH_SKM").getClass());
assertEquals(12.24, doc.getField("CATCH_SKM").numericValue());

assertEquals(DoublePoint.class, doc.getField("UPLAND_SKM").getClass());
assertEquals(12.2, doc.getField("UPLAND_SKM").numericValue());

assertEquals(LongPoint.class, doc.getField("ENDORHEIC").getClass());
assertEquals(0L, doc.getField("ENDORHEIC").numericValue());

assertEquals(DoublePoint.class, doc.getField("DIS_AV_CMS").getClass());
assertEquals(0.03, doc.getField("DIS_AV_CMS").numericValue());

assertEquals(LongPoint.class, doc.getField("ORD_STRA").getClass());
assertEquals(1L, doc.getField("ORD_STRA").numericValue());

assertEquals(LongPoint.class, doc.getField("ORD_CLAS").getClass());
assertEquals(1L, doc.getField("ORD_CLAS").numericValue());

assertEquals(LongPoint.class, doc.getField("ORD_FLOW").getClass());
assertEquals(8L, doc.getField("ORD_FLOW").numericValue());

assertEquals(LongPoint.class, doc.getField("HYBAS_L12").getClass());
assertEquals(9120016580L, doc.getField("HYBAS_L12").numericValue());

for (IndexableField f: doc.getFields("geometry")) {
assertEquals(ShapeField.Triangle.class, f.getClass());
}
}
}
54 changes: 54 additions & 0 deletions src/test/resources/sample_docs/json/collection_geo/rivers.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"HYRIV_ID": 90000001,
"NEXT_DOWN": 0,
"MAIN_RIV": 90000001,
"LENGTH_KM": 1.16,
"DIST_DN_KM": 0.0,
"DIST_UP_KM": 8.4,
"CATCH_SKM": 15.06,
"UPLAND_SKM": 15.0,
"ENDORHEIC": 0,
"DIS_AV_CMS": 0.089,
"ORD_STRA": 1,
"ORD_CLAS": 1,
"ORD_FLOW": 8,
"HYBAS_L12": 9120016560,
"geometry": "LINESTRING (-32.235416666667334 83.57916666666631, -32.235416666667334 83.589583333333)",
"id": 90000001
}
{
"HYRIV_ID": 90000002,
"NEXT_DOWN": 0,
"MAIN_RIV": 90000002,
"LENGTH_KM": 1.16,
"DIST_DN_KM": 0.0,
"DIST_UP_KM": 69.1,
"CATCH_SKM": 10.08,
"UPLAND_SKM": 10.1,
"ENDORHEIC": 0,
"DIS_AV_CMS": 0.0,
"ORD_STRA": 1,
"ORD_CLAS": 1,
"ORD_FLOW": 10,
"HYBAS_L12": 9120016500,
"geometry": "LINESTRING (-36.03541666666723 83.54583333333295, -36.03541666666723 83.55624999999961)",
"id": 90000002
}
{
"HYRIV_ID": 90000003,
"NEXT_DOWN": 0,
"MAIN_RIV": 90000003,
"LENGTH_KM": 3.02,
"DIST_DN_KM": 0.0,
"DIST_UP_KM": 35.3,
"CATCH_SKM": 12.24,
"UPLAND_SKM": 12.2,
"ENDORHEIC": 0,
"DIS_AV_CMS": 0.03,
"ORD_STRA": 1,
"ORD_CLAS": 1,
"ORD_FLOW": 8,
"HYBAS_L12": 9120016580,
"geometry": "LINESTRING (-29.737500000000722 83.54583333333295, -29.731250000000642 83.55208333333294, -29.731250000000642 83.57291666666629)",
"id": 90000003
}

0 comments on commit 7472d86

Please sign in to comment.