diff --git a/src/main/java/io/anserini/index/generator/GeoGenerator.java b/src/main/java/io/anserini/index/generator/GeoGenerator.java
new file mode 100644
index 0000000000..e5c4e41226
--- /dev/null
+++ b/src/main/java/io/anserini/index/generator/GeoGenerator.java
@@ -0,0 +1,118 @@
+package io.anserini.index.generator;
+
+import io.anserini.collection.JsonCollection;
+import io.anserini.index.IndexArgs;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.lucene.document.*;
+import org.apache.lucene.geo.Line;
+import org.apache.lucene.geo.Polygon;
+import org.apache.lucene.geo.SimpleWKTShapeParser;
+
+import java.io.IOException;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Builds Lucene documents from JSON records that carry a WKT {@code geometry} field.
+ *
+ * <p>The geometry is indexed with {@link LatLonShape}; every other field is indexed as a long, a
+ * double, or a plain string, trying the types in that order.
+ */
+public class GeoGenerator implements LuceneDocumentGenerator {
+  private static final Logger LOG = LogManager.getLogger(GeoGenerator.class);
+
+  protected IndexArgs args;
+
+  public GeoGenerator(IndexArgs args) {
+    this.args = args;
+  }
+
+  /**
+   * Converts one JSON record into a Lucene document.
+   *
+   * @param geoDoc the record to index
+   * @return a document with the record's indexed fields, plus the raw JSON when
+   *     {@code args.storeRaw} is set
+   */
+  @Override
+  public Document createDocument(JsonCollection.Document geoDoc) {
+    Document doc = new Document();
+
+    // Store the raw JSON
+    if (args.storeRaw) {
+      doc.add(new StoredField(IndexArgs.RAW, geoDoc.raw()));
+    }
+
+    geoDoc.fields().forEach((k, v) -> {
+      if ("geometry".equals(k)) {
+        indexGeometry(doc, v);
+      } else {
+        indexAttribute(doc, k, v);
+      }
+    });
+
+    return doc;
+  }
+
+  /**
+   * Parses {@code wkt} using SimpleWKTShapeParser and adds the resulting shape fields to
+   * {@code doc}. A geometry that cannot be parsed or indexed is logged and skipped.
+   */
+  private static void indexGeometry(Document doc, String wkt) {
+    try {
+      Object shape = SimpleWKTShapeParser.parse(wkt);
+
+      // Collected here and added at the end, so a failure part-way through adds nothing
+      List<Field> fields = new ArrayList<>();
+      if (shape instanceof Line) {
+        Collections.addAll(fields, LatLonShape.createIndexableFields("geometry", (Line) shape));
+      } else if (shape instanceof Polygon) {
+        Collections.addAll(fields, LatLonShape.createIndexableFields("geometry", (Polygon) shape));
+      } else if (shape instanceof Line[]) {
+        // Accumulate across all lines; plain assignment would keep only the last line's fields
+        for (Line line : (Line[]) shape) {
+          Collections.addAll(fields, LatLonShape.createIndexableFields("geometry", line));
+        }
+      } else if (shape instanceof Polygon[]) {
+        for (Polygon polygon : (Polygon[]) shape) {
+          Collections.addAll(fields, LatLonShape.createIndexableFields("geometry", polygon));
+        }
+      } else {
+        LOG.error("Error casting shape to any of the types Line, Line[], Polygon, Polygon[]: " + wkt);
+        return;
+      }
+
+      for (Field f : fields) {
+        doc.add(f);
+      }
+    } catch (ParseException | IOException e) {
+      LOG.error("Error parsing unknown shape using SimpleWKTShapeParser: " + wkt, e);
+    } catch (IllegalArgumentException e) {
+      // The parser or LatLonShape rejected the geometry as an illegal argument
+      LOG.error("Error indexing invalid geometry: " + wkt, e);
+    }
+  }
+
+  /**
+   * Indexes a non-geometry field as a long if it parses as one, otherwise as a double, otherwise
+   * as a plain string. The original string value is stored in every case.
+   */
+  private static void indexAttribute(Document doc, String name, String value) {
+    try {
+      long vLong = Long.parseLong(value);
+      doc.add(new LongPoint(name, vLong));
+      doc.add(new StoredField(name, value));
+    } catch (NumberFormatException e1) {
+      try {
+        double vDouble = Double.parseDouble(value);
+        doc.add(new DoublePoint(name, vDouble));
+        doc.add(new StoredField(name, value));
+      } catch (NumberFormatException e2) {
+        doc.add(new StringField(name, value, Field.Store.YES));
+      }
+    }
+  }
+}
diff --git a/src/test/java/io/anserini/collection/JsonCollectionGeoRiverTest.java b/src/test/java/io/anserini/collection/JsonCollectionGeoRiverTest.java
new file mode 100644
index 0000000000..140b52b575
--- /dev/null
+++ b/src/test/java/io/anserini/collection/JsonCollectionGeoRiverTest.java
@@ -0,0 +1,104 @@
+package io.anserini.collection;
+
+import org.junit.Before;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Map;
+
+public class JsonCollectionGeoRiverTest extends JsonCollectionTest {
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+
+    collectionPath = Paths.get("src/test/resources/sample_docs/json/collection_geo");
+    collection = new JsonCollection(collectionPath);
+
+    Path segment = Paths.get("src/test/resources/sample_docs/json/collection_geo/rivers.json");
+
+    segmentPaths.add(segment);
+    segmentDocCounts.put(segment, 3);
+
+    totalSegments = 1;
+    totalDocs = 3;
+
+    expected.put("90000001", Map.ofEntries(
+      Map.entry("HYRIV_ID", "90000001"),
+      Map.entry("NEXT_DOWN", "0"),
+      Map.entry("MAIN_RIV", "90000001"),
+      Map.entry("LENGTH_KM", "1.16"),
+      Map.entry("DIST_DN_KM", "0.0"),
+      Map.entry("DIST_UP_KM", "8.4"),
+      Map.entry("CATCH_SKM", "15.06"),
+      Map.entry("UPLAND_SKM", "15.0"),
+      Map.entry("ENDORHEIC", "0"),
+      Map.entry("DIS_AV_CMS", "0.089"),
+      Map.entry("ORD_STRA", "1"),
+      Map.entry("ORD_CLAS", "1"),
+      Map.entry("ORD_FLOW", "8"),
+      Map.entry("HYBAS_L12", "9120016560"),
+      Map.entry("geometry", "LINESTRING (-32.235416666667334 83.57916666666631, -32.235416666667334 83.589583333333)"),
+      Map.entry("id", "90000001")
+    ));
+
+    expected.put("90000002", Map.ofEntries(
+      Map.entry("HYRIV_ID", "90000002"),
+      Map.entry("NEXT_DOWN", "0"),
+      Map.entry("MAIN_RIV", "90000002"),
+      Map.entry("LENGTH_KM", "1.16"),
+      Map.entry("DIST_DN_KM", "0.0"),
+      Map.entry("DIST_UP_KM", "69.1"),
+      Map.entry("CATCH_SKM", "10.08"),
+      Map.entry("UPLAND_SKM", "10.1"),
+      Map.entry("ENDORHEIC", "0"),
+      Map.entry("DIS_AV_CMS", "0.0"),
+      Map.entry("ORD_STRA", "1"),
+      Map.entry("ORD_CLAS", "1"),
+      Map.entry("ORD_FLOW", "10"),
+      Map.entry("HYBAS_L12", "9120016500"),
+      Map.entry("geometry", "LINESTRING (-36.03541666666723 83.54583333333295, -36.03541666666723 83.55624999999961)"),
+      Map.entry("id", "90000002")
+    ));
+
+    expected.put("90000003", Map.ofEntries(
+      Map.entry("HYRIV_ID", "90000003"),
+      Map.entry("NEXT_DOWN", "0"),
+      Map.entry("MAIN_RIV", "90000003"),
+      Map.entry("LENGTH_KM", "3.02"),
+      Map.entry("DIST_DN_KM", "0.0"),
+      Map.entry("DIST_UP_KM", "35.3"),
+      Map.entry("CATCH_SKM", "12.24"),
+      Map.entry("UPLAND_SKM", "12.2"),
+      Map.entry("ENDORHEIC", "0"),
+      Map.entry("DIS_AV_CMS", "0.03"),
+      Map.entry("ORD_STRA", "1"),
+      Map.entry("ORD_CLAS", "1"),
+      Map.entry("ORD_FLOW", "8"),
+      Map.entry("HYBAS_L12", "9120016580"),
+      Map.entry("geometry", "LINESTRING (-29.737500000000722 83.54583333333295, -29.731250000000642 83.55208333333294, -29.731250000000642 83.57291666666629)"),
+      Map.entry("id", "90000003")
+    ));
+  }
+
+  @Override
+  void checkDocument(SourceDocument doc, Map expected) {
+    // Note that we need an id in addition to HYRIV_ID to distinguish between different docs
+    assertTrue(doc.indexable());
+    assertEquals(expected.get("HYRIV_ID"), ((JsonCollection.Document) doc).fields().get("HYRIV_ID"));
+    assertEquals(expected.get("NEXT_DOWN"), ((JsonCollection.Document) doc).fields().get("NEXT_DOWN"));
+    assertEquals(expected.get("MAIN_RIV"), ((JsonCollection.Document) doc).fields().get("MAIN_RIV"));
+    assertEquals(expected.get("LENGTH_KM"), ((JsonCollection.Document) doc).fields().get("LENGTH_KM"));
+    assertEquals(expected.get("DIST_DN_KM"), ((JsonCollection.Document) doc).fields().get("DIST_DN_KM"));
+    assertEquals(expected.get("DIST_UP_KM"), ((JsonCollection.Document) doc).fields().get("DIST_UP_KM"));
+    assertEquals(expected.get("CATCH_SKM"), ((JsonCollection.Document) doc).fields().get("CATCH_SKM"));
+    assertEquals(expected.get("UPLAND_SKM"), ((JsonCollection.Document) doc).fields().get("UPLAND_SKM"));
+    assertEquals(expected.get("ENDORHEIC"), ((JsonCollection.Document) doc).fields().get("ENDORHEIC"));
+    assertEquals(expected.get("DIS_AV_CMS"), ((JsonCollection.Document) doc).fields().get("DIS_AV_CMS"));
+    assertEquals(expected.get("ORD_STRA"), ((JsonCollection.Document) doc).fields().get("ORD_STRA"));
+    assertEquals(expected.get("ORD_CLAS"), ((JsonCollection.Document) doc).fields().get("ORD_CLAS"));
+    assertEquals(expected.get("ORD_FLOW"), ((JsonCollection.Document) doc).fields().get("ORD_FLOW"));
+    assertEquals(expected.get("HYBAS_L12"), ((JsonCollection.Document) doc).fields().get("HYBAS_L12"));
+    assertEquals(expected.get("geometry"), ((JsonCollection.Document) doc).fields().get("geometry"));
+    assertEquals(expected.get("id"), doc.id());
+  }
+}
diff --git a/src/test/java/io/anserini/index/generator/GeoGeneratorTest.java b/src/test/java/io/anserini/index/generator/GeoGeneratorTest.java
new file mode 100644
index 0000000000..0979943ed9
--- /dev/null
+++ b/src/test/java/io/anserini/index/generator/GeoGeneratorTest.java
@@ -0,0 +1,110 @@
+package io.anserini.index.generator;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import com.fasterxml.jackson.databind.node.TextNode;
+import io.anserini.collection.JsonCollection;
+import io.anserini.index.IndexArgs;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.DoublePoint;
+import org.apache.lucene.document.LongPoint;
+import org.apache.lucene.document.ShapeField;
+import org.apache.lucene.index.IndexableField;
+import org.junit.Before;
+import org.junit.Test;
+import static org.junit.Assert.assertEquals;
+
+public class GeoGeneratorTest {
+  private JsonCollection.Document geoDoc;
+  private Document doc;
+
+  @Before
+  public void riverSetUp() {
+    ObjectMapper mapper = new ObjectMapper();
+    ObjectNode jsonObj = mapper.createObjectNode();
+    jsonObj.set("HYRIV_ID", TextNode.valueOf("90000003"));
+    jsonObj.set("NEXT_DOWN", TextNode.valueOf("0"));
+    jsonObj.set("MAIN_RIV", TextNode.valueOf("90000003"));
+    jsonObj.set("LENGTH_KM", TextNode.valueOf("3.02"));
+    jsonObj.set("DIST_DN_KM", TextNode.valueOf("0.0"));
+    jsonObj.set("DIST_UP_KM", TextNode.valueOf("35.3"));
+    jsonObj.set("CATCH_SKM", TextNode.valueOf("12.24"));
+    jsonObj.set("UPLAND_SKM", TextNode.valueOf("12.2"));
+    jsonObj.set("ENDORHEIC", TextNode.valueOf("0"));
+    jsonObj.set("DIS_AV_CMS", TextNode.valueOf("0.03"));
+    jsonObj.set("ORD_STRA", TextNode.valueOf("1"));
+    jsonObj.set("ORD_CLAS", TextNode.valueOf("1"));
+    jsonObj.set("ORD_FLOW", TextNode.valueOf("8"));
+    jsonObj.set("HYBAS_L12", TextNode.valueOf("9120016580"));
+    jsonObj.set("geometry", TextNode.valueOf("LINESTRING (-29.737500000000722 83.54583333333295, -29.731250000000642 83.55208333333294, -29.731250000000642 83.57291666666629)"));
+    jsonObj.set("id", TextNode.valueOf("90000003"));
+
+    geoDoc = new JsonCollection.Document(jsonObj);
+
+    GeoGenerator generator = new GeoGenerator(new IndexArgs());
+    doc = generator.createDocument(geoDoc);
+  }
+
+  @Test
+  public void testRiverDocumentFields() {
+    // Check if the field types were inferred correctly, id field is omitted since it's a repeat of HYRIV_ID
+    assertEquals(LongPoint.class, doc.getField("HYRIV_ID").getClass());
+    assertEquals(90000003L, doc.getField("HYRIV_ID").numericValue());
+
+    assertEquals(LongPoint.class, doc.getField("NEXT_DOWN").getClass());
+    assertEquals(0L, doc.getField("NEXT_DOWN").numericValue());
+
+    assertEquals(LongPoint.class, doc.getField("MAIN_RIV").getClass());
+    assertEquals(90000003L, doc.getField("MAIN_RIV").numericValue());
+
+    assertEquals(DoublePoint.class, doc.getField("LENGTH_KM").getClass());
+    assertEquals(3.02, doc.getField("LENGTH_KM").numericValue());
+
+    assertEquals(DoublePoint.class, doc.getField("DIST_DN_KM").getClass());
+    assertEquals(0.0, doc.getField("DIST_DN_KM").numericValue());
+
+    assertEquals(DoublePoint.class, doc.getField("DIST_UP_KM").getClass());
+    assertEquals(35.3, doc.getField("DIST_UP_KM").numericValue());
+
+    assertEquals(DoublePoint.class, doc.getField("CATCH_SKM").getClass());
+    assertEquals(12.24, doc.getField("CATCH_SKM").numericValue());
+
+    assertEquals(DoublePoint.class, doc.getField("UPLAND_SKM").getClass());
+    assertEquals(12.2, doc.getField("UPLAND_SKM").numericValue());
+
+    assertEquals(LongPoint.class, doc.getField("ENDORHEIC").getClass());
+    assertEquals(0L, doc.getField("ENDORHEIC").numericValue());
+
+    assertEquals(DoublePoint.class, doc.getField("DIS_AV_CMS").getClass());
+    assertEquals(0.03, doc.getField("DIS_AV_CMS").numericValue());
+
+    assertEquals(LongPoint.class, doc.getField("ORD_STRA").getClass());
+    assertEquals(1L, doc.getField("ORD_STRA").numericValue());
+
+    assertEquals(LongPoint.class, doc.getField("ORD_CLAS").getClass());
+    assertEquals(1L, doc.getField("ORD_CLAS").numericValue());
+
+    assertEquals(LongPoint.class, doc.getField("ORD_FLOW").getClass());
+    assertEquals(8L, doc.getField("ORD_FLOW").numericValue());
+
+    assertEquals(LongPoint.class, doc.getField("HYBAS_L12").getClass());
+    assertEquals(9120016580L, doc.getField("HYBAS_L12").numericValue());
+
+    for (IndexableField f: doc.getFields("geometry")) {
+      assertEquals(ShapeField.Triangle.class, f.getClass());
+    }
+  }
+
+  @Test
+  public void testMultiLineStringIndexesEveryLine() {
+    ObjectNode jsonObj = new ObjectMapper().createObjectNode();
+    jsonObj.set("geometry", TextNode.valueOf("MULTILINESTRING ((10.0 10.0, 20.0 20.0), (30.0 30.0, 40.0 40.0))"));
+    jsonObj.set("id", TextNode.valueOf("1"));
+
+    GeoGenerator generator = new GeoGenerator(new IndexArgs());
+    Document multiLineDoc = generator.createDocument(new JsonCollection.Document(jsonObj));
+
+    // Each two-point line is indexed as a single segment, so both lines must contribute a field
+    assertEquals(2, multiLineDoc.getFields("geometry").length);
+  }
+}
diff --git a/src/test/resources/sample_docs/json/collection_geo/rivers.json b/src/test/resources/sample_docs/json/collection_geo/rivers.json
new file mode 100644
index 0000000000..6ba8675276
--- /dev/null
+++ b/src/test/resources/sample_docs/json/collection_geo/rivers.json
@@ -0,0 +1,54 @@
+{
+  "HYRIV_ID": 90000001,
+  "NEXT_DOWN": 0,
+  "MAIN_RIV": 90000001,
+  "LENGTH_KM": 1.16,
+  "DIST_DN_KM": 0.0,
+  "DIST_UP_KM": 8.4,
+  "CATCH_SKM": 15.06,
+  "UPLAND_SKM": 15.0,
+  "ENDORHEIC": 0,
+  "DIS_AV_CMS": 0.089,
+  "ORD_STRA": 1,
+  "ORD_CLAS": 1,
+  "ORD_FLOW": 8,
+  "HYBAS_L12": 9120016560,
+  "geometry": "LINESTRING (-32.235416666667334 83.57916666666631, -32.235416666667334 83.589583333333)",
+  "id": 90000001
+}
+{
+  "HYRIV_ID": 90000002,
+  "NEXT_DOWN": 0,
+  "MAIN_RIV": 90000002,
+  "LENGTH_KM": 1.16,
+  "DIST_DN_KM": 0.0,
+  "DIST_UP_KM": 69.1,
+  "CATCH_SKM": 10.08,
+  "UPLAND_SKM": 10.1,
+  "ENDORHEIC": 0,
+  "DIS_AV_CMS": 0.0,
+  "ORD_STRA": 1,
+  "ORD_CLAS": 1,
+  "ORD_FLOW": 10,
+  "HYBAS_L12": 9120016500,
+  "geometry": "LINESTRING (-36.03541666666723 83.54583333333295, -36.03541666666723 83.55624999999961)",
+  "id": 90000002
+}
+{
+  "HYRIV_ID": 90000003,
+  "NEXT_DOWN": 0,
+  "MAIN_RIV": 90000003,
+  "LENGTH_KM": 3.02,
+  "DIST_DN_KM": 0.0,
+  "DIST_UP_KM": 35.3,
+  "CATCH_SKM": 12.24,
+  "UPLAND_SKM": 12.2,
+  "ENDORHEIC": 0,
+  "DIS_AV_CMS": 0.03,
+  "ORD_STRA": 1,
+  "ORD_CLAS": 1,
+  "ORD_FLOW": 8,
+  "HYBAS_L12": 9120016580,
+  "geometry": "LINESTRING (-29.737500000000722 83.54583333333295, -29.731250000000642 83.55208333333294, -29.731250000000642 83.57291666666629)",
+  "id": 90000003
+}