Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release 4.1 #145

Merged
merged 13 commits into from
Mar 15, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Import identifiers from DwCA
Cleaner resolution of vernacular names, taking synonyms into account
  • Loading branch information
charvolant committed Feb 21, 2022
commit 439dfadf7682957696be38a6494461f6a0e1770a
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ public ALANameAnalyser() {
* @return The analyzed name
*/
@Override
public NameKey analyse(@Nullable NomenclaturalClassifier code, String scientificName, @Nullable String scientificNameAuthorship, RankType rankType, TaxonomicType taxonomicStatus, Set<TaxonFlag> flags, boolean loose) {
public AnalysisResult analyse(@Nullable NomenclaturalClassifier code, String scientificName, @Nullable String scientificNameAuthorship, RankType rankType, TaxonomicType taxonomicStatus, Set<TaxonFlag> flags, boolean loose) {
NameType nameType = NameType.INFORMAL;
ParsedName name = null;

Expand All @@ -219,20 +219,20 @@ public NameKey analyse(@Nullable NomenclaturalClassifier code, String scientific
scientificName = (left + " " + right).trim();
}
}
try {
name = this.nameParser.parse(scientificName, (rankType == null || rankType == RankType.UNRANKED) ? null : rankType.getCbRank());
if (name != null) {
nameType = name.getType();
if (rankType == null && name.getRank() != null)
rankType = RankType.getForCBRank(name.getRank());
}
} catch (UnparsableException ex) {
// Oh well, worth a try
}
if (UNSURE_MARKER.matcher(scientificName).find()) {
// Leave this well alone but indicate that it is doubtful
nameType = NameType.DOUBTFUL;
} else {
try {
name = this.nameParser.parse(scientificName, (rankType == null || rankType == RankType.UNRANKED) ? null : rankType.getCbRank());
if (name != null) {
nameType = name.getType();
if (rankType == null && name.getRank() != null)
rankType = RankType.getForCBRank(name.getRank());
}
} catch (UnparsableException ex) {
// Oh well, worth a try
}
if (loose) {
if (scientificNameAuthorship == null && name != null) {
String ac = this.normalise(name.authorshipComplete());
Expand Down Expand Up @@ -299,7 +299,18 @@ public NameKey analyse(@Nullable NomenclaturalClassifier code, String scientific
scientificName = scientificName.trim().toUpperCase();


return new NameKey(this, code, scientificName, scientificNameAuthorship, rankType, nameType, flags);
NameKey key = new NameKey(this, code, scientificName, scientificNameAuthorship, rankType, nameType, flags);
if (name == null)
return new AnalysisResult(key, null, null, null, null, null);
else
return new AnalysisResult(
key,
name.getGenusOrAbove(),
rankType != null && !rankType.isHigherThan(RankType.GENUS) ? name.getGenusOrAbove() : null,
name.getSpecificEpithet(),
name.getInfraSpecificEpithet(),
name.getCultivarEpithet()
);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,14 @@ public NameAnalyser(AuthorComparator authorComparator, Reporter reporter) {
/**
* Convenience method for testing.
*/
public NameKey analyse(TaxonConceptInstance instance) {
public AnalysisResult analyse(TaxonConceptInstance instance) {
return this.analyse(instance.getCode(), instance.getScientificName(), instance.getScientificNameAuthorship(), instance.getRank(), instance.getTaxonomicStatus(), instance.getFlags(), false);
}

/**
* Convenience method for testing.
*/
public NameKey analyse(String code, String scientificName, String scientificNameAuthorship, String rank) {
public AnalysisResult analyse(String code, String scientificName, String scientificNameAuthorship, String rank) {
NomenclaturalClassifier canonicalCode = this.canonicaliseCode(code);
RankType rankType = this.canonicaliseRank(rank);
return this.analyse(canonicalCode, scientificName, scientificNameAuthorship, rankType, null, null, false);
Expand All @@ -92,7 +92,7 @@ public NameKey analyse(String code, String scientificName, String scientificName
*
* @return A suitable name key
*/
abstract public NameKey analyse(@Nullable NomenclaturalClassifier code, String scientificName, @Nullable String scientificNameAuthorship, @Nullable RankType rankType, @Nullable TaxonomicType taxonomicStatus, @Nullable Set<TaxonFlag> flags, boolean loose);
abstract public AnalysisResult analyse(@Nullable NomenclaturalClassifier code, String scientificName, @Nullable String scientificNameAuthorship, @Nullable RankType rankType, @Nullable TaxonomicType taxonomicStatus, @Nullable Set<TaxonFlag> flags, boolean loose);

/**
* Set the issue reporter.
Expand Down Expand Up @@ -248,4 +248,62 @@ public void report(IssueType type, String code, TaxonomicElement main, List<? ex
else
logger.warn("Report " + type.name() + " code=" + code + " main=" + main.toString() + " associated=" + associated);
}

/**
 * The result of a name analysis.
 * <p>
 * As well as the all-important name key, any fragments of information about the parsed name are also returned.
 * Every fragment is optional and is null when it is not available.
 * </p>
 * <p>
 * Instances are immutable: all fields are final and are assigned exactly once, in the constructor.
 * </p>
 */
public static class AnalysisResult {
    /** The name key produced by the analysis */
    private final NameKey nameKey;
    /** The mononomial fragment of the name, if any */
    @Nullable
    private final String mononomial;
    /** The genus fragment of the name, if any */
    @Nullable
    private final String genus;
    /** The specific epithet fragment of the name, if any */
    @Nullable
    private final String specificEpithet;
    /** The infraspecific epithet fragment of the name, if any */
    @Nullable
    private final String infraspecificEpithet;
    /** The cultivar epithet fragment of the name, if any */
    @Nullable
    private final String cultivarEpithet;

    /**
     * Construct an analysis result.
     *
     * @param nameKey The name key
     * @param mononomial The mononomial fragment (null for none)
     * @param genus The genus fragment (null for none)
     * @param specificEpithet The specific epithet fragment (null for none)
     * @param infraspecificEpithet The infraspecific epithet fragment (null for none)
     * @param cultivarEpithet The cultivar epithet fragment (null for none)
     */
    public AnalysisResult(NameKey nameKey, @Nullable String mononomial, @Nullable String genus, @Nullable String specificEpithet, @Nullable String infraspecificEpithet, @Nullable String cultivarEpithet) {
        this.nameKey = nameKey;
        this.mononomial = mononomial;
        this.genus = genus;
        this.specificEpithet = specificEpithet;
        this.infraspecificEpithet = infraspecificEpithet;
        this.cultivarEpithet = cultivarEpithet;
    }

    /**
     * Get the name key.
     *
     * @return The name key supplied at construction
     */
    public NameKey getNameKey() {
        return this.nameKey;
    }

    /**
     * Get the mononomial fragment.
     *
     * @return The mononomial, or null for none
     */
    @Nullable
    public String getMononomial() {
        return this.mononomial;
    }

    /**
     * Get the genus fragment.
     *
     * @return The genus, or null for none
     */
    @Nullable
    public String getGenus() {
        return this.genus;
    }

    /**
     * Get the specific epithet fragment.
     *
     * @return The specific epithet, or null for none
     */
    @Nullable
    public String getSpecificEpithet() {
        return this.specificEpithet;
    }

    /**
     * Get the infraspecific epithet fragment.
     *
     * @return The infraspecific epithet, or null for none
     */
    @Nullable
    public String getInfraspecificEpithet() {
        return this.infraspecificEpithet;
    }

    /**
     * Get the cultivar epithet fragment.
     *
     * @return The cultivar epithet, or null for none
     */
    @Nullable
    public String getCultivarEpithet() {
        return this.cultivarEpithet;
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,24 @@ public Map<Term, Optional<String>> getClassification() {
return classification;
}

/**
 * Offer a hint for one term of the classification.
 * <p>
 * A hint only fills a gap. A null hint is ignored, and so is a hint for a term
 * that already has a present value in the classification.
 * The classification map is created on demand when the first hint is stored.
 * </p>
 *
 * @param term The classification term
 * @param value The hint value (null for no hint)
 */
public void addClassificationHint(Term term, @Nullable String value) {
    if (value == null) {
        return;
    }
    if (this.classification == null) {
        // Nothing recorded yet, so there is nothing the hint could override
        this.classification = new HashMap<>();
    } else if (this.classification.containsKey(term) && this.classification.get(term).isPresent()) {
        // An existing value always wins over a hint
        return;
    }
    this.classification.put(term, Optional.of(value));
}

/**
* A provider/id pair to help locate this taxon.
*
Expand Down Expand Up @@ -1508,7 +1526,7 @@ public void forwardTo(TaxonConceptInstance other, Taxonomy taxonomy) {
*
* @return True if the scientific name is valid
*/
// If you plan to change this, it is called by a parallel stream, so consisder thread safety
// If you plan to change this, it is called by a parallel stream, so consider thread safety
@Override
public boolean validate(Taxonomy taxonomy) {
boolean valid = true;
Expand All @@ -1519,7 +1537,6 @@ public boolean validate(Taxonomy taxonomy) {
taxonomy.report(IssueType.VALIDATION, "instance.validation.noParent", this, null);
valid = false;
}

}
if ((this.acceptedNameUsageID != null || this.acceptedNameUsage != null) && this.accepted == null) {
if (this.provider.isLoose())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -978,10 +978,11 @@ synchronized protected void addInferredInstance(TaxonConceptInstance instance) {
public TaxonConceptInstance addInstance(TaxonConceptInstance instance) throws Exception {
NameProvider provider = instance.getProvider();
String taxonID = instance.getTaxonID();
NameAnalyser.AnalysisResult analysis;
NameKey taxonKey;
String remark, explain;

taxonKey = this.analyser.analyse(
analysis = this.analyser.analyse(
instance.getCode(),
provider.correctScientificName(instance.getScientificName()),
provider.correctScientificNameAuthorship(instance.getScientificNameAuthorship()),
Expand All @@ -990,6 +991,7 @@ public TaxonConceptInstance addInstance(TaxonConceptInstance instance) throws Ex
instance.getFlags(),
provider.isLoose()
);
taxonKey = analysis.getNameKey();
taxonKey = instance.getProvider().adjustKey(taxonKey, instance);
switch (taxonKey.getType()) {
case PLACEHOLDER:
Expand Down Expand Up @@ -1019,6 +1021,14 @@ public TaxonConceptInstance addInstance(TaxonConceptInstance instance) throws Ex
break;
}
this.count("count.load.name." + taxonKey.getType().name());

// Add classification hints from the name
if (provider.isLoose() && instance.getParentNameUsageID() == null && instance.getParentNameUsage() == null) {
instance.addClassificationHint(DwcTerm.genus, analysis.getGenus());
instance.addClassificationHint(DwcTerm.specificEpithet, analysis.getSpecificEpithet());
instance.addClassificationHint(DwcTerm.infraspecificEpithet, analysis.getInfraspecificEpithet());
}

if (!instance.isForbidden() && (explain = instance.getProvider().forbid(instance, taxonKey)) != null) {
this.count("count.load.forbidden");
this.report(IssueType.NOTE, "taxonomy.load.forbidden", instance.getTaxonID(), instance.getDisplayName(), explain);
Expand Down Expand Up @@ -1114,7 +1124,7 @@ synchronized public void insertInstance(String taxonID, NameKey taxonKey, TaxonC
*/
public TaxonomicElement findElement(NomenclaturalClassifier code, String name, NameProvider provider, RankType rank) {
NameKey nameKey = null;
nameKey = this.analyser.analyse(code, name, null, rank, null, null, provider.isLoose()).toNameKey();
nameKey = this.analyser.analyse(code, name, null, rank, null, null, provider.isLoose()).getNameKey().toNameKey();
if (nameKey.isUncoded())
return this.bareNames.get(nameKey);
if (nameKey.isUnranked())
Expand Down Expand Up @@ -1726,7 +1736,7 @@ public List<Map<Term,String>> getIndexValues(Term type, String taxonID) throws I
public void sample(int samples) throws IndexBuilderException {
this.sample = new HashSet<>(samples);
Random random = new Random();
int jump = Math.max(10, this.instances.size() / 1000);
int jump = Math.max(10, this.instances.size() / samples) + 1;
Iterator<TaxonConceptInstance> iterator = this.instances.values().iterator();

if (!iterator.hasNext())
Expand All @@ -1740,17 +1750,20 @@ public void sample(int samples) throws IndexBuilderException {
iterator = this.instances.values().iterator(); // Start again
current = iterator.next().getContainer();
}
while (current != null && !this.sample.contains(current)) {
TaxonConceptInstance r = current.getRepresentative();
if (r == null) {
current = null;
} else {
this.sample.add(current);
if (r.isSynonym())
r = r.getAccepted() == null ? null : r.getAccepted().getRepresentative();
else
r = r.getParent() == null ? null : r.getParent().getRepresentative();
current = r == null ? null : r.getContainer();
Queue<TaxonConcept> processing = new LinkedList<>();
processing.offer(current);
while (!processing.isEmpty()) {
TaxonConcept tc = processing.remove();
if (this.sample.contains(tc))
continue;
this.sample.add(tc);
for (TaxonConceptInstance tci: tc.getInstances()) {
tci = tc.getResolved(tci);
processing.add(tci.getContainer());
if (tci.getResolvedAccepted() != null)
processing.add(tci.getResolvedAccepted().getContainer());
if (tci.getResolvedParent() != null)
processing.add(tci.getResolvedParent().getContainer());
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -619,11 +619,7 @@ protected void createExtraIdIndex(IndexWriter iw, File idFile) throws Exception
while ((values = reader.readNext()) != null) {

if (values != null && values.length >= 3) {
Document doc = new Document();
//doc.add(new Field("lsid", values[2], Store.NO, Index.NOT_ANALYZED));
NameIndexField.LSID.store(values[2], doc);
//doc.add(new Field("reallsid", values[1], Store.YES, Index.NO));
NameIndexField.REAL_LSID.store(values[1], doc);
Document doc = this.createIdentifierDocument(values[2], null, values[1]);
iw.addDocument(doc);
}
}
Expand Down Expand Up @@ -744,6 +740,18 @@ protected Document createCommonNameDocument(String cn, String sn, String lsid, S
return doc;
}


/**
 * Build an index document for an identifier.
 * <p>
 * The identifier is stored in the LSID field and the supplied lsid in the REAL_LSID field.
 * The scientific name is optional and is only stored, in the NAME field, when it is not null.
 * </p>
 *
 * @param id The identifier, stored in the LSID field
 * @param sn The scientific name (null to omit the NAME field)
 * @param lsid The value stored in the REAL_LSID field
 *
 * @return The newly created document
 */
protected Document createIdentifierDocument(String id, String sn, String lsid) {
    final Document identifierDoc = new Document();
    final boolean hasScientificName = sn != null;

    if (hasScientificName) {
        NameIndexField.NAME.store(sn, identifierDoc);
    }
    NameIndexField.LSID.store(id, identifierDoc);
    NameIndexField.REAL_LSID.store(lsid, identifierDoc);
    return identifierDoc;
}

/**
 * Create an ALA index document from a name, its identifiers, an author and a classification.
 * <p>
 * Delegates to the twelve-argument form of this method. The four arguments between the author
 * and the classification are passed as null, null, 0 and 0, the two arguments after the classification
 * as null, and the final argument as {@code MatchMetrics.DEFAULT_PRIORITY}.
 * </p>
 *
 * @param name The name
 * @param id The identifier
 * @param lsid The LSID
 * @param author The author
 * @param cl The classification
 *
 * @return The document built by the delegate
 */
public Document createALAIndexDocument(String name, String id, String lsid, String author, LinnaeanRankClassification cl) {
    return this.createALAIndexDocument(
        name, id, lsid, author,
        null, null, 0, 0,
        cl,
        null, null,
        MatchMetrics.DEFAULT_PRIORITY
    );
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,8 @@ public boolean create(File namesDwc) throws Exception{
writer.forceMerge(1);
log.info("Loading vernacular for " + namesDwc);
this.indexCommonNameExtension(archive);
log.info("Loading identfiiers for " + namesDwc);
this.indexIdentifierExtension(archive);
return true;
}

Expand Down Expand Up @@ -322,6 +324,7 @@ private boolean loadCommonNames(File verncacularDwc) throws Exception {
return true;
}


/**
* Index the common names CSV file supplied.
*
Expand Down Expand Up @@ -397,7 +400,7 @@ private void indexCommonNameExtension(Archive archive) throws Exception {
this.vernacularIndexWriter.addDocument(doc);
count++;
}
if(i % 1000 == 0){
if(i % 10000 == 0){
log.info("Processed " + i + " common names with " + count + " added to index");
}
}
Expand All @@ -407,6 +410,45 @@ private void indexCommonNameExtension(Archive archive) throws Exception {
}


/**
 * Index the identifier extension of an archive.
 * <p>
 * Each row of the extension pairs a taxon identifier (the record id) with an additional identifier.
 * For every new (taxon, identifier) pair where the identifier differs from the taxon identifier,
 * the taxon is looked up in the loading index by "lsid". If it is found, an identifier document
 * carrying the taxon's name is added to the identifier index.
 * Rows with no taxon identifier or no identifier value are counted but otherwise skipped.
 * </p>
 * <p>
 * If the archive has no identifier extension, this is logged and nothing is indexed.
 * On completion the identifier index is committed and merged.
 * </p>
 *
 * @param archive The archive to read the identifier extension from
 *
 * @throws Exception if unable to read the extension or update the index
 */
private void indexIdentifierExtension(Archive archive) throws Exception {
    ArchiveFile identifierArchiveFile = archive.getExtension(GbifTerm.Identifier);
    if (identifierArchiveFile == null) {
        log.info("No identifier extension found in " + archive.getLocation());
        return;
    }

    // Identifiers already handled, grouped by taxon, so that repeated rows are only processed once
    Map<String, Set<String>> seen = new HashMap<>();
    int i = 0, count = 0;

    log.info("Starting to load the identifiers extension from " + archive.getLocation());
    // NOTE(review): if the archive iterator is closeable it is never closed here - confirm against the dwca-io version in use
    Iterator<Record> iter = identifierArchiveFile.iterator();
    while (iter.hasNext()) {
        i++;
        Record record = iter.next();
        String taxonID = record.id();
        String identifier = record.value(DcTerm.identifier);
        // A row without a taxon or an identifier cannot be indexed; an identifier equal to the taxonID adds nothing
        if (taxonID != null && identifier != null && !taxonID.equals(identifier)) {
            Set<String> seenIds = seen.computeIfAbsent(taxonID, k -> new HashSet<>());
            // add() returns false when the identifier has already been handled for this taxon
            if (seenIds.add(identifier)) {
                TopDocs result = getLoadIdxResults(null, "lsid", taxonID, 1);
                if (result.totalHits.value > 0) {
                    // Get the scientific name from the loaded taxon so it can be stored with the identifier
                    Document sciNameDoc = lsearcher.doc(result.scoreDocs[0].doc);
                    Document doc = createIdentifierDocument(identifier, sciNameDoc.get(NameIndexField.NAME.toString()), taxonID);
                    this.idWriter.addDocument(doc);
                    count++;
                }
            }
        }
        if (i % 10000 == 0) {
            log.info("Processed " + i + " identifiers with " + count + " added to index");
        }
    }
    log.info("Finished processing " + i + " identifiers with " + count + " added to index");
    this.idWriter.commit();
    this.idWriter.forceMerge(1);
}


/**
* Creates a loading index to use to generate the hierarchy including the left right values.
*
Expand Down Expand Up @@ -1172,9 +1214,10 @@ public static void main(String[] args) {
indexer.indexCommonNames(commonNameFile);
indexer.createIrmng(irmngFile);
indexer.createExtraIdIndex(identifiersFile);
for (File dwca: dwcas)
for (File dwca: dwcas) {
if (indexer.loadCommonNames(dwca))
used.add(dwca);
}
indexer.commit();
for (File dwca: dwcas)
if (!used.contains(dwca))
Expand Down
Loading