Skip to content

Commit

Permalink
Build SNOMED CT index within 1GB memory 🚀
Browse files Browse the repository at this point in the history
  • Loading branch information
kaicode committed Sep 15, 2023
1 parent 9221d53 commit f502973
Show file tree
Hide file tree
Showing 7 changed files with 242 additions and 100 deletions.
1 change: 1 addition & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
<!-- For RF2 Import -->
<groupId>org.snomed.otf</groupId>
<artifactId>snomed-boot</artifactId>
<version>5.0.1-SNAPSHOT</version>
<exclusions>
<exclusion>
<groupId>com.google.collections</groupId>
Expand Down
67 changes: 0 additions & 67 deletions src/main/java/org/snomed/snowstormmicro/loading/ImportService.java

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import org.ihtsdo.otf.snomedboot.ReleaseImportException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.snomed.snowstormmicro.loading.ImportService;
import org.snomed.snowstormmicro.snomedimport.ImportService;
import org.snomed.snowstormmicro.util.TimerUtil;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package org.snomed.snowstormmicro.snomedimport;

import org.ihtsdo.otf.snomedboot.factory.ImpotentComponentFactory;
import org.snomed.snowstormmicro.domain.Concept;
import org.snomed.snowstormmicro.domain.Concepts;
import org.snomed.snowstormmicro.domain.Description;

import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ComponentFactoryWithDescriptionBatch extends ImpotentComponentFactory {

private final ComponentFactoryWithoutDescriptions baseComponents;
private final Collection<Long> conceptIdBatch;
private final Map<Long, Description> descriptionSynonymMap;
private final Concept dummyConcept;

public ComponentFactoryWithDescriptionBatch(ComponentFactoryWithoutDescriptions baseComponents, Collection<Long> conceptIdBatch) {
this.baseComponents = baseComponents;
descriptionSynonymMap = new HashMap<>();
dummyConcept = new Concept();
this.conceptIdBatch = conceptIdBatch;
}

@Override
public void newDescriptionState(String id, String effectiveTime, String active, String moduleId, String conceptId, String languageCode, String typeId, String term, String caseSignificanceId) {
if (active.equals("1") && (typeId.equals(Concepts.FSN) || typeId.equals(Concepts.SYNONYM)) && conceptIdBatch.contains(Long.parseLong(conceptId))) {
Description description = new Description(id, languageCode, typeId.equals(Concepts.FSN), term);
baseComponents.getConceptMap().getOrDefault(Long.parseLong(conceptId), dummyConcept).addDescription(description);
if (typeId.equals(Concepts.SYNONYM)) {
descriptionSynonymMap.put(Long.parseLong(id), description);
}
}
}

@Override
public void newReferenceSetMemberState(String[] fieldNames, String id, String effectiveTime, String active, String moduleId, String refsetId, String referencedComponentId, String... otherValues) {
if (active.equals("1")) {
if (fieldNames.length == 7 && fieldNames[6].equals("acceptabilityId") && otherValues[0].equals(Concepts.PREFERRED)) {
// Active lang refset member
Description description = descriptionSynonymMap.get(Long.parseLong(referencedComponentId));
if (description != null) {
description.getPreferredLangRefsets().add(refsetId);
}
}
}
}

public Map<Long, Concept> getConceptMap() {
return baseComponents.getConceptMap();
}

public Collection<Long> getConceptIdBatch() {
return conceptIdBatch;
}
}
Original file line number Diff line number Diff line change
@@ -1,23 +1,21 @@
package org.snomed.snowstormmicro.loading;
package org.snomed.snowstormmicro.snomedimport;

import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
import org.ihtsdo.otf.snomedboot.factory.ImpotentComponentFactory;
import org.snomed.snowstormmicro.domain.Concept;
import org.snomed.snowstormmicro.domain.Concepts;
import org.snomed.snowstormmicro.domain.Description;

import java.util.HashMap;
import java.util.Map;

public class ComponentFactoryImpl extends ImpotentComponentFactory {
public class ComponentFactoryWithoutDescriptions extends ImpotentComponentFactory {

private final Map<Long, Concept> conceptMap;
private final Map<Long, Description> descriptionSynonymMap;
private final Concept dummyConcept;
private Integer maxDate = null;

public ComponentFactoryImpl() {
conceptMap = new HashMap<>();
descriptionSynonymMap = new HashMap<>();
public ComponentFactoryWithoutDescriptions() {
conceptMap = new Long2ObjectOpenHashMap<>();
dummyConcept = new Concept();
}

Expand All @@ -29,44 +27,48 @@ public void newConceptState(String conceptId, String effectiveTime, String activ

@Override
public void newDescriptionState(String id, String effectiveTime, String active, String moduleId, String conceptId, String languageCode, String typeId, String term, String caseSignificanceId) {
if (active.equals("1") && (typeId.equals(Concepts.FSN) || typeId.equals(Concepts.SYNONYM))) {
Description description = new Description(id, languageCode, typeId.equals(Concepts.FSN), term);
conceptMap.getOrDefault(Long.parseLong(conceptId), dummyConcept).addDescription(description);
if (typeId.equals(Concepts.SYNONYM)) {
descriptionSynonymMap.put(Long.parseLong(id), description);
}
}
collectMaxEffectiveTime(effectiveTime);
}

@Override
public void newRelationshipState(String id, String effectiveTime, String active, String moduleId, String sourceId, String destinationId, String relationshipGroup, String typeId, String characteristicTypeId, String modifierId) {
if (active.equals("1") && typeId.equals(Concepts.IS_A) && characteristicTypeId.equals(Concepts.INFERRED_RELATIONSHIP)) {
Concept parent = conceptMap.get(Long.parseLong(destinationId));
if (parent != null) {
conceptMap.getOrDefault(Long.parseLong(sourceId), dummyConcept).addParent(parent);
if (active.equals("1") && !characteristicTypeId.equals(Concepts.STATED_RELATIONSHIP)) {
if (typeId.equals(Concepts.IS_A)) {
Concept parent = conceptMap.get(Long.parseLong(destinationId));
if (parent != null) {
conceptMap.getOrDefault(Long.parseLong(sourceId), dummyConcept).addParent(parent);
}
} else {
conceptMap.getOrDefault(Long.parseLong(sourceId), dummyConcept)
.addRelationship(Integer.parseInt(relationshipGroup), Long.parseLong(typeId), Long.parseLong(destinationId), null);
}
}
collectMaxEffectiveTime(effectiveTime);
}

@Override
public void newConcreteRelationshipState(String id, String effectiveTime, String active, String moduleId, String sourceId, String value, String relationshipGroup, String typeId, String characteristicTypeId, String modifierId) {
if (active.equals("1") && !characteristicTypeId.equals(Concepts.STATED_RELATIONSHIP)) {
conceptMap.getOrDefault(Long.parseLong(sourceId), dummyConcept)
.addRelationship(Integer.parseInt(relationshipGroup), Long.parseLong(typeId), null, value);
}
}

@Override
public void newReferenceSetMemberState(String[] fieldNames, String id, String effectiveTime, String active, String moduleId, String refsetId, String referencedComponentId, String... otherValues) {
if (active.equals("1")) {
if (fieldNames.length == 6) {
// Active simple refset member
conceptMap.getOrDefault(Long.parseLong(referencedComponentId), dummyConcept).addMembership(refsetId);
} else if (fieldNames.length == 7 && fieldNames[6].equals("acceptabilityId") && otherValues[0].equals(Concepts.PREFERRED)) {
// Active lang refset member
Description description = descriptionSynonymMap.get(Long.parseLong(referencedComponentId));
if (description != null) {
description.getPreferredLangRefsets().add(refsetId);
}
}
}
collectMaxEffectiveTime(effectiveTime);
}

public void clearDescriptions() {
conceptMap.values().forEach(concept -> concept.getDescriptions().clear());
}

private void collectMaxEffectiveTime(String effectiveTime) {
if (maxDate == null || (effectiveTime != null && Integer.parseInt(effectiveTime) > maxDate)) {
maxDate = Integer.parseInt(effectiveTime);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package org.snomed.snowstormmicro.snomedimport;

import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import org.ihtsdo.otf.snomedboot.ReleaseImportException;
import org.ihtsdo.otf.snomedboot.ReleaseImporter;
import org.ihtsdo.otf.snomedboot.factory.ComponentFactory;
import org.ihtsdo.otf.snomedboot.factory.ComponentFactoryProvider;
import org.ihtsdo.otf.snomedboot.factory.LoadingProfile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.snomed.snowstormmicro.domain.Concept;
import org.snomed.snowstormmicro.service.CodeSystemRepository;
import org.snomed.snowstormmicro.util.TimerUtil;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;

import static com.google.common.collect.Iterables.partition;
import static java.lang.String.format;

@Service
public class ImportService {

@Autowired
private CodeSystemRepository codeSystemRepository;

@Value("${index.path}")
private String indexPath;

private final Logger logger = LoggerFactory.getLogger(getClass());

private final LoadingProfile loadingProfile = LoadingProfile.light
.withAllRefsets()
.withInactiveConcepts()
.withIncludedReferenceSetFilenamePattern(".*der2_Refset.*|.*der2_cRefset.*");

public void importRelease(Set<String> releaseArchivePaths, String versionUri) throws IOException, ReleaseImportException {
Set<InputStream> archiveInputStream = new HashSet<>();
for (String filePath : releaseArchivePaths) {
File file = new File(filePath);
if (!file.isFile()) {
throw new IOException(format("File not found %s", file.getAbsolutePath()));
}
archiveInputStream.add(new FileInputStream(file));
}

ReleaseImporter releaseImporter = new ReleaseImporter();
try (IndexCreator indexCreator = new IndexCreator(indexPath, codeSystemRepository)) {

indexCreator.recreateIndex();
indexCreator.createCodeSystem(versionUri);

ComponentFactoryWithoutDescriptions componentFactoryBase = new ComponentFactoryWithoutDescriptions();
ComponentFactoryProvider componentFactoryProvider = new ComponentFactoryProvider() {

private boolean firstFactoryProvided;
private Iterator<List<Long>> conceptIdBatchIterator;
private final TimerUtil timer = new TimerUtil("Loading");
private Integer batchNumber = 0;

@Override
public ComponentFactory getNextComponentFactory() {
if (!firstFactoryProvided) {
firstFactoryProvided = true;
return componentFactoryBase;
}

if (conceptIdBatchIterator == null) {
timer.checkpoint("Loaded concepts");
Set<Long> conceptIds = componentFactoryBase.getConceptMap().keySet();
conceptIdBatchIterator = partition(conceptIds, 50_000).iterator();
}
if (conceptIdBatchIterator.hasNext()) {
batchNumber++;
componentFactoryBase.clearDescriptions();
return new ComponentFactoryWithDescriptionBatch(componentFactoryBase, new LongOpenHashSet(conceptIdBatchIterator.next())) {
@Override
public LoadingProfile getLoadingProfile() {
return LoadingProfile.light
.withoutConcepts()
.withoutTextDefinitions()
.withoutRelationships()
.withoutIdentifiers()
.withIncludedReferenceSetFilenamePattern(".?der2_cRefset_Language.*");
}

@Override
public void loadingComponentsCompleted() throws ReleaseImportException {
timer.checkpoint("Loaded description batch " + batchNumber);
try {
Collection<Long> conceptIdBatch = getConceptIdBatch();
List<Concept> conceptBatch = getConceptMap().values().stream()
.filter(concept -> conceptIdBatch.contains(Long.parseLong(concept.getConceptId())))
.toList();
indexCreator.createConceptBatch(conceptBatch);
timer.checkpoint("Written to index, batch " + batchNumber);
} catch (IOException e) {
throw new ReleaseImportException("Failed to write concept batch to index.", e);
}
}
};
} else {
timer.finish();
return null;
}
}
};

logger.info("Reading release files");
releaseImporter.loadEffectiveSnapshotReleaseFileStreams(archiveInputStream, loadingProfile, componentFactoryProvider, false);

// logger.info("Writing lucene index");
// try (IndexCreator indexCreator = new IndexCreator(directory, codeSystemRepository)) {
// indexCreator.createIndex(componentFactory, versionUri);
// }
}
}
}
Loading

0 comments on commit f502973

Please sign in to comment.