Skip to content

Commit

Permalink
Merge branch 'master' of origin
Browse files Browse the repository at this point in the history
  • Loading branch information
AngledLuffa authored and Stanford NLP committed Sep 11, 2013
1 parent 9b7b5ae commit 1b9c252
Show file tree
Hide file tree
Showing 8 changed files with 146 additions and 693 deletions.
340 changes: 0 additions & 340 deletions doc/classify/LICENSE.txt

This file was deleted.

340 changes: 0 additions & 340 deletions doc/tregex/LICENSE.txt

This file was deleted.

4 changes: 2 additions & 2 deletions src/edu/stanford/nlp/ie/machinereading/MachineReading.java
Original file line number Diff line number Diff line change
Expand Up @@ -368,10 +368,10 @@ protected void train(Annotation training, int partition) throws Exception {
MachineReadingProperties.logger.fine("forceRetraining = " + this.forceRetraining+ ", modelFile.exists = " + modelFile.exists());
if(! this.forceRetraining&& modelFile.exists()){
MachineReadingProperties.logger.info("Loading entity extraction model from " + modelName + " ...");
entityExtractor = BasicEntityExtractor.load(modelName, MachineReadingProperties.entityClassifierClass, false);
entityExtractor = BasicEntityExtractor.load(modelName, MachineReadingProperties.entityClassifier, false);
} else {
MachineReadingProperties.logger.info("Training entity extraction model...");
entityExtractor = makeEntityExtractor(MachineReadingProperties.entityClassifierClass, MachineReadingProperties.entityGazetteerPath);
entityExtractor = makeEntityExtractor(MachineReadingProperties.entityClassifier, MachineReadingProperties.entityGazetteerPath);
entityExtractor.train(training);
MachineReadingProperties.logger.info("Serializing entity extraction model to " + modelName + " ...");
entityExtractor.save(modelName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ public class MachineReadingProperties {
@Option(name="entityGazetteerPath",gloss="location of entity gazetteer file (if you're using one) -- this is a temporary option")
static public String entityGazetteerPath;

@Option(name="entityClassifierClass",gloss="entity extractor class to use")
static public Class<BasicEntityExtractor> entityClassifierClass = edu.stanford.nlp.ie.machinereading.BasicEntityExtractor.class;
@Option(name="entityClassifier",gloss="entity extractor class to use")
static public Class<BasicEntityExtractor> entityClassifier = edu.stanford.nlp.ie.machinereading.BasicEntityExtractor.class;

@Option(name="entityResultsPrinters",gloss="comma-separated list of ResultsPrinter subclasses to use for printing the results of entity extraction")
static public String entityResultsPrinters = "";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,26 @@
# Pipeline options
annotators = pos, lemma, parse
# Below is outdated. Now just use standard annotators from models jar. Remember to include the models jar in your classpath!
# pos.model = /u/nlp/data/pos-tagger/wsj3t0-18-bidirectional/bidirectional-distsim-wsj-0-18.tagger
# ner.model.3class = /u/nlp/data/ner/goodClassifiers/all.3class.distsim.crf.ser.gz
# ner.model.7class = /u/nlp/data/ner/goodClassifiers/muc.distsim.crf.ser.gz
# ner.model.MISCclass = /u/nlp/data/ner/goodClassifiers/conll.distsim.crf.ser.gz
# parse.model = /u/nlp/data/lexparser/englishPCFG.ser.gz
pos.model = /u/nlp/data/pos-tagger/wsj3t0-18-bidirectional/bidirectional-distsim-wsj-0-18.tagger
ner.model.3class = /u/nlp/data/ner/goodClassifiers/all.3class.distsim.crf.ser.gz
ner.model.7class = /u/nlp/data/ner/goodClassifiers/muc.distsim.crf.ser.gz
ner.model.MISCclass = /u/nlp/data/ner/goodClassifiers/conll.distsim.crf.ser.gz
parse.model = /u/nlp/data/lexparser/englishPCFG.ser.gz
parse.maxlen = 100

# MachineReading properties
datasetReaderClass = edu.stanford.nlp.ie.machinereading.domains.roth.RothCONLL04Reader

#Data directory for training
trainPath = /u/nlp/data/RothCONLL04/conll04.corp

crossValidate = true
kfold = 2
kfold = 10

# where to save training sentences
serializedTrainingSentencesPath = tmp/roth_sentences.ser

serializedEntityExtractorPath = tmp/roth_entity_model.ser

# where to store the output of the extractor (sentence objects with relations generated by the model)
serializedRelationExtractorPath = tmp/roth_relation_model.ser

Expand All @@ -35,9 +36,9 @@ extractEvents = false
extractEntities = false

# The set chosen by feature selection using RothCONLL04:
# relationFeatures = arg_words,arg_type,dependency_path_lowlevel,dependency_path_words,surface_path_POS,entities_between_args,full_tree_path
relationFeatures = arg_words,arg_type,dependency_path_lowlevel,dependency_path_words,surface_path_POS,entities_between_args,full_tree_path
# The above features plus the features used in Bjorne BioNLP09:
relationFeatures = arg_words,arg_type,dependency_path_lowlevel,dependency_path_words,surface_path_POS,entities_between_args,full_tree_path,dependency_path_POS_unigrams,dependency_path_word_n_grams,dependency_path_POS_n_grams,dependency_path_edge_lowlevel_n_grams,dependency_path_edge-node-edge-grams_lowlevel,dependency_path_node-edge-node-grams_lowlevel,dependency_path_directed_bigrams,dependency_path_edge_unigrams,same_head,entity_counts
# relationFeatures = arg_words,arg_type,dependency_path_lowlevel,dependency_path_words,surface_path_POS,entities_between_args,full_tree_path,dependency_path_POS_unigrams,dependency_path_word_n_grams,dependency_path_POS_n_grams,dependency_path_edge_lowlevel_n_grams,dependency_path_edge-node-edge-grams_lowlevel,dependency_path_node-edge-node-grams_lowlevel,dependency_path_directed_bigrams,dependency_path_edge_unigrams,same_head,entity_counts



3 changes: 3 additions & 0 deletions src/edu/stanford/nlp/pipeline/Annotator.java
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ public String toString() {
public static final String STANFORD_TRUECASE = "truecase";
public static final String STANFORD_PARSE = "parse";
public static final String STANFORD_DETERMINISTIC_COREF = "dcoref";
public static final String STANFORD_RELATION = "relation";


public static final Requirement TOKENIZE_REQUIREMENT = new Requirement(STANFORD_TOKENIZE);
public static final Requirement CLEAN_XML_REQUIREMENT = new Requirement(STANFORD_CLEAN_XML);
Expand All @@ -115,6 +117,7 @@ public String toString() {
public static final Requirement TRUECASE_REQUIREMENT = new Requirement(STANFORD_TRUECASE);
public static final Requirement PARSE_REQUIREMENT = new Requirement(STANFORD_PARSE);
public static final Requirement DETERMINISTIC_COREF_REQUIREMENT = new Requirement(STANFORD_DETERMINISTIC_COREF);
public static final Requirement RELATION_EXTRACTOR_REQUIREMENT = new Requirement(STANFORD_RELATION);

/**
* These are annotators which StanfordCoreNLP does not know how to
Expand Down
4 changes: 4 additions & 0 deletions src/edu/stanford/nlp/pipeline/DefaultPaths.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ public class DefaultPaths {
public static final String DEFAULT_NFL_ENTITY_MODEL = "edu/stanford/nlp/models/machinereading/nfl/nfl_entity_model.ser";
public static final String DEFAULT_NFL_RELATION_MODEL = "edu/stanford/nlp/models/machinereading/nfl/nfl_relation_model.ser";
public static final String DEFAULT_NFL_GAZETTEER = "edu/stanford/nlp/models/machinereading/nfl/NFLgazetteer.txt";

public static final String DEFAULT_RELEX_ENTITY_MODEL = "edu/stanford/nlp/models/machinereading/nfl/nfl_entity_model.ser";
public static final String DEFAULT_RELEX_RELATION_MODEL = "edu/stanford/nlp/models/machinereading/nfl/nfl_relation_model.ser";
public static final String DEFAULT_RELEX_GAZETTEER = "edu/stanford/nlp/models/machinereading/nfl/NFLgazetteer.txt";


private DefaultPaths() {
Expand Down
125 changes: 125 additions & 0 deletions src/edu/stanford/nlp/pipeline/RelationExtractorAnnotator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package edu.stanford.nlp.pipeline;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import edu.stanford.nlp.ie.machinereading.BasicEntityExtractor;
import edu.stanford.nlp.ie.machinereading.BasicRelationExtractor;
import edu.stanford.nlp.ie.machinereading.Extractor;
import edu.stanford.nlp.ie.machinereading.MachineReading;
import edu.stanford.nlp.ie.machinereading.domains.roth.RothCONLL04Reader;
import edu.stanford.nlp.ie.machinereading.structure.EntityMention;
import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
import edu.stanford.nlp.ie.machinereading.structure.RelationMention;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;

/**
 * An {@link Annotator} that runs the MachineReading entity and relation
 * extractors over every sentence and stores the resulting
 * {@link EntityMention}s and {@link RelationMention}s back on the original
 * annotation.
 *
 * Recognized properties:
 * <ul>
 *   <li>{@code relex.verbose} &mdash; if true, print extracted entities and
 *       relations to stderr (default {@code false})</li>
 *   <li>{@code relex.entity.model} &mdash; path to the serialized entity
 *       extraction model (default {@link DefaultPaths#DEFAULT_RELEX_ENTITY_MODEL})</li>
 *   <li>{@code relex.relation.model} &mdash; path to the serialized relation
 *       extraction model (default {@link DefaultPaths#DEFAULT_RELEX_RELATION_MODEL})</li>
 * </ul>
 *
 * Requires tokenization, sentence splitting, POS tags, NER, and parse trees
 * (see {@link #requires()}).
 */
public class RelationExtractorAnnotator implements Annotator {

  /** MachineReading engine used for entity + relation extraction. */
  final MachineReading mr;

  /**
   * Whether to log extracted mentions to stderr.
   * NOTE: this was previously a static field assigned in the constructor, so the
   * most recently constructed annotator silently changed verbosity for every
   * instance; it is now per-instance and final.
   */
  private final boolean verbose;

  /**
   * Loads the entity and relation models named by the given properties and
   * builds the MachineReading pipeline used by {@link #annotate(Annotation)}.
   *
   * @param props pipeline properties; see the class Javadoc for the keys read
   * @throws RuntimeException if either model cannot be loaded (cause preserved)
   */
  public RelationExtractorAnnotator(Properties props) {
    verbose = Boolean.parseBoolean(props.getProperty("relex.verbose", "false"));
    String entityModel =
        props.getProperty("relex.entity.model", DefaultPaths.DEFAULT_RELEX_ENTITY_MODEL);
    String relationModel =
        props.getProperty("relex.relation.model", DefaultPaths.DEFAULT_RELEX_RELATION_MODEL);
    try {
      Extractor entityExtractor =
          BasicEntityExtractor.load(entityModel, BasicEntityExtractor.class, true);
      Extractor relationExtractor = BasicRelationExtractor.load(relationModel);
      mr = MachineReading.makeMachineReadingForAnnotation(new RothCONLL04Reader(),
          entityExtractor, relationExtractor, null, null, null, true, verbose);
    } catch (Exception e) {
      // Chain the cause in the wrapper exception; do not also printStackTrace(),
      // which would double-report the same error.
      throw new RuntimeException(
          "Could not load relation extraction models (entity: " + entityModel
              + ", relation: " + relationModel + ")", e);
    }
  }

  /**
   * Extracts entities and relations for every sentence of the annotation and
   * copies the resulting mentions back onto the original sentences, reverting
   * any token texts the MachineReading tokenizer rewrote.
   */
  @Override
  public void annotate(Annotation annotation) {
    // extract entities and relations; MachineReading returns a (possibly
    // re-tokenized) copy of the input annotation
    Annotation output = mr.annotate(annotation);

    // transfer entities/relations back to the original annotation
    List<CoreMap> outputSentences = output.get(SentencesAnnotation.class);
    List<CoreMap> origSentences = annotation.get(SentencesAnnotation.class);
    for (int i = 0; i < outputSentences.size(); i++) {
      CoreMap outSent = outputSentences.get(i);
      CoreMap origSent = origSentences.get(i);
      transferMentions(outSent, origSent);
      revertTokenTexts(annotation, origSent);
    }
  }

  /**
   * Copies entity and relation mentions from a MachineReading output sentence
   * onto the corresponding original sentence, logging them when verbose.
   */
  private void transferMentions(CoreMap outSent, CoreMap origSent) {
    // set entities
    List<EntityMention> entities =
        outSent.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    origSent.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, entities);
    if (verbose && entities != null) {
      System.err.println("Extracted the following entities:");
      for (EntityMention e : entities) {
        System.err.println("\t" + e);
      }
    }

    // set relations
    List<RelationMention> relations =
        outSent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    origSent.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, relations);
    if (verbose && relations != null) {
      System.err.println("Extracted the following relations:");
      for (RelationMention r : relations) {
        // suppress the catch-all "unrelated" label in the verbose dump
        if (!r.getType().equals(RelationMention.UNRELATED)) {
          System.err.println(r);
        }
      }
    }
  }

  /**
   * The MachineReading tokenizer may have changed some token texts (e.g.,
   * "10-5" -> "10 to 5"). Restores every token of the sentence -- and the
   * matching parse-tree leaves, if a tree is present -- to the text found at
   * the token's character offsets in the original document.
   *
   * @throws RuntimeException if the annotation carries no document text
   */
  private static void revertTokenTexts(Annotation annotation, CoreMap origSent) {
    final boolean verboseRevert = false;  // flip by hand to debug the revert logic
    String origText = annotation.get(CoreAnnotations.TextAnnotation.class);
    if (origText == null) throw new RuntimeException("Found corpus without text!");
    if (verboseRevert) System.err.println("REVERTING SENT: " + origSent.get(TextAnnotation.class));

    List<CoreLabel> tokens = origSent.get(TokensAnnotation.class);
    // remember (token position, original text) for every token we change so the
    // parse-tree leaves can be fixed up afterwards
    List<Pair<Integer, String>> changes = new ArrayList<Pair<Integer, String>>();
    int position = 0;
    for (CoreLabel token : tokens) {
      String tokenText = token.word();
      if (verboseRevert) System.err.println("TOKEN " + tokenText + " " + token.beginPosition() + " " + token.endPosition());
      String origToken = origText.substring(token.beginPosition(), token.endPosition());
      if (!origToken.equals(tokenText)) {
        if (verboseRevert) System.err.println("Found difference at position #" + position + ": token [" + tokenText + "] vs text [" + origToken + "]");
        token.set(TextAnnotation.class, origToken);
        changes.add(new Pair<Integer, String>(position, origToken));
      }
      position++;
    }

    // revert Tree leaves as well, if tokens were modified
    Tree tree = origSent.get(TreeAnnotation.class);
    if (tree != null && !changes.isEmpty()) {
      List<Tree> leaves = tree.getLeaves();
      for (Pair<Integer, String> change : changes) {
        Tree leaf = leaves.get(change.first);
        if (verboseRevert) System.err.println("CHANGING LEAF " + leaf);
        leaf.setValue(change.second);
        if (verboseRevert) System.err.println("NEW LEAF: " + leaf);
      }
    }
  }

  /** This annotator needs tokens, sentences, POS tags, NER labels, and parses. */
  @Override
  public Set<Requirement> requires() {
    return new ArraySet<Requirement>(TOKENIZE_REQUIREMENT, SSPLIT_REQUIREMENT,
        POS_REQUIREMENT, NER_REQUIREMENT, PARSE_REQUIREMENT);
  }

  /** This annotator produces relation (and entity) mention annotations. */
  @Override
  public Set<Requirement> requirementsSatisfied() {
    return Collections.singleton(RELATION_EXTRACTOR_REQUIREMENT);
  }

}

0 comments on commit 1b9c252

Please sign in to comment.