Skip to content

Commit

Permalink
Merge branch 'master' of origin
Browse files Browse the repository at this point in the history
  • Loading branch information
AngledLuffa authored and Stanford NLP committed Sep 11, 2013
1 parent 9b7b5ae commit 1b9c252
Show file tree
Hide file tree
Showing 8 changed files with 146 additions and 693 deletions.
340 changes: 0 additions & 340 deletions doc/classify/LICENSE.txt

This file was deleted.

340 changes: 0 additions & 340 deletions doc/tregex/LICENSE.txt

This file was deleted.

4 changes: 2 additions & 2 deletions src/edu/stanford/nlp/ie/machinereading/MachineReading.java
Original file line number Diff line number Diff line change
Expand Up @@ -368,10 +368,10 @@ protected void train(Annotation training, int partition) throws Exception {
MachineReadingProperties.logger.fine("forceRetraining = " + this.forceRetraining+ ", modelFile.exists = " + modelFile.exists());
if(! this.forceRetraining&& modelFile.exists()){
MachineReadingProperties.logger.info("Loading entity extraction model from " + modelName + " ...");
entityExtractor = BasicEntityExtractor.load(modelName, MachineReadingProperties.entityClassifierClass, false);
entityExtractor = BasicEntityExtractor.load(modelName, MachineReadingProperties.entityClassifier, false);
} else {
MachineReadingProperties.logger.info("Training entity extraction model...");
entityExtractor = makeEntityExtractor(MachineReadingProperties.entityClassifierClass, MachineReadingProperties.entityGazetteerPath);
entityExtractor = makeEntityExtractor(MachineReadingProperties.entityClassifier, MachineReadingProperties.entityGazetteerPath);
entityExtractor.train(training);
MachineReadingProperties.logger.info("Serializing entity extraction model to " + modelName + " ...");
entityExtractor.save(modelName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ public class MachineReadingProperties {
@Option(name="entityGazetteerPath",gloss="location of entity gazetteer file (if you're using one) -- this is a temporary option")
static public String entityGazetteerPath;

@Option(name="entityClassifierClass",gloss="entity extractor class to use")
static public Class<BasicEntityExtractor> entityClassifierClass = edu.stanford.nlp.ie.machinereading.BasicEntityExtractor.class;
@Option(name="entityClassifier",gloss="entity extractor class to use")
static public Class<BasicEntityExtractor> entityClassifier = edu.stanford.nlp.ie.machinereading.BasicEntityExtractor.class;

@Option(name="entityResultsPrinters",gloss="comma-separated list of ResultsPrinter subclasses to use for printing the results of entity extraction")
static public String entityResultsPrinters = "";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,26 @@
# Pipeline options
annotators = pos, lemma, parse
# Below is outdated. Now just use standard annotators from models jar. Remember to include the models jar in your classpath!
# pos.model = /u/nlp/data/pos-tagger/wsj3t0-18-bidirectional/bidirectional-distsim-wsj-0-18.tagger
# ner.model.3class = /u/nlp/data/ner/goodClassifiers/all.3class.distsim.crf.ser.gz
# ner.model.7class = /u/nlp/data/ner/goodClassifiers/muc.distsim.crf.ser.gz
# ner.model.MISCclass = /u/nlp/data/ner/goodClassifiers/conll.distsim.crf.ser.gz
# parse.model = /u/nlp/data/lexparser/englishPCFG.ser.gz
pos.model = /u/nlp/data/pos-tagger/wsj3t0-18-bidirectional/bidirectional-distsim-wsj-0-18.tagger
ner.model.3class = /u/nlp/data/ner/goodClassifiers/all.3class.distsim.crf.ser.gz
ner.model.7class = /u/nlp/data/ner/goodClassifiers/muc.distsim.crf.ser.gz
ner.model.MISCclass = /u/nlp/data/ner/goodClassifiers/conll.distsim.crf.ser.gz
parse.model = /u/nlp/data/lexparser/englishPCFG.ser.gz
parse.maxlen = 100

# MachineReading properties
datasetReaderClass = edu.stanford.nlp.ie.machinereading.domains.roth.RothCONLL04Reader

#Data directory for training
trainPath = /u/nlp/data/RothCONLL04/conll04.corp

crossValidate = true
kfold = 2
kfold = 10

# where to save training sentences
serializedTrainingSentencesPath = tmp/roth_sentences.ser

serializedEntityExtractorPath = tmp/roth_entity_model.ser

# where to store the output of the extractor (sentence objects with relations generated by the model)
serializedRelationExtractorPath = tmp/roth_relation_model.ser

Expand All @@ -35,9 +36,9 @@ extractEvents = false
extractEntities = false

# The set chosen by feature selection using RothCONLL04:
# relationFeatures = arg_words,arg_type,dependency_path_lowlevel,dependency_path_words,surface_path_POS,entities_between_args,full_tree_path
relationFeatures = arg_words,arg_type,dependency_path_lowlevel,dependency_path_words,surface_path_POS,entities_between_args,full_tree_path
# The above features plus the features used in Bjorne BioNLP09:
relationFeatures = arg_words,arg_type,dependency_path_lowlevel,dependency_path_words,surface_path_POS,entities_between_args,full_tree_path,dependency_path_POS_unigrams,dependency_path_word_n_grams,dependency_path_POS_n_grams,dependency_path_edge_lowlevel_n_grams,dependency_path_edge-node-edge-grams_lowlevel,dependency_path_node-edge-node-grams_lowlevel,dependency_path_directed_bigrams,dependency_path_edge_unigrams,same_head,entity_counts
# relationFeatures = arg_words,arg_type,dependency_path_lowlevel,dependency_path_words,surface_path_POS,entities_between_args,full_tree_path,dependency_path_POS_unigrams,dependency_path_word_n_grams,dependency_path_POS_n_grams,dependency_path_edge_lowlevel_n_grams,dependency_path_edge-node-edge-grams_lowlevel,dependency_path_node-edge-node-grams_lowlevel,dependency_path_directed_bigrams,dependency_path_edge_unigrams,same_head,entity_counts



3 changes: 3 additions & 0 deletions src/edu/stanford/nlp/pipeline/Annotator.java
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ public String toString() {
public static final String STANFORD_TRUECASE = "truecase";
public static final String STANFORD_PARSE = "parse";
public static final String STANFORD_DETERMINISTIC_COREF = "dcoref";
public static final String STANFORD_RELATION = "relation";


public static final Requirement TOKENIZE_REQUIREMENT = new Requirement(STANFORD_TOKENIZE);
public static final Requirement CLEAN_XML_REQUIREMENT = new Requirement(STANFORD_CLEAN_XML);
Expand All @@ -115,6 +117,7 @@ public String toString() {
public static final Requirement TRUECASE_REQUIREMENT = new Requirement(STANFORD_TRUECASE);
public static final Requirement PARSE_REQUIREMENT = new Requirement(STANFORD_PARSE);
public static final Requirement DETERMINISTIC_COREF_REQUIREMENT = new Requirement(STANFORD_DETERMINISTIC_COREF);
public static final Requirement RELATION_EXTRACTOR_REQUIREMENT = new Requirement(STANFORD_RELATION);

/**
* These are annotators which StanfordCoreNLP does not know how to
Expand Down
4 changes: 4 additions & 0 deletions src/edu/stanford/nlp/pipeline/DefaultPaths.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ public class DefaultPaths {
public static final String DEFAULT_NFL_ENTITY_MODEL = "edu/stanford/nlp/models/machinereading/nfl/nfl_entity_model.ser";
public static final String DEFAULT_NFL_RELATION_MODEL = "edu/stanford/nlp/models/machinereading/nfl/nfl_relation_model.ser";
public static final String DEFAULT_NFL_GAZETTEER = "edu/stanford/nlp/models/machinereading/nfl/NFLgazetteer.txt";

public static final String DEFAULT_RELEX_ENTITY_MODEL = "edu/stanford/nlp/models/machinereading/nfl/nfl_entity_model.ser";
public static final String DEFAULT_RELEX_RELATION_MODEL = "edu/stanford/nlp/models/machinereading/nfl/nfl_relation_model.ser";
public static final String DEFAULT_RELEX_GAZETTEER = "edu/stanford/nlp/models/machinereading/nfl/NFLgazetteer.txt";


private DefaultPaths() {
Expand Down
125 changes: 125 additions & 0 deletions src/edu/stanford/nlp/pipeline/RelationExtractorAnnotator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package edu.stanford.nlp.pipeline;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import edu.stanford.nlp.ie.machinereading.BasicEntityExtractor;
import edu.stanford.nlp.ie.machinereading.BasicRelationExtractor;
import edu.stanford.nlp.ie.machinereading.Extractor;
import edu.stanford.nlp.ie.machinereading.MachineReading;
import edu.stanford.nlp.ie.machinereading.domains.roth.RothCONLL04Reader;
import edu.stanford.nlp.ie.machinereading.structure.EntityMention;
import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
import edu.stanford.nlp.ie.machinereading.structure.RelationMention;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;

/**
 * An {@link Annotator} that runs the MachineReading entity and relation
 * extractors over every sentence and stores the resulting
 * {@link EntityMention}s and {@link RelationMention}s back on the original
 * annotation.
 *
 * Recognized properties:
 * <ul>
 *   <li>{@code relex.verbose} &mdash; if true, print extracted entities and
 *       relations to stderr (default {@code false})</li>
 *   <li>{@code relex.entity.model} &mdash; path to the serialized entity
 *       extraction model (default {@link DefaultPaths#DEFAULT_RELEX_ENTITY_MODEL})</li>
 *   <li>{@code relex.relation.model} &mdash; path to the serialized relation
 *       extraction model (default {@link DefaultPaths#DEFAULT_RELEX_RELATION_MODEL})</li>
 * </ul>
 *
 * Requires tokenization, sentence splitting, POS tags, NER, and parse trees
 * (see {@link #requires()}).
 */
public class RelationExtractorAnnotator implements Annotator {

  /** MachineReading engine used for entity + relation extraction. */
  final MachineReading mr;

  /**
   * Whether to log extracted mentions to stderr.
   * NOTE: this was previously a static field assigned in the constructor, so the
   * most recently constructed annotator silently changed verbosity for every
   * instance; it is now per-instance and final.
   */
  private final boolean verbose;

  /**
   * Loads the entity and relation models named by the given properties and
   * builds the MachineReading pipeline used by {@link #annotate(Annotation)}.
   *
   * @param props pipeline properties; see the class Javadoc for the keys read
   * @throws RuntimeException if either model cannot be loaded (cause preserved)
   */
  public RelationExtractorAnnotator(Properties props) {
    verbose = Boolean.parseBoolean(props.getProperty("relex.verbose", "false"));
    String entityModel =
        props.getProperty("relex.entity.model", DefaultPaths.DEFAULT_RELEX_ENTITY_MODEL);
    String relationModel =
        props.getProperty("relex.relation.model", DefaultPaths.DEFAULT_RELEX_RELATION_MODEL);
    try {
      Extractor entityExtractor =
          BasicEntityExtractor.load(entityModel, BasicEntityExtractor.class, true);
      Extractor relationExtractor = BasicRelationExtractor.load(relationModel);
      mr = MachineReading.makeMachineReadingForAnnotation(new RothCONLL04Reader(),
          entityExtractor, relationExtractor, null, null, null, true, verbose);
    } catch (Exception e) {
      // Chain the cause in the wrapper exception; do not also printStackTrace(),
      // which would double-report the same error.
      throw new RuntimeException(
          "Could not load relation extraction models (entity: " + entityModel
              + ", relation: " + relationModel + ")", e);
    }
  }

  /**
   * Extracts entities and relations for every sentence of the annotation and
   * copies the resulting mentions back onto the original sentences, reverting
   * any token texts the MachineReading tokenizer rewrote.
   */
  @Override
  public void annotate(Annotation annotation) {
    // extract entities and relations; MachineReading returns a (possibly
    // re-tokenized) copy of the input annotation
    Annotation output = mr.annotate(annotation);

    // transfer entities/relations back to the original annotation
    List<CoreMap> outputSentences = output.get(SentencesAnnotation.class);
    List<CoreMap> origSentences = annotation.get(SentencesAnnotation.class);
    for (int i = 0; i < outputSentences.size(); i++) {
      CoreMap outSent = outputSentences.get(i);
      CoreMap origSent = origSentences.get(i);
      transferMentions(outSent, origSent);
      revertTokenTexts(annotation, origSent);
    }
  }

  /**
   * Copies entity and relation mentions from a MachineReading output sentence
   * onto the corresponding original sentence, logging them when verbose.
   */
  private void transferMentions(CoreMap outSent, CoreMap origSent) {
    // set entities
    List<EntityMention> entities =
        outSent.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
    origSent.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, entities);
    if (verbose && entities != null) {
      System.err.println("Extracted the following entities:");
      for (EntityMention e : entities) {
        System.err.println("\t" + e);
      }
    }

    // set relations
    List<RelationMention> relations =
        outSent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    origSent.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, relations);
    if (verbose && relations != null) {
      System.err.println("Extracted the following relations:");
      for (RelationMention r : relations) {
        // suppress the catch-all "unrelated" label in the verbose dump
        if (!r.getType().equals(RelationMention.UNRELATED)) {
          System.err.println(r);
        }
      }
    }
  }

  /**
   * The MachineReading tokenizer may have changed some token texts (e.g.,
   * "10-5" -> "10 to 5"). Restores every token of the sentence -- and the
   * matching parse-tree leaves, if a tree is present -- to the text found at
   * the token's character offsets in the original document.
   *
   * @throws RuntimeException if the annotation carries no document text
   */
  private static void revertTokenTexts(Annotation annotation, CoreMap origSent) {
    final boolean verboseRevert = false;  // flip by hand to debug the revert logic
    String origText = annotation.get(CoreAnnotations.TextAnnotation.class);
    if (origText == null) throw new RuntimeException("Found corpus without text!");
    if (verboseRevert) System.err.println("REVERTING SENT: " + origSent.get(TextAnnotation.class));

    List<CoreLabel> tokens = origSent.get(TokensAnnotation.class);
    // remember (token position, original text) for every token we change so the
    // parse-tree leaves can be fixed up afterwards
    List<Pair<Integer, String>> changes = new ArrayList<Pair<Integer, String>>();
    int position = 0;
    for (CoreLabel token : tokens) {
      String tokenText = token.word();
      if (verboseRevert) System.err.println("TOKEN " + tokenText + " " + token.beginPosition() + " " + token.endPosition());
      String origToken = origText.substring(token.beginPosition(), token.endPosition());
      if (!origToken.equals(tokenText)) {
        if (verboseRevert) System.err.println("Found difference at position #" + position + ": token [" + tokenText + "] vs text [" + origToken + "]");
        token.set(TextAnnotation.class, origToken);
        changes.add(new Pair<Integer, String>(position, origToken));
      }
      position++;
    }

    // revert Tree leaves as well, if tokens were modified
    Tree tree = origSent.get(TreeAnnotation.class);
    if (tree != null && !changes.isEmpty()) {
      List<Tree> leaves = tree.getLeaves();
      for (Pair<Integer, String> change : changes) {
        Tree leaf = leaves.get(change.first);
        if (verboseRevert) System.err.println("CHANGING LEAF " + leaf);
        leaf.setValue(change.second);
        if (verboseRevert) System.err.println("NEW LEAF: " + leaf);
      }
    }
  }

  /** This annotator needs tokens, sentences, POS tags, NER labels, and parses. */
  @Override
  public Set<Requirement> requires() {
    return new ArraySet<Requirement>(TOKENIZE_REQUIREMENT, SSPLIT_REQUIREMENT,
        POS_REQUIREMENT, NER_REQUIREMENT, PARSE_REQUIREMENT);
  }

  /** This annotator produces relation (and entity) mention annotations. */
  @Override
  public Set<Requirement> requirementsSatisfied() {
    return Collections.singleton(RELATION_EXTRACTOR_REQUIREMENT);
  }

}

0 comments on commit 1b9c252

Please sign in to comment.