Merge branch 'master' of scail:/u/nlp/git/javanlp

ty01csbaidu · Dec 20, 2014 · 52cf0e8 · 52cf0e8
1 parent 3e6115e
commit 52cf0e8
Show file tree

Hide file tree

Showing 19 changed files with 438 additions and 209 deletions.
diff --git a/itest/src/edu/stanford/nlp/pipeline/PosParserTagCompatibilityITest.java b/itest/src/edu/stanford/nlp/pipeline/PosParserTagCompatibilityITest.java
diff --git a/itest/src/edu/stanford/nlp/pipeline/TaggerParserPosTagCompatibilityITest.java b/itest/src/edu/stanford/nlp/pipeline/TaggerParserPosTagCompatibilityITest.java
@@ -0,0 +1,124 @@
+package edu.stanford.nlp.pipeline;
+
+import java.util.Set;
+
+import edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser;
+import junit.framework.TestCase;
+
+import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
+import edu.stanford.nlp.tagger.maxent.MaxentTagger;
+
+/**
+ * Checks that the tagger, lexicalized parser and shift-reduce parser models of a language share one POS tag set.
+ * @author Christopher Manning
+ */
+public class TaggerParserPosTagCompatibilityITest extends TestCase {
+  /**
+   * Asserts that every listed model has the same POS tag set as {@code lexParsers[0]}.
+   * @param lexParsers lexicalized parser model paths; the first one is the reference
+   * @param maxentTaggers maxent tagger model paths
+   * @param srParsers shift-reduce parser model paths (may be empty)
+   */
+  private static void testTagSet3(String[] lexParsers, String[] maxentTaggers, String[] srParsers) {
+    String reference = lexParsers[0];
+    Set<String> tagSet = basicTagSet(LexicalizedParser.loadModel(reference));
+    for (String name : maxentTaggers) {
+      assertEquals(reference + " vs. " + name + " tag set mismatch", tagSet, new MaxentTagger(name).tagSet());
+    }
+    // Index 0 is the reference itself, so start at 1 instead of loading that model a second time.
+    for (int i = 1; i < lexParsers.length; i++) {
+      String name = lexParsers[i];
+      assertEquals(reference + " vs. " + name + " tag set mismatch", tagSet, basicTagSet(LexicalizedParser.loadModel(name)));
+    }
+    for (String name : srParsers) {
+      assertEquals(reference + " vs. " + name + " tag set mismatch", tagSet, ShiftReduceParser.loadModel(name).tagSet());
+    }
+  }
+
+  /** Returns the lexicon tag set of {@code parser}, computed with that parser's own basic category function. */
+  private static Set<String> basicTagSet(LexicalizedParser parser) {
+    return parser.getLexicon().tagSet(parser.treebankLanguagePack().getBasicCategoryFunction());
+  }
+
+  private static final String[] englishTaggers = {
+    "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger",
+    "edu/stanford/nlp/models/pos-tagger/english-bidirectional/english-bidirectional-distsim.tagger",
+    "edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger",
+  };
+
+  private static final String[] englishParsers = {
+    "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
+    "edu/stanford/nlp/models/lexparser/englishPCFG.caseless.ser.gz",
+    "edu/stanford/nlp/models/lexparser/englishRNN.ser.gz",
+    "edu/stanford/nlp/models/lexparser/englishFactored.ser.gz",
+  };
+
+  private static final String[] englishSrParsers = {
+    "edu/stanford/nlp/models/srparser/englishSR.beam.ser.gz",
+    "edu/stanford/nlp/models/srparser/englishSR.ser.gz",
+  };
+
+  public void testEnglishTagSet() {
+    testTagSet3(englishParsers, englishTaggers, englishSrParsers);
+  }
+
+  private static final String[] germanTaggers = {
+    // NOTE(review): a duplicate german-fast.tagger entry was dropped; confirm no other model was intended.
+    "edu/stanford/nlp/models/pos-tagger/german/german-fast.tagger",
+    "edu/stanford/nlp/models/pos-tagger/german/german-fast-caseless.tagger",
+    "edu/stanford/nlp/models/pos-tagger/german/german-hgc.tagger"
+  };
+
+  private static final String[] germanParsers = {
+    "edu/stanford/nlp/models/lexparser/germanPCFG.ser.gz",
+    "edu/stanford/nlp/models/lexparser/germanFactored.ser.gz",
+  };
+  private static final String[] germanSrParsers = {
+    "edu/stanford/nlp/models/srparser/germanSR.ser.gz",
+  };
+
+  public void testGermanTagSet() {
+    testTagSet3(germanParsers, germanTaggers, germanSrParsers);
+  }
+
+  private static final String[] chineseTaggers = {
+    "edu/stanford/nlp/models/pos-tagger/chinese-distsim/chinese-distsim.tagger",
+  };
+
+  private static final String[] chineseParsers = {
+    // Can't compare Xinhua ones because they have a smaller tag set than the full CTB v6+
+    // "edu/stanford/nlp/models/lexparser/xinhuaPCFG.ser.gz",
+    "edu/stanford/nlp/models/lexparser/chineseFactored.ser.gz",
+    "edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz",
+    // "edu/stanford/nlp/models/lexparser/xinhuaFactoredSegmenting.ser.gz",
+    // "edu/stanford/nlp/models/lexparser/xinhuaFactored.ser.gz",
+  };
+  private static final String[] chineseSrParsers = {
+    "edu/stanford/nlp/models/srparser/chineseSR.ser.gz",
+  };
+
+  public void testChineseTagSet() {
+    testTagSet3(chineseParsers, chineseTaggers, chineseSrParsers);
+  }
+
+  private static final String[] spanishTaggers = {
+    "edu/stanford/nlp/models/pos-tagger/spanish/spanish.tagger",
+    "edu/stanford/nlp/models/pos-tagger/spanish/spanish-distsim.tagger",
+  };
+
+  private static final String[] spanishParsers = {
+    "edu/stanford/nlp/models/lexparser/spanishPCFG.ser.gz",
+  };
+
+  private static final String[] spanishSrParsers = {
+    // todo [cdm 2014]: For some reason the SR parsers don't have the same tag set, missing 6 tags....
+    // "edu/stanford/nlp/models/srparser/spanishSR.ser.gz",
+    // "edu/stanford/nlp/models/srparser/spanishSR.beam.ser.gz",
+  };
+
+  public void testSpanishTagSet() {
+    testTagSet3(spanishParsers, spanishTaggers, spanishSrParsers);
+  }
+
+  // todo: Add French and Arabic sometime
+}
diff --git a/scripts/lexparser/makeSerialized.csh b/scripts/lexparser/makeSerialized.csh
@@ -87,16 +87,16 @@ echo "Classpath is $CLASSPATH" >> serializedParsers.log
 
 
 
-( echo "Running englishFactored (from treebank) on $host server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -ijcai03 -saveToSerializedFile englishFactored.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log
+( echo "Running englishFactored (from treebank) on $host server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -ijcai03 -saveToSerializedFile englishFactored.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199 -train2 /u/nlp/data/lexparser/extraTrain 1-4000,9000-9099 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log
 
 # "General English" PCFG binary 
 
-( echo "Running englishPCFG (from treebank) on $host server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -goodPCFG -saveToSerializedFile englishPCFG.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log
+( echo "Running englishPCFG (from treebank) on $host server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -goodPCFG -saveToSerializedFile englishPCFG.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199 -train2 /u/nlp/data/lexparser/extraTrain 1-4000,9000-9099 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log
 
 
 # "General English" PCFG, case insensitive, binary
 
-( echo "Running caseless englishPCFG (from treebank) on $host server" ; time java -mx4g edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.LowercaseAndAmericanizeFunction -evals factDA,tsv -goodPCFG -saveToSerializedFile englishPCFG.caseless.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log
+( echo "Running caseless englishPCFG (from treebank) on $host server" ; time java -mx4g edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.LowercaseAndAmericanizeFunction -evals factDA,tsv -goodPCFG -saveToSerializedFile englishPCFG.caseless.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199 -train2 /u/nlp/data/lexparser/extraTrain 1-4000,9000-9099 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log
 
 
 # English WSJ 2-21 PCFG simplified grammar
@@ -107,7 +107,7 @@ echo "Classpath is $CLASSPATH" >> serializedParsers.log
 # English with extras PCFG simplified grammar
 # This dumbed down parser is used by the RNN parser. 
 # See /scr/nlp/data/dvparser for more details.
-( echo "Running english pcfg (simplified for use in the RNN parser) on $host -server" ; time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -goodPCFG -noRightRec -dominatesV 0 -baseNP 0 -saveToSerializedFile englishPCFG.nocompact.simple.ser.gz -maxLength 40 -compactGrammar 0 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 2200-2219 ) >>& ./serializedParsers.log
+( echo "Running english pcfg (simplified for use in the RNN parser) on $host -server" ; time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -goodPCFG -noRightRec -dominatesV 0 -baseNP 0 -saveToSerializedFile englishPCFG.nocompact.simple.ser.gz -maxLength 40 -compactGrammar 0 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199 -train2 /u/nlp/data/lexparser/extraTrain 1-4000,9000-9099 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 2200-2219 ) >>& ./serializedParsers.log
 
 
 # Xinhua Mainland Chinese PCFG binary

diff --git a/scripts/srparser/Makefile b/scripts/srparser/Makefile
@@ -6,8 +6,8 @@ WSJ_TAGGER = /u/nlp/data/pos-tagger/distrib/wsj-0-18-bidirectional-nodistsim.tag
 WSJ_TLPP = edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams
 
 
-ENGLISH_TRAIN = /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 200-2199,9000-9999
-ENGLISH_TRAIN2 = /u/nlp/data/lexparser/extraTrain 1-4000
+ENGLISH_TRAIN = /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 200-2199
+ENGLISH_TRAIN2 = /u/nlp/data/lexparser/extraTrain 1-4000,9000-9999
 ENGLISH_DEV = $(WSJ_DEV)
 ENGLISH_TEST = $(WSJ_TEST)
 ENGLISH_TAGGER = /u/nlp/data/pos-tagger/distrib/english-left3words-distsim.tagger

diff --git a/src/edu/stanford/nlp/io/IOUtils.java b/src/edu/stanford/nlp/io/IOUtils.java
@@ -313,6 +313,19 @@ public static <T> T readObjectFromURLOrClasspathOrFileSystem(String filename) th
  return ErasureUtils.uncheckedCast(o);
  }
 
+ /** Prints {@code msg} and {@code path} to stderr, then loads the object at {@code path}; load failures are rethrown as RuntimeIOException. */
+ public static <T> T readObjectAnnouncingTimingFromURLOrClasspathOrFileSystem(String msg, String path) {
+   final Timing timer = new Timing();
+   System.err.print(msg + ' ' + path + " ... ");
+   try {
+     final T loaded = IOUtils.readObjectFromURLOrClasspathOrFileSystem(path);
+     timer.done();
+     return loaded;
+   } catch (IOException | ClassNotFoundException e) {
+     throw new RuntimeIOException(e);
+   }
+ }
+
  public static <T> T readObjectFromObjectStream(ObjectInputStream ois) throws IOException,
  ClassNotFoundException {
  Object o = ois.readObject();

diff --git a/src/edu/stanford/nlp/ling/tokensregex/SequenceMatchRules.java b/src/edu/stanford/nlp/ling/tokensregex/SequenceMatchRules.java
@@ -645,17 +645,17 @@ public Value apply(SequenceMatchResult<T> matchResult) {
 
  /**
  * Interface for a rule that extracts a list of matched items from an input
- * @param <I>
- * @param <O>
+ * @param <I> input type
+ * @param <O> output type
  */
  public static interface ExtractRule<I,O> {
  public boolean extract(I in, List<O> out);
  }
 
  /**
  * Extraction rule that filters the input before passing it on to the next extractor
- * @param <I>
- * @param <O>
+ * @param <I> input type
+ * @param <O> output type
  */
  public static class FilterExtractRule<I,O> implements ExtractRule<I,O>
  {
@@ -684,8 +684,8 @@ public boolean extract(I in, List<O> out) {
  /**
  * Extraction rule that applies a list of rules in sequence and aggregates
  * all matches found
- * @param <I>
- * @param <O>
+ * @param <I> input type
+ * @param <O> output type
  */
  public static class ListExtractRule<I,O> implements ExtractRule<I,O>
  {
@@ -729,8 +729,9 @@ public void addRules(Collection<ExtractRule<I,O>> rules)
 
  /**
  * Extraction rule to apply an extraction rule on a particular CoreMap field
- * @param <T>
- * @param <O>
+ * Input is of type CoreMap, output is templated type O.
+ * @param <T> type of the annotation field
+ * @param <O> output type
  */
  public static class CoreMapExtractRule<T,O> implements ExtractRule<CoreMap, O>
  {
@@ -754,6 +755,12 @@ public boolean extract(CoreMap cm, List<O> out) {
 
  }
 
+ /**
+ * Extraction rule that treats a single CoreMap as a list/sequence of CoreMaps
+ * (convenience class, for use with BasicSequenceExtractRule)
+ * Input is of type CoreMap, output is templated type O.
+ * @param <O> output type
+ */
  public static class CoreMapToListExtractRule<O> implements ExtractRule<CoreMap, O>
  {
  ExtractRule<List<? extends CoreMap>,O> extractRule;
@@ -767,6 +774,10 @@ public boolean extract(CoreMap cm, List<O> out) {
  }
  }
 
+ /**
+ * Extraction rule that produces MatchedExpressions from a sequence of CoreMaps.
+ * Input is of type {@code List<? extends CoreMap>}, output is MatchedExpression
+ */
  public static class BasicSequenceExtractRule implements ExtractRule< List<? extends CoreMap>, MatchedExpression>
  {
  MatchedExpression.SingleAnnotationExtractor extractor;