Skip to content

Commit

Permalink
Merge branch 'master' of scail:/u/nlp/git/javanlp
Browse files Browse the repository at this point in the history
  • Loading branch information
Gabor Angeli authored and Stanford NLP committed Dec 20, 2014
1 parent 3e6115e commit 52cf0e8
Show file tree
Hide file tree
Showing 19 changed files with 438 additions and 209 deletions.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package edu.stanford.nlp.pipeline;

import java.util.Set;

import edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser;
import junit.framework.TestCase;

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

/**
 * Checks that the POS tag sets used by the serialized taggers, lexicalized parsers
 * and shift-reduce parsers of each language agree with one another.
 * For every language the first lexicalized parser in the list supplies the reference
 * tag set; every other model is compared against it.
 *
 * @author Christopher Manning
 */
public class TaggerParserPosTagCompatibilityITest extends TestCase {

  /**
   * Loads every listed model and asserts that its tag set equals the reference tag set.
   * The reference is the lexicon tag set of {@code lexParsers[0]}, obtained by passing
   * that parser's basic category function to {@code tagSet}.
   *
   * @param lexParsers    paths of lexicalized parser models; must contain at least one
   *                      entry, because element 0 is the reference
   * @param maxentTaggers paths of maxent tagger models to compare against the reference
   * @param srParsers     paths of shift-reduce parser models to compare against the
   *                      reference; may be empty
   */
  private static void testTagSet3(String[] lexParsers, String[] maxentTaggers, String[] srParsers) {
    LexicalizedParser lp = LexicalizedParser.loadModel(lexParsers[0]);
    Set<String> tagSet = lp.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction());

    for (String name : maxentTaggers) {
      MaxentTagger tagger = new MaxentTagger(name);
      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch", tagSet, tagger.tagSet());
    }

    for (String name : lexParsers) {
      LexicalizedParser lp2 = LexicalizedParser.loadModel(name);
      // Reduce each parser's tags with its OWN language pack, not the reference parser's,
      // so that the comparison reflects what this model actually reports.
      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch",
          tagSet, lp2.getLexicon().tagSet(lp2.treebankLanguagePack().getBasicCategoryFunction()));
    }

    for (String name : srParsers) {
      ShiftReduceParser srp = ShiftReduceParser.loadModel(name);
      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch",
          tagSet, srp.tagSet());
    }
  }


  private static final String[] englishTaggers = {
    "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger",
    "edu/stanford/nlp/models/pos-tagger/english-bidirectional/english-bidirectional-distsim.tagger",
    "edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger",
  };

  private static final String[] englishParsers = {
    "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
    "edu/stanford/nlp/models/lexparser/englishPCFG.caseless.ser.gz",
    "edu/stanford/nlp/models/lexparser/englishRNN.ser.gz",
    "edu/stanford/nlp/models/lexparser/englishFactored.ser.gz",
  };

  private static final String[] englishSrParsers = {
    "edu/stanford/nlp/models/srparser/englishSR.beam.ser.gz",
    "edu/stanford/nlp/models/srparser/englishSR.ser.gz",
  };

  /** Compares the tag sets of all English models. */
  public void testEnglishTagSet() {
    testTagSet3(englishParsers, englishTaggers, englishSrParsers);
  }


  // Each tagger is listed once; a repeated german-fast.tagger entry only reloaded
  // the same model and added nothing to the check.
  private static final String[] germanTaggers = {
    "edu/stanford/nlp/models/pos-tagger/german/german-fast.tagger",
    "edu/stanford/nlp/models/pos-tagger/german/german-fast-caseless.tagger",
    "edu/stanford/nlp/models/pos-tagger/german/german-hgc.tagger"
  };

  private static final String[] germanParsers = {
    "edu/stanford/nlp/models/lexparser/germanPCFG.ser.gz",
    "edu/stanford/nlp/models/lexparser/germanFactored.ser.gz",
  };

  private static final String[] germanSrParsers = {
    "edu/stanford/nlp/models/srparser/germanSR.ser.gz",
  };

  /** Compares the tag sets of all German models. */
  public void testGermanTagSet() {
    testTagSet3(germanParsers, germanTaggers, germanSrParsers);
  }


  private static final String[] chineseTaggers = {
    "edu/stanford/nlp/models/pos-tagger/chinese-distsim/chinese-distsim.tagger",
  };

  private static final String[] chineseParsers = {
    // Can't compare Xinhua ones because they have a smaller tag set than the full CTB v6+
    // "edu/stanford/nlp/models/lexparser/xinhuaPCFG.ser.gz",
    "edu/stanford/nlp/models/lexparser/chineseFactored.ser.gz",
    "edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz",
    // "edu/stanford/nlp/models/lexparser/xinhuaFactoredSegmenting.ser.gz",
    // "edu/stanford/nlp/models/lexparser/xinhuaFactored.ser.gz",

  };

  private static final String[] chineseSrParsers = {
    "edu/stanford/nlp/models/srparser/chineseSR.ser.gz",
  };

  /** Compares the tag sets of all Chinese models. */
  public void testChineseTagSet() {
    testTagSet3(chineseParsers, chineseTaggers, chineseSrParsers);
  }


  private static final String[] spanishTaggers = {
    "edu/stanford/nlp/models/pos-tagger/spanish/spanish.tagger",
    "edu/stanford/nlp/models/pos-tagger/spanish/spanish-distsim.tagger",
  };

  private static final String[] spanishParsers = {
    "edu/stanford/nlp/models/lexparser/spanishPCFG.ser.gz",
  };

  private static final String[] spanishSrParsers = {
    // todo [cdm 2014]: For some reason the SR parsers don't have the same tag set, missing 6 tags....
    // "edu/stanford/nlp/models/srparser/spanishSR.ser.gz",
    // "edu/stanford/nlp/models/srparser/spanishSR.beam.ser.gz",
  };

  /** Compares the tag sets of the Spanish taggers and lexicalized parser (no SR parsers yet). */
  public void testSpanishTagSet() {
    testTagSet3(spanishParsers, spanishTaggers, spanishSrParsers);
  }

  // todo: Add French and Arabic sometime

}
8 changes: 4 additions & 4 deletions scripts/lexparser/makeSerialized.csh
Original file line number Diff line number Diff line change
Expand Up @@ -87,16 +87,16 @@ echo "Classpath is $CLASSPATH" >> serializedParsers.log



( echo "Running englishFactored (from treebank) on $host server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -ijcai03 -saveToSerializedFile englishFactored.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log
( echo "Running englishFactored (from treebank) on $host server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -ijcai03 -saveToSerializedFile englishFactored.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199 -train2 /u/nlp/data/lexparser/extraTrain 1-4000,9000-9099 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log

# "General English" PCFG binary

( echo "Running englishPCFG (from treebank) on $host server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -goodPCFG -saveToSerializedFile englishPCFG.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log
( echo "Running englishPCFG (from treebank) on $host server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -goodPCFG -saveToSerializedFile englishPCFG.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199 -train2 /u/nlp/data/lexparser/extraTrain 1-4000,9000-9099 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log


# "General English" PCFG, case insensitive, binary

( echo "Running caseless englishPCFG (from treebank) on $host server" ; time java -mx4g edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.LowercaseAndAmericanizeFunction -evals factDA,tsv -goodPCFG -saveToSerializedFile englishPCFG.caseless.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log
( echo "Running caseless englishPCFG (from treebank) on $host server" ; time java -mx4g edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.LowercaseAndAmericanizeFunction -evals factDA,tsv -goodPCFG -saveToSerializedFile englishPCFG.caseless.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199 -train2 /u/nlp/data/lexparser/extraTrain 1-4000,9000-9099 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log


# English WSJ 2-21 PCFG simplified grammar
Expand All @@ -107,7 +107,7 @@ echo "Classpath is $CLASSPATH" >> serializedParsers.log
# English with extras PCFG simplified grammar
# This dumbed down parser is used by the RNN parser.
# See /scr/nlp/data/dvparser for more details.
( echo "Running english pcfg (simplified for use in the RNN parser) on $host -server" ; time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -goodPCFG -noRightRec -dominatesV 0 -baseNP 0 -saveToSerializedFile englishPCFG.nocompact.simple.ser.gz -maxLength 40 -compactGrammar 0 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 2200-2219 ) >>& ./serializedParsers.log
( echo "Running english pcfg (simplified for use in the RNN parser) on $host -server" ; time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -goodPCFG -noRightRec -dominatesV 0 -baseNP 0 -saveToSerializedFile englishPCFG.nocompact.simple.ser.gz -maxLength 40 -compactGrammar 0 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199 -train2 /u/nlp/data/lexparser/extraTrain 1-4000,9000-9099 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 2200-2219 ) >>& ./serializedParsers.log


# Xinhua Mainland Chinese PCFG binary
Expand Down
4 changes: 2 additions & 2 deletions scripts/srparser/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ WSJ_TAGGER = /u/nlp/data/pos-tagger/distrib/wsj-0-18-bidirectional-nodistsim.tag
WSJ_TLPP = edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams


ENGLISH_TRAIN = /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 200-2199,9000-9999
ENGLISH_TRAIN2 = /u/nlp/data/lexparser/extraTrain 1-4000
ENGLISH_TRAIN = /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 200-2199
ENGLISH_TRAIN2 = /u/nlp/data/lexparser/extraTrain 1-4000,9000-9999
ENGLISH_DEV = $(WSJ_DEV)
ENGLISH_TEST = $(WSJ_TEST)
ENGLISH_TAGGER = /u/nlp/data/pos-tagger/distrib/english-left3words-distsim.tagger
Expand Down
13 changes: 13 additions & 0 deletions src/edu/stanford/nlp/io/IOUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,19 @@ public static <T> T readObjectFromURLOrClasspathOrFileSystem(String filename) th
return ErasureUtils.uncheckedCast(o);
}

/**
 * Reads a serialized object via {@code readObjectFromURLOrClasspathOrFileSystem},
 * announcing the load on {@code System.err}.
 * Before loading, prints {@code msg}, a space, {@code path} and {@code " ... "} without
 * a trailing newline; after a successful load, calls {@code done()} on a {@code Timing}
 * started beforehand (presumably reporting the elapsed time - confirm in Timing).
 *
 * @param msg  text printed ahead of the path
 * @param path location of the serialized object, passed through unchanged
 * @param <T>  type the caller expects; the underlying cast is unchecked
 * @return the object that was read
 * @throws RuntimeIOException wrapping any IOException or ClassNotFoundException raised while reading
 */
public static <T> T readObjectAnnouncingTimingFromURLOrClasspathOrFileSystem(String msg, String path) {
  try {
    Timing stopwatch = new Timing();
    System.err.print(msg + ' ' + path + " ... ");
    T loaded = IOUtils.readObjectFromURLOrClasspathOrFileSystem(path);
    stopwatch.done();
    return loaded;
  } catch (IOException | ClassNotFoundException e) {
    // Convert the checked exceptions so callers need no throws clause.
    throw new RuntimeIOException(e);
  }
}

public static <T> T readObjectFromObjectStream(ObjectInputStream ois) throws IOException,
ClassNotFoundException {
Object o = ois.readObject();
Expand Down
27 changes: 19 additions & 8 deletions src/edu/stanford/nlp/ling/tokensregex/SequenceMatchRules.java
Original file line number Diff line number Diff line change
Expand Up @@ -645,17 +645,17 @@ public Value apply(SequenceMatchResult<T> matchResult) {

/**
* Interface for a rule that extracts a list of matched items from an input.
* @param <I> input type
* @param <O> output type
*/
public static interface ExtractRule<I,O> {
// Takes the input and a caller-supplied list for the extracted items.
// NOTE(review): the boolean presumably reports whether anything was extracted - confirm against implementations.
public boolean extract(I in, List<O> out);
}

/**
* Extraction rule that filters the input before passing it on to the next extractor.
* @param <I> input type
* @param <O> output type
*/
public static class FilterExtractRule<I,O> implements ExtractRule<I,O>
{
Expand Down Expand Up @@ -684,8 +684,8 @@ public boolean extract(I in, List<O> out) {
/**
* Extraction rule that applies a list of rules in sequence and aggregates
* all matches found.
* @param <I> input type
* @param <O> output type
*/
public static class ListExtractRule<I,O> implements ExtractRule<I,O>
{
Expand Down Expand Up @@ -729,8 +729,9 @@ public void addRules(Collection<ExtractRule<I,O>> rules)

/**
* Extraction rule to apply an extraction rule on a particular CoreMap field.
* Input is of type CoreMap, output is templated type O.
* @param <T> type of the annotation field
* @param <O> output type
*/
public static class CoreMapExtractRule<T,O> implements ExtractRule<CoreMap, O>
{
Expand All @@ -754,6 +755,12 @@ public boolean extract(CoreMap cm, List<O> out) {

}

/**
* Extraction rule that treats a single CoreMap as a list/sequence of CoreMaps
* (convenience class, for use with BasicSequenceExtractRule)
* Input is of type CoreMap, output is templated type O.
* @param <O> output type
*/
public static class CoreMapToListExtractRule<O> implements ExtractRule<CoreMap, O>
{
ExtractRule<List<? extends CoreMap>,O> extractRule;
Expand All @@ -767,6 +774,10 @@ public boolean extract(CoreMap cm, List<O> out) {
}
}

/**
* Extraction rule that operates on a sequence of CoreMaps.
* Input is of type {@code List<? extends CoreMap>}, output is MatchedExpression.
*/
public static class BasicSequenceExtractRule implements ExtractRule< List<? extends CoreMap>, MatchedExpression>
{
MatchedExpression.SingleAnnotationExtractor extractor;
Expand Down
Loading

0 comments on commit 52cf0e8

Please sign in to comment.