Skip to content

Commit

Permalink
post-processing for bolt chinese
Browse files Browse the repository at this point in the history
  • Loading branch information
Rob Frederick Voigt Jr authored and Stanford NLP committed Sep 12, 2013
1 parent b5e6c50 commit 59275c9
Show file tree
Hide file tree
Showing 22 changed files with 1,644 additions and 841 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1089,6 +1089,21 @@ public void testTokenSequenceMatcherNumber() throws IOException {
assertFalse(match);
}

/**
 * Checks a repeated group whose body is itself a repetition: the pattern
 * {@code ( /B/+ )+} is expected to match the whole run of B tokens exactly once,
 * with capture group 1 covering the same span as the overall match.
 */
public void testTokenSequenceMatcherNested() throws IOException {
  final String expectedRun = "B B B B B B";
  CoreMap document = createDocument("A A A B B B B B B C C");

  // Pattern with a quantified group wrapping a quantified token
  TokenSequencePattern nestedPattern = TokenSequencePattern.compile( "( /B/+ )+");
  TokenSequenceMatcher matcher =
      nestedPattern.getMatcher(document.get(CoreAnnotations.TokensAnnotation.class));

  // First find: the single run of B tokens
  assertTrue(matcher.find());
  assertEquals(1, matcher.groupCount());
  assertEquals(expectedRun, matcher.group());
  assertEquals(expectedRun, matcher.group(1));

  // Second find: nothing left to match
  assertFalse(matcher.find());
}

public void testTokenSequenceMatcherABs() throws IOException {
CoreMap doc = createDocument("A A A A A A A B A A B A C A E A A A A A A A A A A A B A A A");

Expand Down Expand Up @@ -1298,7 +1313,7 @@ public void testMultiPatternMatcher() throws IOException {
public void testCompile() {
String s = "(?$se \"matching\" \"this\"|\"don't\")";
CoreMap doc = createDocument("does this do matching this");
TokenSequencePattern p =TokenSequencePattern.compile(s);
TokenSequencePattern p = TokenSequencePattern.compile(s);
TokenSequenceMatcher m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
boolean match = m.find();
assertTrue(match);
Expand Down
188 changes: 188 additions & 0 deletions itest/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotatorITest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.StringUtils;
import junit.framework.TestCase;

import java.io.File;
import java.io.PrintWriter;
import java.util.List;
import java.util.Properties;

/**
 * Test cases for TokensRegexNERAnnotator (taken from RegexNERAnnotator).
 *
 * <p>The shared pipeline and the default cased/caseless annotators are created once
 * in {@link #setUp()} and reused by every test; the default annotators read their
 * patterns from the mapping file named by {@code MAPPING}.
 *
 * @author Angel Chang
 */
public class TokensRegexNERAnnotatorITest extends TestCase {
  /** Property prefix / annotator name used when configuring via {@link Properties}. */
  private static final String REGEX_ANNOTATOR_NAME = "tokensregexner";
  /** Mapping file used by the default cased and caseless annotators. */
  private static final String MAPPING = "/u/nlp/data/TAC-KBP2010/sentence_extraction/itest_map";

  private static StanfordCoreNLP pipeline;
  private static Annotator caseless;
  private static Annotator cased;
  private static Annotator annotator;

  /**
   * Lazily builds the shared pipeline and the default annotators.
   *
   * <p>The static {@code pipeline} field doubles as the "already initialized" flag,
   * so it is assigned only after the annotators were constructed successfully.
   * Otherwise a failure while building an annotator would leave later tests with
   * a non-null pipeline but null annotators, hiding the real error.
   */
  @Override
  public void setUp() throws Exception {
    synchronized(TokensRegexNERAnnotatorITest.class) {
      if (pipeline == null) { // Hack so we don't load the pipeline fresh for every test
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
        StanfordCoreNLP loadedPipeline = new StanfordCoreNLP(props);
        // Basic caseless and cased tokens regex annotators
        caseless = new TokensRegexNERAnnotator(MAPPING, true);
        cased = new TokensRegexNERAnnotator(MAPPING);
        annotator = cased;
        // Publish last: marks initialization as complete
        pipeline = loadedPipeline;
      }
    }
  }

  // Helper methods

  /**
   * Creates an annotator configured from the given properties, using
   * {@code REGEX_ANNOTATOR_NAME} as the annotator name.
   */
  protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(Properties props)
  {
    return new TokensRegexNERAnnotator(REGEX_ANNOTATOR_NAME, props);
  }

  /**
   * Creates an annotator from in-memory patterns by writing them to a temporary
   * mapping file (one row per line, fields joined with tabs) and pointing the
   * annotator's {@code .mapping} property at that file.
   *
   * @param patterns rows of the mapping file; each row is written as one tab-separated line
   * @param ignoreCase value for the annotator's {@code .ignorecase} property
   * @return annotator configured with the temporary mapping file
   * @throws Exception if the temporary file cannot be created or written
   */
  protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(String[][] patterns, boolean ignoreCase) throws Exception
  {
    File tempFile = File.createTempFile("tokensregexnertest.patterns", ".txt");
    // Do not leave one stray temp file behind per test run
    tempFile.deleteOnExit();
    PrintWriter pw = IOUtils.getPrintWriter(tempFile.getAbsolutePath());
    try {
      for (String[] p: patterns) {
        pw.println(StringUtils.join(p, "\t"));
      }
    } finally {
      // Always release the file handle, even if writing a row fails
      pw.close();
    }
    Properties props = new Properties();
    props.setProperty(REGEX_ANNOTATOR_NAME + ".mapping", tempFile.getAbsolutePath());
    props.setProperty(REGEX_ANNOTATOR_NAME + ".ignorecase", String.valueOf(ignoreCase));
    return getTokensRegexNerAnnotator(props);
  }

  /**
   * Wraps the text in an {@link Annotation} and runs the shared pipeline over it.
   * Requires {@link #setUp()} to have initialized the pipeline.
   */
  protected static Annotation createDocument(String text) {
    Annotation annotation = new Annotation(text);
    pipeline.annotate(annotation);
    return annotation;
  }

  /**
   * Helper method, checks that each token is tagged with the expected NER type.
   * Also asserts that the number of tokens equals the number of expected tags.
   */
  private static void checkTags(List<CoreLabel> tokens, String ... tags) {
    assertEquals(tags.length, tokens.size());
    for (int i = 0; i < tags.length; ++i) {
      assertEquals("Mismatch for token " + i + " " + tokens.get(i),
              tags[i], tokens.get(i).get(CoreAnnotations.NamedEntityTagAnnotation.class));
    }
  }

  /**
   * Helper method, re-annotate each token with specified tag.
   * Asserts that there is exactly one tag per token.
   * NOTE(review): {@code key} is a raw {@code Class}; callers here pass a
   * String-valued annotation key - confirm before tightening the type.
   */
  private static void reannotate(List<CoreLabel> tokens, Class key, String ... tags) {
    assertEquals(tags.length, tokens.size());
    for (int i = 0; i < tags.length; ++i) {
      tokens.get(i).set(key, tags[i]);
    }
  }

  // Tests for TokensRegex syntax
  public void testTokensRegexSyntax() throws Exception {
    String[][] regexes =
      new String[][]{
        new String[]{"( /University/ /of/ [ {ner:LOCATION} ] )", "SCHOOL"}
        // TODO: TokensRegex literal string patterns ignores ignoreCase settings
        //new String[]{"( University of [ {ner:LOCATION} ] )", "SCHOOL"}
    };
    Annotator annotatorCased = getTokensRegexNerAnnotator(regexes, false);

    String str = "University of California is located in California.";
    Annotation document = createDocument(str);
    annotatorCased.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    // Expected: tags stay ORGANIZATION, since the pattern needs a LOCATION third token
    checkTags(tokens,
            "ORGANIZATION", "ORGANIZATION", "ORGANIZATION", "O", "O", "O", "LOCATION", "O");

    // Force the third token to LOCATION so that the pattern can match
    reannotate(tokens, CoreAnnotations.NamedEntityTagAnnotation.class,
            "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");
    annotatorCased.annotate(document);

    checkTags(tokens,
            "SCHOOL", "SCHOOL", "SCHOOL", "O", "O", "O", "LOCATION", "O");

    // Try lowercase
    Annotator annotatorCaseless = getTokensRegexNerAnnotator(regexes, true);

    str = "university of california is located in california.";
    document = createDocument(str);
    tokens = document.get(CoreAnnotations.TokensAnnotation.class);
    checkTags(tokens,
            "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");
    // The cased annotator is expected to leave the lowercase text untouched ...
    annotatorCased.annotate(document);
    checkTags(tokens,
            "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");
    // ... while the caseless one is expected to tag it as SCHOOL
    annotatorCaseless.annotate(document);
    checkTags(tokens,
            "SCHOOL", "SCHOOL", "SCHOOL", "O", "O", "O", "LOCATION", "O");
  }

  // Basic tests from RegexNERAnnotatorITest
  public void testBasicMatching() throws Exception {
    String str = "President Barack Obama lives in Chicago , Illinois , " +
            "and is a practicing Christian .";
    Annotation document = createDocument(str);
    annotator.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    checkTags(tokens,
            "TITLE", "PERSON", "PERSON", "O", "O", "LOCATION", "O", "STATE_OR_PROVINCE",
            "O", "O", "O", "O", "O", "IDEOLOGY", "O");

  }

  /**
   * The LOCATION on Ontario Place should not be overwritten since Ontario (STATE_OR_PROVINCE)
   * does not span Ontario Place.  Native American Church will overwrite ORGANIZATION with
   * RELIGION.
   */
  public void testOverwrite() throws Exception {
    String str = "I like Ontario Place , and I like the Native American Church , too .";
    Annotation document = createDocument(str);
    annotator.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    checkTags(tokens, "O", "O", "LOCATION", "LOCATION", "O", "O", "O", "O", "O", "RELIGION",
            "RELIGION", "RELIGION", "O", "O", "O");

  }

  /**
   * In the mapping file, Christianity is assigned a higher priority than Early Christianity,
   * and so Early should not be marked as RELIGION.
   */
  public void testPriority() throws Exception {
    String str = "Christianity is of higher regex priority than Early Christianity . ";
    Annotation document = createDocument(str);
    annotator.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
    checkTags(tokens, "RELIGION", "O", "O", "O", "O", "O", "O", "O", "RELIGION", "O");
  }


  /**
   * Test that if there are no annotations at all, the annotator
   * throws an exception.  We are happy if we can catch an exception
   * and continue, and if we don't get any exceptions, we throw an
   * exception of our own.
   */
  public void testEmptyAnnotation() throws Exception {
    try {
      annotator.annotate(new Annotation(""));
    } catch(RuntimeException e) {
      // Expected: an annotation without tokens must be rejected
      return;
    }
    fail("Never expected to get this far... the annotator should have thrown an exception by now");
  }

}
27 changes: 22 additions & 5 deletions src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.CoreTokenFactory;
import edu.stanford.nlp.sequences.*;
import edu.stanford.nlp.sequences.FeatureFactory;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
Expand Down Expand Up @@ -83,6 +82,10 @@ public abstract class AbstractSequenceClassifier<IN extends CoreMap> implements
public SeqClassifierFlags flags;
public Index<String> classIndex; // = null;
public FeatureFactory<IN> featureFactory;

// Thang Sep13: multiple feature factories (NERFeatureFactory, EmbeddingFeatureFactory)
public List<FeatureFactory<IN>> featureFactories;

protected IN pad;
private CoreTokenFactory<IN> tokenFactory;
public int windowSize;
Expand Down Expand Up @@ -124,8 +127,16 @@ public AbstractSequenceClassifier(SeqClassifierFlags flags) {
this.flags = flags;

// try {
this.featureFactory = new MetaClass(flags.featureFactory).createInstance(flags.featureFactoryArgs);
// this.featureFactory = (FeatureFactory<IN>) Class.forName(flags.featureFactory).newInstance();
// Thang Sep13: allow for multiple feature factories.
this.featureFactory = new MetaClass(flags.featureFactory).createInstance(flags.featureFactoryArgs); // for compatibility
if(flags.featureFactories!=null){
this.featureFactories = new ArrayList<FeatureFactory<IN>>();
for (int i = 0; i < flags.featureFactories.length; i++) {
FeatureFactory<IN> indFeatureFactory = new MetaClass(flags.featureFactories[i]).
createInstance(flags.featureFactoriesArgs.get(i));
this.featureFactories.add(indFeatureFactory);
}
}
if (flags.tokenFactory == null) {
tokenFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory();
} else {
Expand Down Expand Up @@ -156,8 +167,14 @@ protected final void reinit() {
pad.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol);
pad.set(CoreAnnotations.GoldAnswerAnnotation.class, flags.backgroundSymbol);

featureFactory.init(flags);

// Thang Sep13: allow for multiple feature factories.
featureFactory.init(flags); // for compatible use
if(flags.featureFactories!=null){
for (FeatureFactory<IN> indFeatureFactory : featureFactories) {
indFeatureFactory.init(flags);
}
}

defaultReaderAndWriter = makeReaderAndWriter();
if (flags.readerAndWriter != null &&
flags.readerAndWriter.equals(flags.plainTextDocumentReaderAndWriter)) {
Expand Down
30 changes: 30 additions & 0 deletions src/edu/stanford/nlp/ie/EmbeddingFeatureFactory.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/**
*
*/
package edu.stanford.nlp.ie;

import java.util.Collection;

import edu.stanford.nlp.sequences.Clique;
import edu.stanford.nlp.sequences.FeatureFactory;
import edu.stanford.nlp.util.PaddedList;

/**
 * For features generated from word embeddings.
 *
 * <p>Currently a placeholder: no embedding features are generated yet, so
 * {@link #getCliqueFeatures} always yields an empty collection.
 *
 * @author Thang Luong <[email protected]>, created on Sep 11, 2013: minor enhancements.
 * @author Mengqiu Wang: original developer.
 */
public class EmbeddingFeatureFactory extends FeatureFactory {

  /**
   * Returns the features for the given clique at the given position.
   *
   * <p>Not implemented yet; returns a fresh, empty, mutable collection rather than
   * {@code null} so that callers which iterate over or add the result do not fail
   * with a {@link NullPointerException}.
   *
   * @param info the padded sequence being featurized (unused for now)
   * @param position index into {@code info} (unused for now)
   * @param clique the clique to generate features for (unused for now)
   * @return an empty collection, never {@code null}
   * @see edu.stanford.nlp.sequences.FeatureFactory#getCliqueFeatures(edu.stanford.nlp.util.PaddedList, int, edu.stanford.nlp.sequences.Clique)
   */
  @Override
  public Collection getCliqueFeatures(PaddedList info, int position,
      Clique clique) {
    // TODO: generate features from word embeddings.
    return new java.util.ArrayList<String>();
  }

}
Loading

0 comments on commit 59275c9

Please sign in to comment.