post-processing for bolt chinese

stanfordnlp · Sep 12, 2013 · 59275c9 · 59275c9
1 parent b5e6c50
commit 59275c9
Show file tree

Hide file tree

Showing 22 changed files with 1,644 additions and 841 deletions.
diff --git a/itest/src/edu/stanford/nlp/ling/tokensregex/TokenSequenceMatcherITest.java b/itest/src/edu/stanford/nlp/ling/tokensregex/TokenSequenceMatcherITest.java
@@ -1089,6 +1089,21 @@ public void testTokenSequenceMatcherNumber() throws IOException {
  assertFalse(match);
  }
 
+ /**
+  * Matches the nested repetition pattern {@code ( /B/+ )+} against a document containing a
+  * single run of B tokens, and expects exactly one match whose whole-match text and first
+  * group are both that run.
+  */
+ public void testTokenSequenceMatcherNested() throws IOException {
+   CoreMap document = createDocument("A A A B B B B B B C C");
+
+   // Test sequence with groups
+   TokenSequencePattern nestedPattern = TokenSequencePattern.compile( "( /B/+ )+");
+   TokenSequenceMatcher matcher =
+       nestedPattern.getMatcher(document.get(CoreAnnotations.TokensAnnotation.class));
+
+   // First find: the full run of B tokens, captured by the single group.
+   assertTrue(matcher.find());
+   assertEquals(1, matcher.groupCount());
+   assertEquals("B B B B B B", matcher.group());
+   assertEquals("B B B B B B", matcher.group(1));
+
+   // Second find: nothing left to match.
+   assertFalse(matcher.find());
+ }
+
  public void testTokenSequenceMatcherABs() throws IOException {
  CoreMap doc = createDocument("A A A A A A A B A A B A C A E A A A A A A A A A A A B A A A");
 
@@ -1298,7 +1313,7 @@ public void testMultiPatternMatcher() throws IOException {
  public void testCompile() {
  String s = "(?$se \"matching\" \"this\"|\"don't\")";
  CoreMap doc = createDocument("does this do matching this");
- TokenSequencePattern p =TokenSequencePattern.compile(s);
+ TokenSequencePattern p = TokenSequencePattern.compile(s);
  TokenSequenceMatcher m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
  boolean match = m.find();
  assertTrue(match);

diff --git a/itest/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotatorITest.java b/itest/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotatorITest.java
@@ -0,0 +1,188 @@
+package edu.stanford.nlp.pipeline;
+
+import edu.stanford.nlp.io.IOUtils;
+import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.util.StringUtils;
+import junit.framework.TestCase;
+
+import java.io.File;
+import java.io.PrintWriter;
+import java.util.List;
+import java.util.Properties;
+
+/**
+ * Test cases for TokensRegexNERAnnotator (taken from RegexNERAnnotator)
+ * @author Angel Chang
+ */
+public class TokensRegexNERAnnotatorITest extends TestCase {
+ private static final String REGEX_ANNOTATOR_NAME = "tokensregexner";
+ private static final String MAPPING = "/u/nlp/data/TAC-KBP2010/sentence_extraction/itest_map";
+
+ private static StanfordCoreNLP pipeline;
+ private static Annotator caseless;
+ private static Annotator cased;
+ private static Annotator annotator;
+
+ @Override
+ public void setUp() throws Exception {
+ synchronized(TokensRegexNERAnnotatorITest.class) {
+ if (pipeline == null) { // Hack so we don't load the pipeline fresh for every test
+ Properties props = new Properties();
+ props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
+ pipeline = new StanfordCoreNLP(props);
+ // Basic caseless and cased tokens regex annotators
+ caseless = new TokensRegexNERAnnotator(MAPPING, true);
+ cased = new TokensRegexNERAnnotator(MAPPING);
+ annotator = cased;
+ }
+ }
+ }
+
+ // Helper methods
+ protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(Properties props)
+ {
+ return new TokensRegexNERAnnotator(REGEX_ANNOTATOR_NAME, props);
+ }
+
+ protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(String[][] patterns, boolean ignoreCase) throws Exception
+ {
+ Properties props = new Properties();
+ File tempFile = File.createTempFile("tokensregexnertest.patterns", "txt");
+ PrintWriter pw = IOUtils.getPrintWriter(tempFile.getAbsolutePath());
+ for (String[] p: patterns) {
+ pw.println(StringUtils.join(p, "\t"));
+ }
+ pw.close();
+ props.setProperty(REGEX_ANNOTATOR_NAME + ".mapping", tempFile.getAbsolutePath());
+ props.setProperty(REGEX_ANNOTATOR_NAME + ".ignorecase", String.valueOf(ignoreCase));
+ return new TokensRegexNERAnnotator(REGEX_ANNOTATOR_NAME, props);
+ }
+
+ protected static Annotation createDocument(String text) {
+ Annotation annotation = new Annotation(text);
+ pipeline.annotate(annotation);
+ return annotation;
+ }
+
+ /**
+ * Helper method, checks that each token is tagged with the expected NER type.
+ */
+ private static void checkTags(List<CoreLabel> tokens, String ... tags) {
+ assertEquals(tags.length, tokens.size());
+ for (int i = 0; i < tags.length; ++i) {
+ assertEquals("Mismatch for token " + i + " " + tokens.get(i),
+ tags[i], tokens.get(i).get(CoreAnnotations.NamedEntityTagAnnotation.class));
+ }
+ }
+
+ /**
+ * Helper method, re-annotate each token with specified tag
+ */
+ private static void reannotate(List<CoreLabel> tokens, Class key, String ... tags) {
+ assertEquals(tags.length, tokens.size());
+ for (int i = 0; i < tags.length; ++i) {
+ tokens.get(i).set(key, tags[i]);
+ }
+ }
+
+ // Tests for TokensRegex syntax
+ public void testTokensRegexSyntax() throws Exception {
+ String[][] regexes =
+ new String[][]{
+ new String[]{"( /University/ /of/ [ {ner:LOCATION} ] )", "SCHOOL"}
+ // TODO: TokensRegex literal string patterns ignores ignoreCase settings
+ //new String[]{"( University of [ {ner:LOCATION} ] )", "SCHOOL"}
+ };
+ Annotator annotatorCased = getTokensRegexNerAnnotator(regexes, false);
+
+ String str = "University of California is located in California.";
+ Annotation document = createDocument(str);
+ annotatorCased.annotate(document);
+ List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
+
+ checkTags(tokens,
+ "ORGANIZATION", "ORGANIZATION", "ORGANIZATION", "O", "O", "O", "LOCATION", "O");
+
+ reannotate(tokens, CoreAnnotations.NamedEntityTagAnnotation.class,
+ "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");
+ annotatorCased.annotate(document);
+
+ checkTags(tokens,
+ "SCHOOL", "SCHOOL", "SCHOOL", "O", "O", "O", "LOCATION", "O");
+
+ // Try lowercase
+ Annotator annotatorCaseless = getTokensRegexNerAnnotator(regexes, true);
+
+ str = "university of california is located in california.";
+ document = createDocument(str);
+ tokens = document.get(CoreAnnotations.TokensAnnotation.class);
+ checkTags(tokens,
+ "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");
+ annotatorCased.annotate(document);
+ checkTags(tokens,
+ "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");
+ annotatorCaseless.annotate(document);
+ checkTags(tokens,
+ "SCHOOL", "SCHOOL", "SCHOOL", "O", "O", "O", "LOCATION", "O");
+ }
+
+ // Basic tests from RegexNERAnnotatorITest
+ public void testBasicMatching() throws Exception {
+ String str = "President Barack Obama lives in Chicago , Illinois , " +
+ "and is a practicing Christian .";
+ Annotation document = createDocument(str);
+ annotator.annotate(document);
+ List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
+
+ checkTags(tokens,
+ "TITLE", "PERSON", "PERSON", "O", "O", "LOCATION", "O", "STATE_OR_PROVINCE",
+ "O", "O", "O", "O", "O", "IDEOLOGY", "O");
+
+ }
+
+ /**
+ * The LOCATION on Ontario Place should not be overwritten since Ontario (STATE_OR_PROVINCE)
+ * does not span Ontario Place. Native American Church will overwrite ORGANIZATION with
+ * RELIGION.
+ */
+ public void testOverwrite() throws Exception {
+ String str = "I like Ontario Place , and I like the Native American Church , too .";
+ Annotation document = createDocument(str);
+ annotator.annotate(document);
+ List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
+
+ checkTags(tokens, "O", "O", "LOCATION", "LOCATION", "O", "O", "O", "O", "O", "RELIGION",
+ "RELIGION", "RELIGION", "O", "O", "O");
+
+ }
+
+ /**
+ * In the mapping file, Christianity is assigned a higher priority than Early Christianity,
+ * and so Early should not be marked as RELIGION.
+ */
+ public void testPriority() throws Exception {
+ String str = "Christianity is of higher regex priority than Early Christianity . ";
+ Annotation document = createDocument(str);
+ annotator.annotate(document);
+ List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
+ checkTags(tokens, "RELIGION", "O", "O", "O", "O", "O", "O", "O", "RELIGION", "O");
+ }
+
+
+ /**
+ * Test that if there are no annotations at all, the annotator
+ * throws an exception. We are happy if we can catch an exception
+ * and continue, and if we don't get any exceptions, we throw an
+ * exception of our own.
+ */
+ public void testEmptyAnnotation() throws Exception {
+ try {
+ annotator.annotate(new Annotation(""));
+ } catch(RuntimeException e) {
+ return;
+ }
+ fail("Never expected to get this far... the annotator should have thrown an exception by now");
+ }
+
+}
diff --git a/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java b/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
@@ -40,7 +40,6 @@
 import edu.stanford.nlp.process.CoreLabelTokenFactory;
 import edu.stanford.nlp.process.CoreTokenFactory;
 import edu.stanford.nlp.sequences.*;
-import edu.stanford.nlp.sequences.FeatureFactory;
 import edu.stanford.nlp.stats.ClassicCounter;
 import edu.stanford.nlp.stats.Counter;
 import edu.stanford.nlp.stats.Counters;
@@ -83,6 +82,10 @@ public abstract class AbstractSequenceClassifier<IN extends CoreMap> implements
  public SeqClassifierFlags flags;
  public Index<String> classIndex; // = null;
  public FeatureFactory<IN> featureFactory;
+
+ // Thang Sep13: multiple feature factories (NERFeatureFactory, EmbeddingFeatureFactory)
+ public List<FeatureFactory<IN>> featureFactories; 
+
  protected IN pad;
  private CoreTokenFactory<IN> tokenFactory;
  public int windowSize;
@@ -124,8 +127,16 @@ public AbstractSequenceClassifier(SeqClassifierFlags flags) {
  this.flags = flags;
 
  // try {
- this.featureFactory = new MetaClass(flags.featureFactory).createInstance(flags.featureFactoryArgs);
- // this.featureFactory = (FeatureFactory<IN>) Class.forName(flags.featureFactory).newInstance();
+ // Thang Sep13: allow for multiple feature factories.
+ this.featureFactory = new MetaClass(flags.featureFactory).createInstance(flags.featureFactoryArgs); // for compatibility
+ if(flags.featureFactories!=null){
+ this.featureFactories = new ArrayList<FeatureFactory<IN>>();
+ for (int i = 0; i < flags.featureFactories.length; i++) {
+ FeatureFactory<IN> indFeatureFactory = new MetaClass(flags.featureFactories[i]).
+ createInstance(flags.featureFactoriesArgs.get(i));
+ this.featureFactories.add(indFeatureFactory);
+ }
+ }
  if (flags.tokenFactory == null) {
  tokenFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory();
  } else {
@@ -156,8 +167,14 @@ protected final void reinit() {
  pad.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol);
  pad.set(CoreAnnotations.GoldAnswerAnnotation.class, flags.backgroundSymbol);
 
- featureFactory.init(flags);
-
+ // Thang Sep13: allow for multiple feature factories.
+ featureFactory.init(flags); // for compatible use
+ if(flags.featureFactories!=null){
+ for (FeatureFactory<IN> indFeatureFactory : featureFactories) {
+ indFeatureFactory.init(flags);
+ }
+ }
+
  defaultReaderAndWriter = makeReaderAndWriter();
  if (flags.readerAndWriter != null &&
  flags.readerAndWriter.equals(flags.plainTextDocumentReaderAndWriter)) {

diff --git a/src/edu/stanford/nlp/ie/EmbeddingFeatureFactory.java b/src/edu/stanford/nlp/ie/EmbeddingFeatureFactory.java
@@ -0,0 +1,30 @@
+/**
+ * 
+ */
+package edu.stanford.nlp.ie;
+
+import java.util.Collection;
+
+import edu.stanford.nlp.sequences.Clique;
+import edu.stanford.nlp.sequences.FeatureFactory;
+import edu.stanford.nlp.util.PaddedList;
+
+/**
+ * For features generated from word embeddings
+ * 
+ * @author Thang Luong <[email protected]>, created on Sep 11, 2013: minor enhancements.
+ * @author Mengqiu Wang: original developer.
+ */
+public class EmbeddingFeatureFactory extends FeatureFactory {
+
+  /**
+   * Placeholder implementation: no embedding features are generated yet.
+   *
+   * @param info the padded token sequence (currently unused)
+   * @param position index of the current token in {@code info} (currently unused)
+   * @param clique the clique to generate features for (currently unused)
+   * @return an empty, mutable collection; never {@code null}
+   * @see edu.stanford.nlp.sequences.FeatureFactory#getCliqueFeatures(edu.stanford.nlp.util.PaddedList, int, edu.stanford.nlp.sequences.Clique)
+   */
+  @Override
+  public Collection getCliqueFeatures(PaddedList info, int position,
+      Clique clique) {
+    // TODO: generate features from word embeddings.
+    // Return an empty collection rather than null so callers can iterate or merge the
+    // result without a null check while this factory is still a stub.
+    return new java.util.ArrayList<String>();
+  }
+
+}