Update TokensSequenceNERAnnotatorITest, small cleanups for TokensRegexNerAnnotator.
stanfordnlp · Sep 11, 2013 · 9e13557 · 9e13557
1 parent 44df966
commit 9e13557
Show file tree

Hide file tree

Showing 4 changed files with 111 additions and 28 deletions.
diff --git a/itest/src/edu/stanford/nlp/ling/tokensregex/TokenSequenceMatcherITest.java b/itest/src/edu/stanford/nlp/ling/tokensregex/TokenSequenceMatcherITest.java
@@ -1089,6 +1089,21 @@ public void testTokenSequenceMatcherNumber() throws IOException {
     assertFalse(match);
   }
 
+  public void testTokenSequenceMatcherNested() throws IOException {
+    CoreMap doc = createDocument("A A A B B B B B B C C");
+
+    // Nested quantifiers: the group ( /B/+ ) is itself repeated; expect one match spanning the whole run of B tokens
+    TokenSequencePattern p = TokenSequencePattern.compile( "( /B/+ )+");
+    TokenSequenceMatcher m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
+    boolean match = m.find();
+    assertTrue(match);
+    assertEquals(1, m.groupCount());
+    assertEquals("B B B B B B", m.group());
+    assertEquals("B B B B B B", m.group(1));
+    match = m.find();
+    assertFalse(match); // the run of B tokens must not produce a second match
+  }
+
   public void testTokenSequenceMatcherABs() throws IOException {
     CoreMap doc = createDocument("A A A A A A A B A A B A C A E A A A A A A A A A A A B A A A");
 
@@ -1298,7 +1313,7 @@ public void testMultiPatternMatcher() throws IOException {
   public void testCompile() {
     String s = "(?$se \"matching\" \"this\"|\"don't\")";
     CoreMap doc = createDocument("does this do matching this");
-    TokenSequencePattern p =TokenSequencePattern.compile(s);
+    TokenSequencePattern p = TokenSequencePattern.compile(s);
     TokenSequenceMatcher m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
     boolean match = m.find();
     assertTrue(match);

diff --git a/itest/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotatorITest.java b/itest/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotatorITest.java
@@ -1,9 +1,13 @@
 package edu.stanford.nlp.pipeline;
 
+import edu.stanford.nlp.io.IOUtils;
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.util.StringUtils;
 import junit.framework.TestCase;
 
+import java.io.File;
+import java.io.PrintWriter;
 import java.util.List;
 import java.util.Properties;
 
@@ -13,8 +17,7 @@
  */
 public class TokensRegexNERAnnotatorITest extends TestCase {
   private static final String REGEX_ANNOTATOR_NAME = "tokensregexner";
-  //private static final String MAPPING = "/u/nlp/data/TAC-KBP2010/sentence_extraction/itest_map";
-  private static final String MAPPING = "C:\\\\code\\\\NLP\\\\itest_map";
+  private static final String MAPPING = "/u/nlp/data/TAC-KBP2010/sentence_extraction/itest_map";
 
   private static StanfordCoreNLP pipeline;
   private static Annotator caseless;
@@ -36,11 +39,26 @@ public void setUp() throws Exception {
     }
   }
 
+  // Helper methods
   protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(Properties props)
   {
     return new TokensRegexNERAnnotator(REGEX_ANNOTATOR_NAME, props); // annotator reads its settings from keys prefixed with REGEX_ANNOTATOR_NAME + "."
   }
 
+  /**
+   * Builds a TokensRegexNERAnnotator whose mapping file is generated from the given rows.
+   * Each row is written as one tab-separated line to a temporary file, and the annotator's
+   * "mapping" property is pointed at that file.
+   *
+   * @param patterns rows of the mapping file; each inner array holds the columns of one line
+   * @param ignoreCase value stored in the annotator's "ignorecase" property
+   * @return annotator configured to read the generated mapping file
+   * @throws Exception if the temporary file cannot be created or written
+   */
+  protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(String[][] patterns, boolean ignoreCase) throws Exception
+  {
+    Properties props = new Properties();
+    // The suffix must carry its own dot: createTempFile appends it verbatim to the generated name.
+    File tempFile = File.createTempFile("tokensregexnertest.patterns", ".txt");
+    // Test fixture only: have the JVM remove it on exit instead of leaving it in the temp directory.
+    tempFile.deleteOnExit();
+    PrintWriter pw = IOUtils.getPrintWriter(tempFile.getAbsolutePath());
+    try {
+      for (String[] p: patterns) {
+        pw.println(StringUtils.join(p, "\t"));
+      }
+    } finally {
+      // Close even when writing fails so the file handle is not leaked.
+      pw.close();
+    }
+    props.setProperty(REGEX_ANNOTATOR_NAME + ".mapping", tempFile.getAbsolutePath());
+    props.setProperty(REGEX_ANNOTATOR_NAME + ".ignorecase", String.valueOf(ignoreCase));
+    return new TokensRegexNERAnnotator(REGEX_ANNOTATOR_NAME, props);
+  }
+
   protected static Annotation createDocument(String text) {
     Annotation annotation = new Annotation(text);
     pipeline.annotate(annotation);
@@ -68,7 +86,49 @@ private static void reannotate(List<CoreLabel> tokens, Class key, String ... tag
     }
   }
 
-  public void testBasicMatching() {
+  // Tests for TokensRegex syntax
+  public void testTokensRegexSyntax() throws Exception {
+    String[][] regexes =
+      new String[][]{
+        new String[]{"( /University/ /of/ [ {ner:LOCATION} ] )", "SCHOOL"}
+        // TODO: TokensRegex literal string patterns ignore the ignoreCase setting
+        //new String[]{"( University of [ {ner:LOCATION} ] )", "SCHOOL"}
+    };
+    Annotator annotatorCased = getTokensRegexNerAnnotator(regexes, false);
+
+    String str = "University of California is located in California.";
+    Annotation document = createDocument(str);
+    annotatorCased.annotate(document); // third token is not tagged LOCATION yet, so the rule is expected to change nothing
+    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
+
+    checkTags(tokens,
+            "ORGANIZATION", "ORGANIZATION", "ORGANIZATION", "O", "O", "O", "LOCATION", "O");
+
+    reannotate(tokens, CoreAnnotations.NamedEntityTagAnnotation.class,
+            "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");
+    annotatorCased.annotate(document); // with the third token now tagged LOCATION the rule should fire
+
+    checkTags(tokens,
+            "SCHOOL", "SCHOOL", "SCHOOL", "O", "O", "O", "LOCATION", "O");
+
+    // Try lowercase: the cased annotator should leave the tags alone, the caseless one should apply SCHOOL
+    Annotator annotatorCaseless = getTokensRegexNerAnnotator(regexes, true);
+
+    str = "university of california is located in california.";
+    document = createDocument(str);
+    tokens = document.get(CoreAnnotations.TokensAnnotation.class);
+    checkTags(tokens,
+            "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");
+    annotatorCased.annotate(document);
+    checkTags(tokens,
+            "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");
+    annotatorCaseless.annotate(document);
+    checkTags(tokens,
+            "SCHOOL", "SCHOOL", "SCHOOL", "O", "O", "O", "LOCATION", "O");
+  }
+
+  // Basic tests from RegexNERAnnotatorITest
+  public void testBasicMatching() throws Exception {
     String str = "President Barack Obama lives in Chicago , Illinois , " +
     "and is a practicing Christian .";
     Annotation document = createDocument(str);
@@ -86,7 +146,7 @@ public void testBasicMatching() {
    * does not span Ontario Place.  Native American Church will overwrite ORGANIZATION with
    * RELIGION.
    */
-  public void testOverwrite() {
+  public void testOverwrite() throws Exception {
     String str = "I like Ontario Place , and I like the Native American Church , too .";
     Annotation document = createDocument(str);
     annotator.annotate(document);
@@ -101,7 +161,7 @@ public void testOverwrite() {
    * In the mapping file, Christianity is assigned a higher priority than Early Christianity,
    * and so Early should not be marked as RELIGION.
    */
-  public void testPriority() {
+  public void testPriority() throws Exception {
     String str = "Christianity is of higher regex priority than Early Christianity . ";
     Annotation document = createDocument(str);
     annotator.annotate(document);
@@ -116,7 +176,7 @@ public void testPriority() {
    * and continue, and if we don't get any exceptions, we throw an
    * exception of our own.
    */
-  public void testEmptyAnnotation() {
+  public void testEmptyAnnotation() throws Exception {
     try {
       annotator.annotate(new Annotation(""));
     } catch(RuntimeException e) {

diff --git a/src/edu/stanford/nlp/ling/tokensregex/CoreMapNodePatternTrigger.java b/src/edu/stanford/nlp/ling/tokensregex/CoreMapNodePatternTrigger.java
@@ -50,7 +50,6 @@ public Triple<Class,String,Boolean> apply(NodePattern<CoreMap> in) {
       Triple<Class,String,Boolean> firstTextTrigger = pattern.findNodePattern(textTriggerFilter);
       if (firstTextTrigger != null) {
         if (firstTextTrigger.third) {
-          // Ignore case
           lowercaseStringTriggers.add(firstTextTrigger.first, firstTextTrigger.second.toLowerCase(), pattern);
         } else {
           annotationTriggers.add(firstTextTrigger.first, firstTextTrigger.second, pattern);

diff --git a/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotator.java b/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotator.java
@@ -83,7 +83,7 @@
  *   </tr>
  *   <tr><td><code>noDefaultOverwriteLabels</code></td>
  *      <td>Comma separated list of output types for which default NER labels are not overwritten.
- *          For this types, only if the matched expression has NER type matchting the
+ *          For these types, only if the matched expression has NER type matching the
  *          specified overwriteableType for the regex will the NER type be overwritten.</td>
  *      <td><code></code></td></tr>
  *   <tr><td><code>ignoreCase</code></td><td><code>Boolean</code></td>
@@ -134,29 +134,33 @@ public TokensRegexNERAnnotator(String mapping, boolean ignoreCase, String validP
   }
 
   private static Properties getProperties(String name, String mapping, boolean ignoreCase, String validPosRegex) {
+    String prefix = (name != null && !name.isEmpty())? name + ".":""; // null or empty name => un-prefixed property keys
     Properties props = new Properties();
-    props.setProperty(name + ".mapping", mapping);
-    props.setProperty(name +".ignorecase", String.valueOf(ignoreCase));
+    props.setProperty(prefix + "mapping", mapping);
+    props.setProperty(prefix + "ignorecase", String.valueOf(ignoreCase));
     if (validPosRegex != null) { // validpospattern is only set when a regex was supplied
-      props.setProperty(name +".validpospattern", validPosRegex);
+      props.setProperty(prefix + "validpospattern", validPosRegex);
     }
     return props;
   }
 
   public TokensRegexNERAnnotator(String name, Properties properties) {
-    String backgroundSymbol = properties.getProperty(name + ".backgroundSymbol",
+    String prefix = (name != null && !name.isEmpty())? name + ".":"";
+    String backgroundSymbol = properties.getProperty(prefix + "backgroundSymbol",
             SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL + ",MISC");
     String[] backgroundSymbols = backgroundSymbol.split("\\s*,\\s*");
-    String[] mappings = PropertiesUtils.getStringArray(properties, name + ".mapping",
-            new String[] { DefaultPaths.DEFAULT_REGEXNER_RULES} );
-    String validPosRegex = properties.getProperty(name + ".validpospattern");
-    this.posMatchType = PosMatchType.valueOf(properties.getProperty(name + ".posmatchtype",
+    String mappingFiles = properties.getProperty(prefix + "mapping", DefaultPaths.DEFAULT_REGEXNER_RULES);
+    String[] mappings = mappingFiles.split("\\s*[,;]\\s*");
+    String validPosRegex = properties.getProperty(prefix + "validpospattern");
+    this.posMatchType = PosMatchType.valueOf(properties.getProperty(prefix + "posmatchtype",
             DEFAULT_POS_MATCH_TYPE.name()));
     boolean overwriteMyLabels = true;
 
-    this.noDefaultOverwriteLabels = CollectionUtils.asSet(PropertiesUtils.getStringArray(properties, name + ".noDefaultOverwriteLabels"));
-    this.ignoreCase = PropertiesUtils.getBool(properties, name + ".ignorecase", false);
-    this.verbose = PropertiesUtils.getBool(properties, name + ".verbose", false);
+    String noDefaultOverwriteLabelsProp = properties.getProperty(prefix + "noDefaultOverwriteLabels");
+    this.noDefaultOverwriteLabels = (noDefaultOverwriteLabelsProp != null)?
+            CollectionUtils.asSet(noDefaultOverwriteLabelsProp.split("\\s*,\\s*")):new HashSet<String>();
+    this.ignoreCase = PropertiesUtils.getBool(properties, prefix + "ignorecase", false);
+    this.verbose = PropertiesUtils.getBool(properties, prefix + "verbose", false);
 
     if (validPosRegex != null && !validPosRegex.equals("")) {
       validPosPattern = Pattern.compile(validPosRegex);
@@ -238,8 +242,9 @@ private void annotateMatched(List<CoreLabel> tokens) {
       Entry entry = patternToEntry.get(m.pattern());
 
       // Check if we will overwrite the existing annotation with this annotation
-      int start = m.start();
-      int end = m.end();
+      int g = entry.annotateGroup;
+      int start = m.start(g);
+      int end = m.end(g);
 
       boolean overwriteOriginalNer = checkPosTags(entry, tokens, start, end);
       if (overwriteOriginalNer) {
@@ -251,8 +256,8 @@ private void annotateMatched(List<CoreLabel> tokens) {
         }
       } else {
         if (verbose) {
-          System.err.println("Not annotating  '" + m.group() + "': " +
-                  StringUtils.joinFields(m.groupNodes(), CoreAnnotations.NamedEntityTagAnnotation.class)
+          System.err.println("Not annotating  '" + m.group(g) + "': " +
+                  StringUtils.joinFields(m.groupNodes(g), CoreAnnotations.NamedEntityTagAnnotation.class)
                   + " with " + entry.type + ", sentence is '" + StringUtils.joinWords(tokens, " ") + "'");
         }
       }
@@ -357,13 +362,15 @@ private static class Entry {
     public String type; // the associated type
     public Set<String> overwritableTypes; // what types can be overwritten by this entry
     public double priority;
+    public int annotateGroup;
 
-    public Entry(String tokensRegex, String[] regex, String type, Set<String> overwritableTypes, double priority) {
+    public Entry(String tokensRegex, String[] regex, String type, Set<String> overwritableTypes, double priority, int annotateGroup) {
       this.tokensRegex = tokensRegex;
       this.regex = regex;
       this.type = type.intern();
       this.overwritableTypes = overwritableTypes;
       this.priority = priority;
+      this.annotateGroup = annotateGroup; // capture-group index whose start/end annotateMatched uses as the span to label
     }
 
     public String toString() {
@@ -432,8 +439,8 @@ private static List<Entry> readEntries(String annotatorName,
       String tokensRegex = null;
       String[] regexes = null;
       if (regex.startsWith("( ") && regex.endsWith(" )")) {
-        // Tokens regex
-        tokensRegex = regex;
+        // Tokens regex (remove start and end parenthesis)
+        tokensRegex = regex.substring(1,regex.length()-1).trim();
       } else {
         regexes = regex.split("\\s+");
       }
@@ -462,7 +469,9 @@ private static List<Entry> readEntries(String annotatorName,
         }
       }
 
-      Entry entry = new Entry(tokensRegex, regexes, type, overwritableTypes, priority);
+      // TODO: Get annotate group from input....
+      int annotateGroup = 0;
+      Entry entry = new Entry(tokensRegex, regexes, type, overwritableTypes, priority, annotateGroup);
       if (seenRegexes.containsKey(key)) {
         Entry oldEntry = seenRegexes.get(key);
         if (priority > oldEntry.priority) {