Skip to content

Commit

Permalink
Update TokensSequenceNERAnnotatorITest, small cleanups for TokensRege…
Browse files Browse the repository at this point in the history
…xNerAnnotator.
  • Loading branch information
angelxuanchang authored and Stanford NLP committed Sep 11, 2013
1 parent 44df966 commit 9e13557
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1089,6 +1089,21 @@ public void testTokenSequenceMatcherNumber() throws IOException {
assertFalse(match);
}

/**
 * Verifies matching of a nested repetition: a capturing group that itself
 * contains a repeated token pattern, repeated again as a whole.
 * The test expects a single match covering the full run of "B" tokens,
 * exactly one capture group whose text equals the whole match, and no
 * second match afterwards.
 */
public void testTokenSequenceMatcherNested() throws IOException {
  final String expectedRun = "B B B B B B";
  CoreMap document = createDocument("A A A B B B B B B C C");

  // Repeated group wrapping a repeated token
  TokenSequencePattern nestedPattern = TokenSequencePattern.compile( "( /B/+ )+");
  TokenSequenceMatcher matcher =
      nestedPattern.getMatcher(document.get(CoreAnnotations.TokensAnnotation.class));

  // First find: the whole run of B tokens, reported both as the match and as group 1
  assertTrue(matcher.find());
  assertEquals(1, matcher.groupCount());
  assertEquals(expectedRun, matcher.group());
  assertEquals(expectedRun, matcher.group(1));

  // Nothing is left to match after the run has been consumed
  assertFalse(matcher.find());
}

public void testTokenSequenceMatcherABs() throws IOException {
CoreMap doc = createDocument("A A A A A A A B A A B A C A E A A A A A A A A A A A B A A A");

Expand Down Expand Up @@ -1298,7 +1313,7 @@ public void testMultiPatternMatcher() throws IOException {
public void testCompile() {
String s = "(?$se \"matching\" \"this\"|\"don't\")";
CoreMap doc = createDocument("does this do matching this");
TokenSequencePattern p =TokenSequencePattern.compile(s);
TokenSequencePattern p = TokenSequencePattern.compile(s);
TokenSequenceMatcher m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
boolean match = m.find();
assertTrue(match);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.StringUtils;
import junit.framework.TestCase;

import java.io.File;
import java.io.PrintWriter;
import java.util.List;
import java.util.Properties;

Expand All @@ -13,8 +17,7 @@
*/
public class TokensRegexNERAnnotatorITest extends TestCase {
private static final String REGEX_ANNOTATOR_NAME = "tokensregexner";
//private static final String MAPPING = "/u/nlp/data/TAC-KBP2010/sentence_extraction/itest_map";
private static final String MAPPING = "C:\\\\code\\\\NLP\\\\itest_map";
private static final String MAPPING = "/u/nlp/data/TAC-KBP2010/sentence_extraction/itest_map";

private static StanfordCoreNLP pipeline;
private static Annotator caseless;
Expand All @@ -36,11 +39,26 @@ public void setUp() throws Exception {
}
}

// Helper methods
/**
 * Creates a {@link TokensRegexNERAnnotator} named {@code REGEX_ANNOTATOR_NAME}
 * from the supplied properties.
 *
 * @param props properties handed to the annotator constructor unchanged
 * @return the newly constructed annotator
 */
protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(Properties props) {
  TokensRegexNERAnnotator configured = new TokensRegexNERAnnotator(REGEX_ANNOTATOR_NAME, props);
  return configured;
}

/**
 * Creates a {@link TokensRegexNERAnnotator} whose mapping file is generated
 * on the fly from the given pattern rows.
 * <p>
 * Each row is written to a temporary file as one line with its fields joined
 * by tab characters. The file's path is then passed to the annotator through
 * the {@code <REGEX_ANNOTATOR_NAME>.mapping} property.
 *
 * @param patterns   rows of the mapping file; each inner array holds the fields of one line
 * @param ignoreCase value stored under the {@code <REGEX_ANNOTATOR_NAME>.ignorecase} property
 * @return annotator configured with the generated mapping file
 * @throws Exception if the temporary file cannot be created or opened for writing
 */
protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(String[][] patterns, boolean ignoreCase) throws Exception {
  // createTempFile appends the suffix verbatim, so it must carry its own dot.
  File tempFile = File.createTempFile("tokensregexnertest.patterns", ".txt");
  // Do not leave one stray file per test run behind in the temp directory.
  tempFile.deleteOnExit();

  PrintWriter pw = IOUtils.getPrintWriter(tempFile.getAbsolutePath());
  try {
    for (String[] p : patterns) {
      pw.println(StringUtils.join(p, "\t"));
    }
  } finally {
    // Release the file handle even if writing a row fails.
    pw.close();
  }

  Properties props = new Properties();
  props.setProperty(REGEX_ANNOTATOR_NAME + ".mapping", tempFile.getAbsolutePath());
  props.setProperty(REGEX_ANNOTATOR_NAME + ".ignorecase", String.valueOf(ignoreCase));
  return new TokensRegexNERAnnotator(REGEX_ANNOTATOR_NAME, props);
}

protected static Annotation createDocument(String text) {
Annotation annotation = new Annotation(text);
pipeline.annotate(annotation);
Expand Down Expand Up @@ -68,7 +86,49 @@ private static void reannotate(List<CoreLabel> tokens, Class key, String ... tag
}
}

public void testBasicMatching() {
// Tests for TokensRegex syntax
/**
 * Checks a mapping entry written in TokensRegex syntax, once with a
 * case-sensitive annotator and once with a case-insensitive one.
 * <p>
 * The rule labels "University of X" as SCHOOL when X carries the LOCATION
 * NER tag. The expected tag sequences after each annotation pass are spelled
 * out in the {@code checkTags} calls below.
 */
public void testTokensRegexSyntax() throws Exception {
  // TODO: TokensRegex literal string patterns ignore the ignoreCase setting, so the
  //   literal form {"( University of [ {ner:LOCATION} ] )", "SCHOOL"} is not tested yet.
  String[][] schoolRules = {
    {"( /University/ /of/ [ {ner:LOCATION} ] )", "SCHOOL"}
  };
  Annotator casedAnnotator = getTokensRegexNerAnnotator(schoolRules, false);

  Annotation doc = createDocument("University of California is located in California.");
  casedAnnotator.annotate(doc);
  List<CoreLabel> toks = doc.get(CoreAnnotations.TokensAnnotation.class);

  // With the pipeline's own tags the first three tokens are expected to stay ORGANIZATION
  checkTags(toks,
      "ORGANIZATION", "ORGANIZATION", "ORGANIZATION", "O", "O", "O", "LOCATION", "O");

  // Overwrite the NER tags so the third token is a LOCATION, then run the rule again
  reannotate(toks, CoreAnnotations.NamedEntityTagAnnotation.class,
      "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");
  casedAnnotator.annotate(doc);

  checkTags(toks,
      "SCHOOL", "SCHOOL", "SCHOOL", "O", "O", "O", "LOCATION", "O");

  // Same rule, lowercase input
  Annotator caselessAnnotator = getTokensRegexNerAnnotator(schoolRules, true);

  doc = createDocument("university of california is located in california.");
  toks = doc.get(CoreAnnotations.TokensAnnotation.class);
  checkTags(toks,
      "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");

  // The case-sensitive annotator is expected to leave the lowercase text unchanged
  casedAnnotator.annotate(doc);
  checkTags(toks,
      "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O");

  // The case-insensitive annotator is expected to apply the SCHOOL label
  caselessAnnotator.annotate(doc);
  checkTags(toks,
      "SCHOOL", "SCHOOL", "SCHOOL", "O", "O", "O", "LOCATION", "O");
}

// Basic tests from RegexNERAnnotatorITest
public void testBasicMatching() throws Exception {
String str = "President Barack Obama lives in Chicago , Illinois , " +
"and is a practicing Christian .";
Annotation document = createDocument(str);
Expand All @@ -86,7 +146,7 @@ public void testBasicMatching() {
* does not span Ontario Place. Native American Church will overwrite ORGANIZATION with
* RELIGION.
*/
public void testOverwrite() {
public void testOverwrite() throws Exception {
String str = "I like Ontario Place , and I like the Native American Church , too .";
Annotation document = createDocument(str);
annotator.annotate(document);
Expand All @@ -101,7 +161,7 @@ public void testOverwrite() {
* In the mapping file, Christianity is assigned a higher priority than Early Christianity,
* and so Early should not be marked as RELIGION.
*/
public void testPriority() {
public void testPriority() throws Exception {
String str = "Christianity is of higher regex priority than Early Christianity . ";
Annotation document = createDocument(str);
annotator.annotate(document);
Expand All @@ -116,7 +176,7 @@ public void testPriority() {
* and continue, and if we don't get any exceptions, we throw an
* exception of our own.
*/
public void testEmptyAnnotation() {
public void testEmptyAnnotation() throws Exception {
try {
annotator.annotate(new Annotation(""));
} catch(RuntimeException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ public Triple<Class,String,Boolean> apply(NodePattern<CoreMap> in) {
Triple<Class,String,Boolean> firstTextTrigger = pattern.findNodePattern(textTriggerFilter);
if (firstTextTrigger != null) {
if (firstTextTrigger.third) {
// Ignore case
lowercaseStringTriggers.add(firstTextTrigger.first, firstTextTrigger.second.toLowerCase(), pattern);
} else {
annotationTriggers.add(firstTextTrigger.first, firstTextTrigger.second, pattern);
Expand Down
49 changes: 29 additions & 20 deletions src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotator.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
* </tr>
* <tr><td><code>noDefaultOverwriteLabels</code></td>
* <td>Comma separated list of output types for which default NER labels are not overwritten.
* For this types, only if the matched expression has NER type matchting the
* For these types, the NER type is overwritten only if the matched expression has an NER type matching the
* specified overwriteableType for the regex.</td>
* <td><code></code></td></tr>
* <tr><td><code>ignoreCase</code></td><td><code>Boolean</code></td>
Expand Down Expand Up @@ -134,29 +134,33 @@ public TokensRegexNERAnnotator(String mapping, boolean ignoreCase, String validP
}

private static Properties getProperties(String name, String mapping, boolean ignoreCase, String validPosRegex) {
String prefix = (name != null && !name.isEmpty())? name + ".":"";
Properties props = new Properties();
props.setProperty(name + ".mapping", mapping);
props.setProperty(name +".ignorecase", String.valueOf(ignoreCase));
props.setProperty(prefix + "mapping", mapping);
props.setProperty(prefix + "ignorecase", String.valueOf(ignoreCase));
if (validPosRegex != null) {
props.setProperty(name +".validpospattern", validPosRegex);
props.setProperty(prefix + "validpospattern", validPosRegex);
}
return props;
}

public TokensRegexNERAnnotator(String name, Properties properties) {
String backgroundSymbol = properties.getProperty(name + ".backgroundSymbol",
String prefix = (name != null && !name.isEmpty())? name + ".":"";
String backgroundSymbol = properties.getProperty(prefix + "backgroundSymbol",
SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL + ",MISC");
String[] backgroundSymbols = backgroundSymbol.split("\\s*,\\s*");
String[] mappings = PropertiesUtils.getStringArray(properties, name + ".mapping",
new String[] { DefaultPaths.DEFAULT_REGEXNER_RULES} );
String validPosRegex = properties.getProperty(name + ".validpospattern");
this.posMatchType = PosMatchType.valueOf(properties.getProperty(name + ".posmatchtype",
String mappingFiles = properties.getProperty(prefix + "mapping", DefaultPaths.DEFAULT_REGEXNER_RULES);
String[] mappings = mappingFiles.split("\\s*[,;]\\s*");
String validPosRegex = properties.getProperty(prefix + "validpospattern");
this.posMatchType = PosMatchType.valueOf(properties.getProperty(prefix + "posmatchtype",
DEFAULT_POS_MATCH_TYPE.name()));
boolean overwriteMyLabels = true;

this.noDefaultOverwriteLabels = CollectionUtils.asSet(PropertiesUtils.getStringArray(properties, name + ".noDefaultOverwriteLabels"));
this.ignoreCase = PropertiesUtils.getBool(properties, name + ".ignorecase", false);
this.verbose = PropertiesUtils.getBool(properties, name + ".verbose", false);
String noDefaultOverwriteLabelsProp = properties.getProperty(prefix + "noDefaultOverwriteLabels");
this.noDefaultOverwriteLabels = (noDefaultOverwriteLabelsProp != null)?
CollectionUtils.asSet(noDefaultOverwriteLabelsProp.split("\\s*,\\s*")):new HashSet<String>();
this.ignoreCase = PropertiesUtils.getBool(properties, prefix + "ignorecase", false);
this.verbose = PropertiesUtils.getBool(properties, prefix + "verbose", false);

if (validPosRegex != null && !validPosRegex.equals("")) {
validPosPattern = Pattern.compile(validPosRegex);
Expand Down Expand Up @@ -238,8 +242,9 @@ private void annotateMatched(List<CoreLabel> tokens) {
Entry entry = patternToEntry.get(m.pattern());

// Check if we will overwrite the existing annotation with this annotation
int start = m.start();
int end = m.end();
int g = entry.annotateGroup;
int start = m.start(g);
int end = m.end(g);

boolean overwriteOriginalNer = checkPosTags(entry, tokens, start, end);
if (overwriteOriginalNer) {
Expand All @@ -251,8 +256,8 @@ private void annotateMatched(List<CoreLabel> tokens) {
}
} else {
if (verbose) {
System.err.println("Not annotating '" + m.group() + "': " +
StringUtils.joinFields(m.groupNodes(), CoreAnnotations.NamedEntityTagAnnotation.class)
System.err.println("Not annotating '" + m.group(g) + "': " +
StringUtils.joinFields(m.groupNodes(g), CoreAnnotations.NamedEntityTagAnnotation.class)
+ " with " + entry.type + ", sentence is '" + StringUtils.joinWords(tokens, " ") + "'");
}
}
Expand Down Expand Up @@ -357,13 +362,15 @@ private static class Entry {
public String type; // the associated type
public Set<String> overwritableTypes; // what types can be overwritten by this entry
public double priority;
public int annotateGroup;

public Entry(String tokensRegex, String[] regex, String type, Set<String> overwritableTypes, double priority) {
public Entry(String tokensRegex, String[] regex, String type, Set<String> overwritableTypes, double priority, int annotateGroup) {
this.tokensRegex = tokensRegex;
this.regex = regex;
this.type = type.intern();
this.overwritableTypes = overwritableTypes;
this.priority = priority;
this.annotateGroup = annotateGroup;
}

public String toString() {
Expand Down Expand Up @@ -432,8 +439,8 @@ private static List<Entry> readEntries(String annotatorName,
String tokensRegex = null;
String[] regexes = null;
if (regex.startsWith("( ") && regex.endsWith(" )")) {
// Tokens regex
tokensRegex = regex;
// Tokens regex (remove start and end parenthesis)
tokensRegex = regex.substring(1,regex.length()-1).trim();
} else {
regexes = regex.split("\\s+");
}
Expand Down Expand Up @@ -462,7 +469,9 @@ private static List<Entry> readEntries(String annotatorName,
}
}

Entry entry = new Entry(tokensRegex, regexes, type, overwritableTypes, priority);
// TODO: Get annotate group from input....
int annotateGroup = 0;
Entry entry = new Entry(tokensRegex, regexes, type, overwritableTypes, priority, annotateGroup);
if (seenRegexes.containsKey(key)) {
Entry oldEntry = seenRegexes.get(key);
if (priority > oldEntry.priority) {
Expand Down

0 comments on commit 9e13557

Please sign in to comment.