Skip to content

Commit

Permalink
Merge branch 'master' into gm-character
Browse files Browse the repository at this point in the history
  • Loading branch information
Grace Muzny authored and Stanford NLP committed Jan 28, 2016
1 parent 72ee3a4 commit ad6349a
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ public void annotate(Annotation annotation) {
Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
trees.add(tree);

SemanticGraph dependencies = SemanticGraphFactory.makeFromTree(tree, Mode.COLLAPSED, Extras.NONE, false, null, true);
SemanticGraph dependencies = SemanticGraphFactory.makeFromTree(tree, Mode.COLLAPSED, Extras.NONE, true, null, true);
sentence.set(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class, dependencies);

if (!hasSpeakerAnnotations) {
Expand Down
75 changes: 71 additions & 4 deletions src/edu/stanford/nlp/pipeline/QuoteAnnotator.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ public class QuoteAnnotator implements Annotator {
// whether to convert unicode quotes to non-unicode " and '
// before processing
public boolean ASCII_QUOTES = false;
// Whether or not to allow quotes of the same type embedded inside of each other
public boolean ALLOW_EMBEDDED_SAME = false;

// TODO: implement this
// public boolean closeUnclosedQuotes = false;
Expand All @@ -56,6 +58,24 @@ public class QuoteAnnotator implements Annotator {
DIRECTED_QUOTES = Collections.unmodifiableMap(tmp);
}

/**
 * Maps an opening quotation mark to the closing mark it is paired with.
 * Every registered pair uses a closing character that differs from its
 * opening character. The published map is unmodifiable.
 */
public static final Map<String, String> QUOTE_BEGINNERS;
static {
  Map<String, String> openerToCloser = Generics.newHashMap();

  // Directed double quotes, angle quotes and CJK corner brackets.
  openerToCloser.put("“", "”"); // directed double inward
  openerToCloser.put("«", "»"); // guillemets
  openerToCloser.put("‹","›"); // single guillemets
  openerToCloser.put("「", "」"); // cjk brackets
  openerToCloser.put("『", "』"); // cjk brackets

  // Low-9 openers closed by a raised directed quote.
  openerToCloser.put("„","”"); // directed double down/up left pointing
  openerToCloser.put("‚","’"); // directed single down/up left pointing

  // Pairings that are currently left unregistered:
  // openerToCloser.put("‘", "’"); // directed single inward
  // openerToCloser.put("``","''"); // double latex -- single latex quotes don't belong here!
  // openerToCloser.put("`","'"); // single latex
  // openerToCloser.put("\"","\""); // double standard
  // openerToCloser.put("'","'"); // single standard

  QUOTE_BEGINNERS = Collections.unmodifiableMap(openerToCloser);
}

/** Return a QuoteAnnotator that isolates quotes denoted by the
* ASCII characters " and '. If an unclosed quote appears, by default,
* this quote will not be counted as a quote.
Expand Down Expand Up @@ -96,6 +116,7 @@ public QuoteAnnotator(Properties props, boolean verbose) {
USE_SINGLE = Boolean.parseBoolean(props.getProperty("singleQuotes", "false"));
MAX_LENGTH = Integer.parseInt(props.getProperty("maxLength", "-1"));
ASCII_QUOTES = Boolean.parseBoolean(props.getProperty("asciiQuotes", "false"));
ALLOW_EMBEDDED_SAME = Boolean.parseBoolean(props.getProperty("allowEmbeddedSame", "false"));

VERBOSE = verbose;
Timing timer = null;
Expand Down Expand Up @@ -292,6 +313,47 @@ public static Annotation makeQuote(String surfaceForm, int begin, int end,
return quote;
}

// public List<Pair<Integer, Integer>> iterativishQuotes(String text) {
// // This stack will store pairs that are quote
// // kind & the index that it was found at.
// Stack<Pair<String, Integer>> quoteSilo = Generics.newStack();
// List<Pair<Integer, Integer>> quotes = Generics.newArrayList();
//
// for (int i = 0; i < text.length(); i++) {
// // is the character at this index a quote?
// String index = text.substring(i, i + 1);
// // is the character at this index possibly a two-character wide quote?
// // Could this string begin a quote?
// Pair<String, Integer> beginner = null;
// if (i < text.length() - 1 &&
// index.equals("`") &&
// text.substring(i, i + 2).equals("``")) {
// beginner = new Pair<>(text.substring(i, i + 2), i);
// i += 1; // need to advance i so that we don't grab the inner bit also!
// } else if (QUOTE_BEGINNERS.containsKey(index)) {
// beginner = new Pair<>(index, i);
// }
// if (beginner != null) {
// quoteSilo.push(beginner);
// continue; // we don't want to do the end of the loop!
// }
// // Could this string end a quote?
// // is it a two-wide ender?
// Pair<String, Integer> ender = null;
// if (i < text.length() - 1 &&
// index.equals("'") &&
// text.substring(i, i + 2).equals("''")) {
// ender = new Pair<>(text.substring(i, i + 2), i);
// i += 1;
// } else if (QUOTE_BEGINNERS.values().contains(index)) {
// ender = new Pair<>(index, i);
// }
// if (ender != null) {
// quoteSilo.push(ender);
// }
// }
// }

/**
 * Entry point for quote extraction over a whole text.
 * Delegates to {@link #recursiveQuotes} with an offset of 0 and
 * {@code null} as the third argument.
 * NOTE(review): the third argument looks like the enclosing quote kind
 * (the recursive call site passes a quote kind there) -- confirm.
 *
 * @param text the text to scan
 * @return whatever {@code recursiveQuotes} returns for the full text;
 *         presumably (begin, end) index pairs -- verify against that method
 */
public List<Pair<Integer, Integer>> getQuotes(String text) {
  final int topLevelOffset = 0;
  List<Pair<Integer, Integer>> found = recursiveQuotes(text, topLevelOffset, null);
  return found;
}
Expand Down Expand Up @@ -367,7 +429,6 @@ public List<Pair<Integer, Integer>> recursiveQuotes(String text, int offset, Str
quote = null;
}


if (c.length() > 1) {
i += c.length() - 1;
}
Expand Down Expand Up @@ -416,16 +477,22 @@ public List<Pair<Integer, Integer>> recursiveQuotes(String text, int offset, Str
} else {
for (String qKind : quotesMap.keySet()) {
for (Pair<Integer, Integer> q : quotesMap.get(qKind)) {
if (q.first() < q.second() - qKind.length() * 2) {
if (q.second() - q.first() >= qKind.length() * 2) {
String toPass = text.substring(q.first() + qKind.length(),
q.second() - qKind.length());
String qKindToPass = DIRECTED_QUOTES.containsKey(qKind) || qKind.equals("`") ? null : qKind;
String qKindToPass = null;
if (!(DIRECTED_QUOTES.containsKey(qKind) || qKind.equals("`"))
|| !ALLOW_EMBEDDED_SAME) {
qKindToPass = qKind;
}
List<Pair<Integer, Integer>> embedded = recursiveQuotes(toPass,
q.first() + qKind.length() + offset, qKindToPass);
for (Pair<Integer, Integer> e : embedded) {
// don't add offset here because the
// recursive method already added it
quotes.add(new Pair(e.first(), e.second()));
if (e.second() - e.first() > 2) {
quotes.add(new Pair(e.first(), e.second()));
}
}
}
quotes.add(new Pair(q.first() + offset, q.second() + offset));
Expand Down
64 changes: 57 additions & 7 deletions test/src/edu/stanford/nlp/pipeline/QuoteAnnotatorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ public class QuoteAnnotatorTest extends TestCase {
private static StanfordCoreNLP pipelineNoSingleQuotes;
private static StanfordCoreNLP pipelineMaxFive;
private static StanfordCoreNLP pipelineAsciiQuotes;
private static StanfordCoreNLP pipelineAllowEmbeddedSame;

/**
* Initialize the annotators at the start of the unit test.
Expand Down Expand Up @@ -55,9 +56,33 @@ public void setUp() {
props.setProperty("asciiQuotes", "true");
pipelineAsciiQuotes = new StanfordCoreNLP(props);
}
if (pipelineAllowEmbeddedSame == null) {
Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, quote5");
props.setProperty("customAnnotatorClass.quote5", "edu.stanford.nlp.pipeline.QuoteAnnotator");
props.setProperty("allowEmbeddedSame", "true");
pipelineAllowEmbeddedSame = new StanfordCoreNLP(props);
}
}
}

/**
 * Runs a text containing a directed double quote nested inside another
 * directed double quote through the shared {@code pipeline} fixture and
 * checks that two top-level quotes are found and that the second one
 * carries no embedded quotations.
 * NOTE(review): assumes the second argument of {@code runQuotes} is the
 * expected number of top-level quotes -- confirm against the helper.
 */
public void testBasicEmbeddedSameUnicode() {
  String text = "“Hello,” he said, “how “are” you doing?”";
  List<CoreMap> quotes = runQuotes(text, 2, pipeline);
  assertEquals("“Hello,”", quotes.get(0).get(CoreAnnotations.TextAnnotation.class));
  assertEquals("“how “are” you doing?”", quotes.get(1).get(CoreAnnotations.TextAnnotation.class));
  List<CoreMap> embedded = quotes.get(1).get(CoreAnnotations.QuotationsAnnotation.class);
  // Expected value goes first so a failure reports "expected:<0> but was:<n>"
  // rather than the reverse; this also matches the sibling *NoEmbedded tests.
  assertEquals(0, embedded.size());
}

/**
 * Same input as the non-embedded variant, but run through
 * {@code pipelineAllowEmbeddedSame}: the inner directed double quote is
 * expected to show up as embedded in the second top-level quote.
 */
public void testBasicAllowEmbeddedSameUnicode() {
  final String outerQuote = "“how “are” you doing?”";
  final String document = "“Hello,” he said, " + outerQuote;

  List<CoreMap> found = runQuotes(document, 2, pipelineAllowEmbeddedSame);

  String firstText = found.get(0).get(CoreAnnotations.TextAnnotation.class);
  assertEquals("“Hello,”", firstText);

  String secondText = found.get(1).get(CoreAnnotations.TextAnnotation.class);
  assertEquals(outerQuote, secondText);

  assertEmbedded("“are”", outerQuote, found);
}

public void testBasicAsciiQuotes() {
String text = "“Hello,“ he said, “how are you doing?”";
List<CoreMap> quotes = runQuotes(text, 2, pipelineAsciiQuotes);
Expand Down Expand Up @@ -106,53 +131,78 @@ public void testLatexQuotesWithDirectedApostrophes() {

public void testEmbeddedLatexQuotes() {
String text = "``Hello ``how are you doing?''''";
List<CoreMap> quotes = runQuotes(text, 1);
List<CoreMap> quotes = runQuotes(text, 1, pipelineAllowEmbeddedSame);
assertEquals(text, quotes.get(0).get(CoreAnnotations.TextAnnotation.class));
assertEmbedded("``how are you doing?''", text, quotes);
assertInnerAnnotationValues(quotes.get(0), 0, 0, 0, 0, 9);
}

/**
 * Doubled LaTeX-style quotes nested in one another, run through the shared
 * {@code pipeline} fixture: the whole text is expected to be the single
 * quote found, with no embedded quotations attached to it.
 */
public void testEmbeddedLatexQuotesNoEmbedded() {
  final String input = "``Hello ``how are you doing?''''";

  List<CoreMap> found = runQuotes(input, 1, pipeline);
  String outermostText = found.get(0).get(CoreAnnotations.TextAnnotation.class);
  assertEquals(input, outermostText);

  List<CoreMap> nested = found.get(0).get(CoreAnnotations.QuotationsAnnotation.class);
  int nestedCount = nested.size();
  assertEquals(0, nestedCount);
}

public void testEmbeddedSingleLatexQuotes() {
String text = "`Hello `how are you doing?''";
List<CoreMap> quotes = runQuotes(text, 1);
List<CoreMap> quotes = runQuotes(text, 1, pipelineAllowEmbeddedSame);
assertEquals(text, quotes.get(0).get(CoreAnnotations.TextAnnotation.class));
assertEmbedded("`how are you doing?'", text, quotes);
}

public void testEmbeddedLatexQuotesAllEndSamePlace() {
String text = "``Hello ``how `are ``you doing?'''''''";
List<CoreMap> quotes = runQuotes(text, 1);
List<CoreMap> quotes = runQuotes(text, 1, pipelineAllowEmbeddedSame);
assertEquals(text, quotes.get(0).get(CoreAnnotations.TextAnnotation.class));
assertEmbedded("``how `are ``you doing?'''''", text, quotes);
assertEmbedded("`are ``you doing?'''", "``how `are ``you doing?'''''", quotes);
assertEmbedded("``you doing?''", "`are ``you doing?'''", quotes);
}

/**
 * Four doubled LaTeX-style openers whose closers all sit at the end of the
 * text, run through the shared {@code pipeline} fixture: the whole text is
 * expected to be the single quote found, with nothing embedded in it.
 */
public void testEmbeddedLatexQuotesAllEndSamePlaceNoEmbedded() {
  final String input = "``Hello ``how ``are ``you doing?''''''''";

  List<CoreMap> found = runQuotes(input, 1, pipeline);
  String outermostText = found.get(0).get(CoreAnnotations.TextAnnotation.class);
  assertEquals(input, outermostText);

  List<CoreMap> nested = found.get(0).get(CoreAnnotations.QuotationsAnnotation.class);
  int nestedCount = nested.size();
  assertEquals(0, nestedCount);
}

public void testTripleEmbeddedLatexQuotes() {
String text = "``Hel ``lo ``how'' are you'' doing?''";
List<CoreMap> quotes = runQuotes(text, 1);
List<CoreMap> quotes = runQuotes(text, 1, pipelineAllowEmbeddedSame);
assertEquals(text, quotes.get(0).get(CoreAnnotations.TextAnnotation.class));
assertEmbedded("``lo ``how'' are you''", text, quotes);
assertEmbedded("``how''", "``lo ``how'' are you''", quotes);
}

/**
 * Three levels of doubled LaTeX-style quotes: the whole text is expected to
 * be the single quote found, with no embedded quotations attached to it.
 */
public void testTripleEmbeddedLatexQuotesNoEmbedded() {
  final String input = "``Hel ``lo ``how'' are you'' doing?''";

  // Run with pipelineNoSingleQuotes on purpose: per the original author,
  // this case fails when single quotes are also considered.
  List<CoreMap> found = runQuotes(input, 1, pipelineNoSingleQuotes);
  String outermostText = found.get(0).get(CoreAnnotations.TextAnnotation.class);
  assertEquals(input, outermostText);

  List<CoreMap> nested = found.get(0).get(CoreAnnotations.QuotationsAnnotation.class);
  int nestedCount = nested.size();
  assertEquals(0, nestedCount);
}

public void testTripleEmbeddedUnicodeQuotes() {
String text = "“Hel «lo “how” are you» doing?”";
List<CoreMap> quotes = runQuotes(text, 1);
List<CoreMap> quotes = runQuotes(text, 1, pipelineAllowEmbeddedSame);
assertEquals(text, quotes.get(0).get(CoreAnnotations.TextAnnotation.class));
assertEmbedded("«lo “how” are you»", text, quotes);
assertEmbedded("“how”", "«lo “how” are you»", quotes);
}

public void testBasicIgnoreSingleQuotes() {
String text = "“Hello,” he 'said', “how are you doing?”";
List<CoreMap> quotes = runQuotes(text, 2, pipelineNoSingleQuotes);
List<CoreMap> quotes = runQuotes(text, 2, pipelineAllowEmbeddedSame);
assertEquals("“Hello,”", quotes.get(0).get(CoreAnnotations.TextAnnotation.class));
assertEquals("“how are you doing?”", quotes.get(1).get(CoreAnnotations.TextAnnotation.class));

text = "\"'Tis Impossible, “Mr. 'tis “Mr. Bennet” Bennet”, impossible, when 'tis I am not acquainted with him\n" +
" myself; how can you be so teasing?\"";
quotes = runQuotes(text, 1, pipelineNoSingleQuotes);
quotes = runQuotes(text, 1, pipelineAllowEmbeddedSame);
assertEquals(text, quotes.get(0).get(CoreAnnotations.TextAnnotation.class));
assertEmbedded("“Mr. Bennet”", "“Mr. 'tis “Mr. Bennet” Bennet”", quotes);
assertEmbedded("“Mr. 'tis “Mr. Bennet” Bennet”", text, quotes);
Expand Down

0 comments on commit ad6349a

Please sign in to comment.