-
Notifications
You must be signed in to change notification settings - Fork 2.7k
/
ParagraphAnnotator.java
98 lines (85 loc) · 3.27 KB
/
ParagraphAnnotator.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
package edu.stanford.nlp.paragraphs;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Assigns a zero-based paragraph index to every sentence of an annotation.
 *
 * <p>Paragraph boundaries are located by scanning the document text for runs of newline
 * characters. The {@code paragraphBreak} property selects what counts as a boundary:
 * {@code "two"} (the default) requires at least two consecutive newlines, while {@code "one"}
 * treats every run of one or more newlines as a boundary. Each sentence is then tagged with a
 * {@link CoreAnnotations.ParagraphIndexAnnotation}.
 *
 * @author Grace Muzny
 */
public class ParagraphAnnotator implements Annotator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ParagraphAnnotator.class);

  /** Boundary pattern used when {@link #PARAGRAPH_BREAK} is {@code "two"}. */
  private static final Pattern TWO_NEWLINE_BREAK = Pattern.compile("\\n\\n+");

  /** Boundary pattern used when {@link #PARAGRAPH_BREAK} is {@code "one"}. */
  private static final Pattern ONE_NEWLINE_BREAK = Pattern.compile("\\n+");

  private final boolean VERBOSE;

  // How many consecutive newlines are needed to start a new paragraph
  // ["one" | "two"]
  public String PARAGRAPH_BREAK = "two";

  /**
   * Creates a paragraph annotator.
   *
   * @param props properties; {@code paragraphBreak} may be {@code "one"} or {@code "two"}
   *     and defaults to {@code "two"} when absent
   * @param verbose whether progress messages are printed to {@code System.err}
   */
  public ParagraphAnnotator(Properties props, boolean verbose) {
    PARAGRAPH_BREAK = props.getProperty("paragraphBreak", "two");
    VERBOSE = verbose;
  }

  /**
   * Sets a {@link CoreAnnotations.ParagraphIndexAnnotation} on every sentence of the annotation.
   *
   * <p>The first sentence always belongs to paragraph 0. The index grows by exactly one each
   * time one or more paragraph breaks lie between the previous sentence and the current one,
   * so breaks that precede the first sentence, or several breaks separated only by text that
   * contains no sentence, never cause indices to be skipped or sentences of the same paragraph
   * to be split apart.
   *
   * @param annotation the document; its text and sentences (with character offsets) are read
   * @throws IllegalArgumentException if {@link #PARAGRAPH_BREAK} is neither {@code "one"} nor
   *     {@code "two"}
   */
  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      System.err.print("Adding paragraph index annotation (" + PARAGRAPH_BREAK + ") ...");
    }

    Pattern paragraphSplit = paragraphSplitPattern();
    String fullText = annotation.get(CoreAnnotations.TextAnnotation.class);
    List<Integer> paragraphBreaks = findBreakOffsets(paragraphSplit, fullText);

    // each sentence gets a paragraph id annotation
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    int currParagraph = -1;
    // Cursor into paragraphBreaks: the first break that no sentence has passed yet. It is kept
    // separate from currParagraph so that the two cannot drift apart.
    int nextBreak = 0;
    for (CoreMap sent : sentences) {
      int sentBegin = sent.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);

      // Consume every break that starts at or before this sentence, not just one of them.
      boolean crossedBreak = false;
      while (nextBreak < paragraphBreaks.size() && paragraphBreaks.get(nextBreak) <= sentBegin) {
        nextBreak++;
        crossedBreak = true;
      }

      // The very first sentence opens paragraph 0 whether or not a break precedes it.
      if (crossedBreak || currParagraph < 0) {
        currParagraph++;
      }
      sent.set(CoreAnnotations.ParagraphIndexAnnotation.class, currParagraph);
    }

    if (VERBOSE) {
      System.err.println("done");
    }
  }

  /**
   * Resolves the current {@link #PARAGRAPH_BREAK} setting to its boundary pattern. The field is
   * public and mutable, so it is re-read on every call instead of being cached at construction.
   */
  private Pattern paragraphSplitPattern() {
    if ("two".equals(PARAGRAPH_BREAK)) {
      return TWO_NEWLINE_BREAK;
    }
    if ("one".equals(PARAGRAPH_BREAK)) {
      return ONE_NEWLINE_BREAK;
    }
    throw new IllegalArgumentException(
        "Unknown paragraphBreak value \"" + PARAGRAPH_BREAK + "\"; expected \"one\" or \"two\"");
  }

  /** Returns the start offset of every match of {@code paragraphSplit} in {@code text}, in order. */
  private static List<Integer> findBreakOffsets(Pattern paragraphSplit, String text) {
    List<Integer> offsets = new ArrayList<>();
    Matcher m = paragraphSplit.matcher(text);
    while (m.find()) {
      // get the starting index
      offsets.add(m.start());
    }
    return offsets;
  }

  /** Reports that this annotator provides {@link CoreAnnotations.ParagraphIndexAnnotation}. */
  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return Collections.singleton(CoreAnnotations.ParagraphIndexAnnotation.class);
  }

  /** Lists the annotations that must already be present before this annotator runs. */
  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return new HashSet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class,
        CoreAnnotations.BeforeAnnotation.class,
        CoreAnnotations.AfterAnnotation.class,
        CoreAnnotations.TokenBeginAnnotation.class,
        CoreAnnotations.TokenEndAnnotation.class,
        CoreAnnotations.IndexAnnotation.class,
        CoreAnnotations.OriginalTextAnnotation.class
    ));
  }

}