src/edu/stanford/nlp/paragraphs/ParagraphAnnotator.java

package edu.stanford.nlp.paragraphs;

import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.logging.Redwood;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @author Grace Muzny
 */
public class ParagraphAnnotator implements Annotator {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ParagraphAnnotator.class);

  private final boolean VERBOSE;
  private final boolean DEBUG = true;

  // Whether or not to allow quotes of the same type embedded inside of each other
  // ["one" | "two"]
  public String PARAGRAPH_BREAK = "two";

  public ParagraphAnnotator(Properties props, boolean verbose) {
    PARAGRAPH_BREAK = props.getProperty("paragraphBreak", "two");
    VERBOSE = verbose;
  }

  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      System.err.print("Adding paragraph index annotation (" + PARAGRAPH_BREAK + ") ...");
    }
    Pattern paragraphSplit = null;
    if (PARAGRAPH_BREAK.equals("two")) {
      paragraphSplit = Pattern.compile("\\n\\n+");
    } else if (PARAGRAPH_BREAK.equals("one")) {
      paragraphSplit = Pattern.compile("\\n+");
    }

    String fullText = annotation.get(CoreAnnotations.TextAnnotation.class);
    Matcher m = paragraphSplit.matcher(fullText);
    List<Integer> paragraphBreaks = Generics.newArrayList();
    while (m.find()) {
      // get the staring index
      paragraphBreaks.add(m.start());
    }

    // each sentence gets a paragraph id annotation
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    int currParagraph = -1;
    int nextParagraphStartIndex = -1;
    for (CoreMap sent : sentences) {
      int sentBegin = sent.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      if (sentBegin >= nextParagraphStartIndex) {
        if (currParagraph + 1 < paragraphBreaks.size()) {
          nextParagraphStartIndex = paragraphBreaks.get(currParagraph + 1);
        } else {
          nextParagraphStartIndex = fullText.length();
        }
        currParagraph++;
      }
      sent.set(CoreAnnotations.ParagraphIndexAnnotation.class, currParagraph);
    }
    if (VERBOSE) {
      System.err.println("done");
    }
  }

  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return Collections.singleton(CoreAnnotations.ParagraphIndexAnnotation.class);
  }

  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return new HashSet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class,
        CoreAnnotations.BeforeAnnotation.class,
        CoreAnnotations.AfterAnnotation.class,
        CoreAnnotations.TokenBeginAnnotation.class,
        CoreAnnotations.TokenEndAnnotation.class,
        CoreAnnotations.IndexAnnotation.class,
        CoreAnnotations.OriginalTextAnnotation.class
    ));
  }

}