package edu.stanford.nlp.quoteattribution.Sieves.training;
import edu.stanford.nlp.classify.Classifier;
import edu.stanford.nlp.classify.GeneralDataset;
import edu.stanford.nlp.classify.RVFDataset;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.RVFDatum;
import edu.stanford.nlp.paragraphs.ParagraphAnnotator;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.QuoteAttributionAnnotator;
import edu.stanford.nlp.quoteattribution.*;
import edu.stanford.nlp.quoteattribution.Sieves.Sieve;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.*;
/**
* Created by mjfang on 12/1/16.
*/
public class SupervisedSieveTraining {
// Shared Sieve instance, (re)built per document inside featurize(); used to access its helper functions.
private static Sieve sieve; // use to access functions
// Take in a training Annotated document:
// convert document to dataset & featurize
// train classifier
// output model
// report training F1/accuracy
// Token words counted as punctuation for the "punctuationPresence" features (tokens between mention and quote).
public static final Set<String> punctuation = new HashSet<>(Arrays.asList(new String[]{",", ".", "\"", "\n"}));
// Lemmas counted as punctuation for the prev/next-word features around a mention.
public static final Set<String> punctuationForFeatures = new HashSet<>(Arrays.asList(new String[]{",", ".", "!", "?"})); //TODO: in original iteration: maybe can combine these!?
//given a sentence, return the begin token of the paragraph it's in
// Given a sentence, return the index of the first token of the paragraph containing it.
// Walks backwards from the sentence while the paragraph index stays the same, keeping
// the earliest token-begin seen.
private static int getParagraphBeginToken(CoreMap sentence, List<CoreMap> sentences) {
  int paragraphId = sentence.get(CoreAnnotations.ParagraphIndexAnnotation.class);
  int beginToken = sentence.get(CoreAnnotations.TokenBeginAnnotation.class);
  int sentIdx = sentence.get(CoreAnnotations.SentenceIndexAnnotation.class) - 1;
  while (sentIdx >= 0) {
    CoreMap earlier = sentences.get(sentIdx);
    if (earlier.get(CoreAnnotations.ParagraphIndexAnnotation.class) != paragraphId) {
      break; // left the paragraph
    }
    beginToken = earlier.get(CoreAnnotations.TokenBeginAnnotation.class);
    sentIdx--;
  }
  return beginToken;
}
// Given a sentence, return the index of the last token of the paragraph containing it.
// Walks forward (starting at the sentence itself, matching the original) while the
// paragraph index stays the same, keeping the latest token-end seen.
private static int getParagraphEndToken(CoreMap sentence, List<CoreMap> sentences) {
  int paragraphId = sentence.get(CoreAnnotations.ParagraphIndexAnnotation.class);
  int endToken = sentence.get(CoreAnnotations.TokenEndAnnotation.class) - 1;
  int sentIdx = sentence.get(CoreAnnotations.SentenceIndexAnnotation.class);
  while (sentIdx < sentences.size()) {
    CoreMap later = sentences.get(sentIdx);
    if (later.get(CoreAnnotations.ParagraphIndexAnnotation.class) != paragraphId) {
      break; // left the paragraph
    }
    endToken = later.get(CoreAnnotations.TokenEndAnnotation.class) - 1;
    sentIdx++;
  }
  return endToken;
}
/**
 * Groups the document's quotes by the paragraph index of the sentence each quote starts in.
 *
 * @param doc annotated document carrying quotation, sentence and paragraph-index annotations
 * @return map from paragraph index to the quotes beginning in that paragraph, in document order
 */
private static Map<Integer, List<CoreMap>> getQuotesInParagraph(Annotation doc) {
  List<CoreMap> quotes = doc.get(CoreAnnotations.QuotationsAnnotation.class);
  List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
  Map<Integer, List<CoreMap>> paragraphToQuotes = new HashMap<>();
  for (CoreMap quote : quotes) {
    CoreMap sentence = sentences.get(quote.get(CoreAnnotations.SentenceBeginAnnotation.class));
    int paragraphIdx = sentence.get(CoreAnnotations.ParagraphIndexAnnotation.class);
    // computeIfAbsent replaces the original putIfAbsent + get double lookup
    paragraphToQuotes.computeIfAbsent(paragraphIdx, k -> new ArrayList<>()).add(quote);
  }
  return paragraphToQuotes;
}
//assumption: exclusion list is non-overlapping, ordered
// Subtracts the (inclusive) token ranges in exclusionList from originalRange and returns the
// leftover pieces of originalRange that are not covered by any excluded range. Used below to
// find the non-quote stretches of a paragraph, given its quote spans as the exclusion list.
private static List<Pair<Integer, Integer>> getRangeExclusion(Pair<Integer, Integer> originalRange, List<Pair<Integer, Integer>> exclusionList) {
List<Pair<Integer, Integer>> leftoverRanges = new ArrayList<>();
Pair<Integer, Integer> currRange = originalRange;
for(Pair<Integer, Integer> exRange : exclusionList) {
// piece of currRange strictly before this excluded range (may be empty)
Pair<Integer, Integer> leftRange = new Pair<>(currRange.first, exRange.first - 1);
if(leftRange.second - leftRange.first >= 0) {
leftoverRanges.add(leftRange);
}
// if the excluded range ends exactly where currRange ends, nothing remains to the right
if(currRange.second == exRange.second) {
break;
}
else {
// keep processing the remainder to the right of the excluded range
currRange = new Pair<>(exRange.second + 1, currRange.second);
}
}
// NOTE(review): strict '<' here drops a trailing single-token remainder (first == second),
// while the 'leftRange' check above keeps single-token pieces (>= 0) — looks like an
// off-by-one inconsistency; confirm intent before changing, as features depend on it.
if(currRange.first < currRange.second)
leftoverRanges.add(currRange);
return leftoverRanges;
}
/**
 * Output bundle of {@link #featurize}: the featurized dataset together with the
 * bookkeeping maps needed later to score the best mention per quote.
 */
public static class FeaturesData {
  // featurized datums, one per (quote, candidate mention) pair
  public GeneralDataset<String, String> dataset;
  // quote index -> inclusive [first, last] datum indices belonging to that quote
  public Map<Integer, Pair<Integer, Integer>> mapQuoteToDataRange;
  // datum index -> the candidate mention the datum was built from
  public Map<Integer, Sieve.MentionData> mapDatumToMention;

  public FeaturesData(Map<Integer, Pair<Integer, Integer>> mapQuoteToDataRange, Map<Integer, Sieve.MentionData> mapDatumToMention, GeneralDataset<String, String> dataset) {
    this.dataset = dataset;
    this.mapDatumToMention = mapDatumToMention;
    this.mapQuoteToDataRange = mapQuoteToDataRange;
  }
}
/**
 * Input bundle for {@link #featurize}: the annotated document plus the external
 * resources the sieve consumes.
 */
public static class SieveData {
  Annotation doc;
  // character name string -> Person records for that character (see usage in featurize)
  Map<String, List<Person>> characterMap;
  // presumably token index of a pronoun -> resolved speaker name (built by coref setup) — TODO confirm against Sieve
  Map<Integer, String> pronounCorefMap;
  // presumably the set of words considered animate — TODO confirm against Sieve
  Set<String> animacyList;

  public SieveData(Annotation doc, Map<String, List<Person>> characterMap, Map<Integer, String> pronounCorefMap, Set<String> animacyList) {
    this.doc = doc;
    this.characterMap = characterMap;
    this.pronounCorefMap = pronounCorefMap;
    this.animacyList = animacyList;
  }
}
//goldList null if not training
/**
 * Converts each quote in the document into RVF datums over its candidate speaker mentions
 * (mentions found backward through the quote's own and previous paragraph, and forward after
 * the quote), extracting distance, punctuation, paragraph, quote and vocative features.
 *
 * Fixes relative to the original:
 * 1) gold.speaker was compared with '==' (reference equality on String); now content equality.
 * 2) the "isNotMention" branch recorded mapDatumToMention AFTER dataset.add, mapping the
 *    mention to the wrong (next) datum index; datum emission is now consolidated so every
 *    label records the pre-add index, consistent with the other branches.
 *
 * @param sd         document plus character map, pronoun coref map and animacy set
 * @param goldList   gold speaker info parallel to the quote list; may be null when
 *                   {@code isTraining} is false
 * @param isTraining when true, datums are labeled "isMention"/"isNotMention" against the gold
 *                   mention span; when false, all datums get the placeholder label "none"
 * @return the dataset plus maps from quote index to its datum range and from datum index to
 *         the mention it featurizes
 * @throws RuntimeException if the gold list size mismatches the quote list, or a quote cannot
 *         be located in its own paragraph's quote list
 */
public static FeaturesData featurize(SieveData sd, List<XMLToAnnotation.GoldQuoteInfo> goldList, boolean isTraining) {
  Annotation doc = sd.doc;
  sieve = new Sieve(doc, sd.characterMap, sd.pronounCorefMap, sd.animacyList);
  List<CoreMap> quotes = doc.get(CoreAnnotations.QuotationsAnnotation.class);
  List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
  List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
  Map<Integer, List<CoreMap>> paragraphToQuotes = getQuotesInParagraph(doc);
  GeneralDataset<String, String> dataset = new RVFDataset<>();
  // bookkeeping necessary for 'ScoreBestMention'
  Map<Integer, Pair<Integer, Integer>> mapQuoteToDataRange = new HashMap<>(); // quote idx -> [first, last] datum indices
  Map<Integer, Sieve.MentionData> mapDatumToMention = new HashMap<>();        // datum idx -> mention
  if (isTraining && goldList.size() != quotes.size()) {
    throw new RuntimeException("Gold Quote List size doesn't match quote list size!");
  }
  for (int quoteIdx = 0; quoteIdx < quotes.size(); quoteIdx++) {
    int initialSize = dataset.size();
    CoreMap quote = quotes.get(quoteIdx);
    XMLToAnnotation.GoldQuoteInfo gold = null;
    if (isTraining) {
      gold = goldList.get(quoteIdx);
      // FIX: was 'gold.speaker == ""' — reference comparison; skip quotes with no gold speaker.
      if ("".equals(gold.speaker)) {
        continue;
      }
    }
    CoreMap quoteFirstSentence = sentences.get(quote.get(CoreAnnotations.SentenceBeginAnnotation.class));
    Pair<Integer, Integer> quoteRun = new Pair<>(quote.get(CoreAnnotations.TokenBeginAnnotation.class), quote.get(CoreAnnotations.TokenEndAnnotation.class));
    int quoteParagraphIdx = quoteFirstSentence.get(CoreAnnotations.ParagraphIndexAnnotation.class);

    // Backward span: mentions before the quote, extending left through the previous paragraph.
    int rightValue = quoteRun.first - 1;
    int leftValue = quoteRun.first - 1;
    // move left value to be the first token idx of the previous paragraph
    for (int sentIdx = quote.get(CoreAnnotations.SentenceBeginAnnotation.class); sentIdx >= 0; sentIdx--) {
      CoreMap sentence = sentences.get(sentIdx);
      if (sentence.get(CoreAnnotations.ParagraphIndexAnnotation.class) == quoteParagraphIdx) {
        continue; // still inside the quote's own paragraph
      }
      if (sentence.get(CoreAnnotations.ParagraphIndexAnnotation.class) == quoteParagraphIdx - 1) {
        leftValue = sentence.get(CoreAnnotations.TokenBeginAnnotation.class);
      } else {
        break; // past the previous paragraph
      }
    }
    List<Sieve.MentionData> mentionsInPreviousParagraph = eliminateDuplicates(sieve.findClosestMentionsInSpanBackward(new Pair<>(leftValue, rightValue)));

    // Forward span: mentions after the quote.
    leftValue = quoteRun.second + 1;
    rightValue = quoteRun.second + 1;
    for (int sentIdx = quote.get(CoreAnnotations.SentenceEndAnnotation.class); sentIdx < sentences.size(); sentIdx++) {
      CoreMap sentence = sentences.get(sentIdx);
      // NOTE(review): this matches the QUOTE's own paragraph index (an in-line comment in the
      // original suggested "quoteParagraphIdx + 1"), so the forward span only grows while the
      // quote's paragraph continues past the quote — preserved as-is; confirm before changing.
      if (sentence.get(CoreAnnotations.ParagraphIndexAnnotation.class) == quoteParagraphIdx) {
        rightValue = sentence.get(CoreAnnotations.TokenEndAnnotation.class) - 1;
      } else {
        break;
      }
    }
    List<Sieve.MentionData> mentionsInNextParagraph = sieve.findClosestMentionsInSpanForward(new Pair<>(leftValue, rightValue));

    List<Sieve.MentionData> candidateMentions = new ArrayList<>();
    candidateMentions.addAll(mentionsInPreviousParagraph);
    candidateMentions.addAll(mentionsInNextParagraph);

    int rankedDistance = 1;
    int numBackwards = mentionsInPreviousParagraph.size();
    for (Sieve.MentionData mention : candidateMentions) {
      Counter<String> features = new ClassicCounter<>();

      // 1. word-distance between mention and quote (side determined by sign)
      boolean isLeft = true;
      int distance = quoteRun.first - mention.end;
      if (distance < 0) { // mention is after the quote
        isLeft = false;
        distance = mention.begin - quoteRun.second;
      }
      if (distance < 0) {
        continue; // disregard mention-in-quote cases.
      }
      features.setCount("wordDistance", distance);

      // punctuation occurring between the mention and the quote
      List<CoreLabel> betweenTokens;
      if (isLeft) {
        betweenTokens = tokens.subList(mention.end + 1, quoteRun.first);
      } else {
        betweenTokens = tokens.subList(quoteRun.second + 1, mention.begin);
      }
      for (CoreLabel token : betweenTokens) {
        if (punctuation.contains(token.word())) {
          features.setCount("punctuationPresence:" + token.word(), 1);
        }
      }

      // number of mentions away
      features.setCount("rankedDistance", rankedDistance);
      rankedDistance++;
      // TODO(review): suspected off-by-one — resets after numBackwards-1 backward mentions
      // (and skipped mentions above still advanced the counter); preserved because shipped
      // models were trained with this exact behavior.
      if (rankedDistance == numBackwards) { // reset for the forward mentions
        rankedDistance = 1;
      }

      // paragraph-distance features: how many paragraphs away the mention lies
      int mentionParagraphIdx = -1;
      CoreMap sentenceInMentionParagraph = null;
      int quoteParagraphBeginToken = getParagraphBeginToken(quoteFirstSentence, sentences);
      int quoteParagraphEndToken = getParagraphEndToken(quoteFirstSentence, sentences);
      if (isLeft) {
        if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken) {
          features.setCount("leftParagraphDistance", 0);
          mentionParagraphIdx = quoteParagraphIdx;
          sentenceInMentionParagraph = quoteFirstSentence;
        } else {
          int paragraphDistance = 1;
          int currParagraphIdx = quoteParagraphIdx - paragraphDistance;
          CoreMap currSentence = quoteFirstSentence;
          int currSentenceIdx = currSentence.get(CoreAnnotations.SentenceIndexAnnotation.class);
          while (currParagraphIdx >= 0) {
            // back currSentence up until it lies in paragraph currParagraphIdx
            while (currSentence.get(CoreAnnotations.ParagraphIndexAnnotation.class) != currParagraphIdx) {
              currSentenceIdx--;
              currSentence = sentences.get(currSentenceIdx);
            }
            int prevParagraphBegin = getParagraphBeginToken(currSentence, sentences);
            int prevParagraphEnd = getParagraphEndToken(currSentence, sentences);
            if (prevParagraphBegin <= mention.begin && mention.end <= prevParagraphEnd) {
              mentionParagraphIdx = currParagraphIdx;
              sentenceInMentionParagraph = currSentence;
              features.setCount("leftParagraphDistance", paragraphDistance);
              if (paragraphDistance % 2 == 0)
                features.setCount("leftParagraphDistanceEven", 1);
              break;
            }
            paragraphDistance++;
            currParagraphIdx--;
          }
        }
      } else { // mention is to the right of the quote
        if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken) {
          features.setCount("rightParagraphDistance", 0);
          sentenceInMentionParagraph = quoteFirstSentence;
          mentionParagraphIdx = quoteParagraphIdx;
        } else {
          int paragraphDistance = 1;
          int nextParagraphIndex = quoteParagraphIdx + paragraphDistance;
          CoreMap currSentence = quoteFirstSentence;
          int currSentenceIdx = currSentence.get(CoreAnnotations.SentenceIndexAnnotation.class);
          while (currSentenceIdx < sentences.size()) {
            // advance currSentence until it lies in paragraph nextParagraphIndex
            while (currSentence.get(CoreAnnotations.ParagraphIndexAnnotation.class) != nextParagraphIndex) {
              currSentenceIdx++;
              currSentence = sentences.get(currSentenceIdx);
            }
            int nextParagraphBegin = getParagraphBeginToken(currSentence, sentences);
            int nextParagraphEnd = getParagraphEndToken(currSentence, sentences);
            if (nextParagraphBegin <= mention.begin && mention.end <= nextParagraphEnd) {
              sentenceInMentionParagraph = currSentence;
              // NOTE(review): unlike the left branch, mentionParagraphIdx is NOT set here, so
              // it stays -1 and quotesInMentionParagraph below falls back to an empty list —
              // preserved as-is; confirm whether intended before changing features.
              features.setCount("rightParagraphDistance", paragraphDistance);
              break;
            }
            paragraphDistance++;
            nextParagraphIndex++;
          }
        }
      }

      // 2. mention-paragraph features (only when the mention is outside the quote's paragraph)
      // NOTE(review): sentenceInMentionParagraph can still be null if neither search matched.
      int mentionParagraphBegin = getParagraphBeginToken(sentenceInMentionParagraph, sentences);
      int mentionParagraphEnd = getParagraphEndToken(sentenceInMentionParagraph, sentences);
      if (!(mentionParagraphBegin == quoteParagraphBeginToken && mentionParagraphEnd == quoteParagraphEndToken)) {
        List<CoreMap> quotesInMentionParagraph = paragraphToQuotes.getOrDefault(mentionParagraphIdx, new ArrayList<>());
        Pair<ArrayList<String>, ArrayList<Pair<Integer, Integer>>> namesInMentionParagraph = sieve.scanForNames(new Pair<>(mentionParagraphBegin, mentionParagraphEnd));
        features.setCount("quotesInMentionParagraph", quotesInMentionParagraph.size());
        features.setCount("wordsInMentionParagraph", mentionParagraphEnd - mentionParagraphBegin + 1);
        features.setCount("namesInMentionParagraph", namesInMentionParagraph.first.size());
        // mention ordering among the names in its paragraph
        for (int i = 0; i < namesInMentionParagraph.second.size(); i++) {
          if (ExtractQuotesUtil.rangeContains(new Pair<>(mention.begin, mention.end), namesInMentionParagraph.second.get(i)))
            features.setCount("orderInParagraph", i);
        }
        // whether the mention's paragraph is entirely a single quote (pure dialogue)
        if (quotesInMentionParagraph.size() == 1) {
          CoreMap qInMentionParagraph = quotesInMentionParagraph.get(0);
          if (qInMentionParagraph.get(CoreAnnotations.TokenBeginAnnotation.class) == mentionParagraphBegin && qInMentionParagraph.get(CoreAnnotations.TokenEndAnnotation.class) - 1 == mentionParagraphEnd) {
            features.setCount("mentionParagraphIsInConversation", 1);
          } else {
            features.setCount("mentionParagraphIsInConversation", -1);
          }
        }
        // whether the mention falls inside any quote of its paragraph
        for (CoreMap quoteIMP : quotesInMentionParagraph) {
          if (ExtractQuotesUtil.rangeContains(new Pair<>(quoteIMP.get(CoreAnnotations.TokenBeginAnnotation.class), quoteIMP.get(CoreAnnotations.TokenEndAnnotation.class) - 1), new Pair<>(mention.begin, mention.end)))
            features.setCount("mentionInQuote", 1);
        }
        if (features.getCount("mentionInQuote") != 1)
          features.setCount("mentionNotInQuote", 1);
      }

      // 3. syntax/punctuation of the words flanking the mention
      CoreLabel prevWord = tokens.get(mention.begin - 1);
      CoreLabel nextWord = tokens.get(mention.end + 1);
      features.setCount("prevWordType:" + prevWord.tag(), 1);
      features.setCount("nextWordType:" + nextWord.tag(), 1);
      if (punctuationForFeatures.contains(prevWord.lemma()))
        features.setCount("prevWordPunct:" + prevWord.lemma(), 1);
      if (punctuationForFeatures.contains(nextWord.lemma()))
        features.setCount("nextWordPunct:" + nextWord.lemma(), 1);

      // 4. quote-paragraph features
      List<CoreMap> quotesInQuoteParagraph = paragraphToQuotes.get(quoteParagraphIdx);
      features.setCount("QuotesInQuoteParagraph", quotesInQuoteParagraph.size());
      features.setCount("WordsInQuoteParagraph", quoteParagraphEndToken - quoteParagraphBeginToken + 1);
      features.setCount("NamesInQuoteParagraph", sieve.scanForNames(new Pair<>(quoteParagraphBeginToken, quoteParagraphEndToken)).first.size());

      // 5. quote features
      features.setCount("quoteLength", quote.get(CoreAnnotations.TokenEndAnnotation.class) - quote.get(CoreAnnotations.TokenBeginAnnotation.class) + 1);
      for (int i = 0; i < quotesInQuoteParagraph.size(); i++) {
        if (quotesInQuoteParagraph.get(i).equals(quote)) {
          features.setCount("quotePosition", i + 1);
        }
      }
      if (features.getCount("quotePosition") == 0)
        throw new RuntimeException("Check this (equality not working)");

      Pair<ArrayList<String>, ArrayList<Pair<Integer, Integer>>> namesData = sieve.scanForNames(quoteRun);
      for (String name : namesData.first) {
        features.setCount("charactersInQuote:" + sd.characterMap.get(name).get(0).name, 1);
      }

      // whether the quote encompasses its entire paragraph (implicit-speaker convention)
      if (quote.get(CoreAnnotations.TokenBeginAnnotation.class) == quoteParagraphBeginToken && quote.get(CoreAnnotations.TokenEndAnnotation.class) == quoteParagraphEndToken) {
        features.setCount("isImplicitSpeaker", 1);
      } else {
        features.setCount("isImplicitSpeaker", -1);
      }

      // 6. vocative detection: is the candidate's character named inside nearby text?
      if (mention.type.equals("name")) {
        List<Person> pList = sd.characterMap.get(sieve.tokenRangeToString(new Pair<>(mention.begin, mention.end)));
        Person p;
        if (pList != null)
          p = pList.get(0);
        else
          p = sd.characterMap.get(sieve.scanForNames(new Pair<>(mention.begin, mention.end)).first.get(0)).get(0);
        for (String name : namesData.first) {
          if (p.aliases.contains(name))
            features.setCount("nameInQuote", 1);
        }
        if (quoteParagraphIdx > 0) {
          // scan the previous paragraph's quotes and non-quote stretches for the character
          List<CoreMap> quotesInPrevParagraph = paragraphToQuotes.getOrDefault(quoteParagraphIdx - 1, new ArrayList<>());
          List<Pair<Integer, Integer>> exclusionList = new ArrayList<>();
          for (CoreMap quoteIPP : quotesInPrevParagraph) {
            Pair<Integer, Integer> quoteRange = new Pair<>(quoteIPP.get(CoreAnnotations.TokenBeginAnnotation.class), quoteIPP.get(CoreAnnotations.TokenEndAnnotation.class));
            exclusionList.add(quoteRange);
            for (String name : sieve.scanForNames(quoteRange).first) {
              if (p.aliases.contains(name))
                features.setCount("nameInPrevParagraphQuote", 1);
            }
          }
          int sentenceIdx = quoteFirstSentence.get(CoreAnnotations.SentenceIndexAnnotation.class);
          CoreMap sentenceInPrevParagraph = null;
          for (int i = sentenceIdx - 1; i >= 0; i--) {
            CoreMap currSentence = sentences.get(i);
            if (currSentence.get(CoreAnnotations.ParagraphIndexAnnotation.class) == quoteParagraphIdx - 1) {
              sentenceInPrevParagraph = currSentence;
              break;
            }
          }
          int prevParagraphBegin = getParagraphBeginToken(sentenceInPrevParagraph, sentences);
          int prevParagraphEnd = getParagraphEndToken(sentenceInPrevParagraph, sentences);
          List<Pair<Integer, Integer>> prevParagraphNonQuoteRuns = getRangeExclusion(new Pair<>(prevParagraphBegin, prevParagraphEnd), exclusionList);
          for (Pair<Integer, Integer> nonQuoteRange : prevParagraphNonQuoteRuns) {
            for (String name : sieve.scanForNames(nonQuoteRange).first) {
              if (p.aliases.contains(name))
                features.setCount("nameInPrevParagraphNonQuote", 1);
            }
          }
        }
      }

      // Emit the datum. The datum's id and its mapDatumToMention key are both its index in
      // the dataset, i.e. the dataset size BEFORE adding.
      String label;
      if (!isTraining) {
        label = "none";
      } else if (QuoteAttributionUtils.rangeContains(new Pair<>(gold.mentionStartTokenIndex, gold.mentionEndTokenIndex), new Pair<>(mention.begin, mention.end))) {
        label = "isMention";
      } else {
        label = "isNotMention";
      }
      RVFDatum<String, String> datum = new RVFDatum<>(features, label);
      datum.setID(Integer.toString(dataset.size()));
      // FIX: original "isNotMention" branch did dataset.add BEFORE this put, mapping the
      // mention to the wrong (next) datum index; now consistent across all three labels.
      mapDatumToMention.put(dataset.size(), mention);
      dataset.add(datum);
    }
    mapQuoteToDataRange.put(quoteIdx, new Pair<>(initialSize, dataset.size() - 1));
  }
  return new FeaturesData(mapQuoteToDataRange, mapDatumToMention, dataset);
}
//TODO: potential bug in previous iteration: not implementing order reversal in eliminateDuplicates
// Removes repeated mention candidates, keeping the first occurrence of each mention text.
// Pronoun-typed mentions are never dropped, even if their text was seen before.
// (Evaluation order of the condition matches the original: seen-check first.)
private static List<Sieve.MentionData> eliminateDuplicates(List<Sieve.MentionData> mentionCandidates) {
  List<Sieve.MentionData> deduped = new ArrayList<>();
  Set<String> seenTexts = new HashSet<>();
  for (Sieve.MentionData candidate : mentionCandidates) {
    if (!seenTexts.contains(candidate.text) || candidate.type.equals("Pronoun")) {
      deduped.add(candidate);
    }
    seenTexts.add(candidate.text);
  }
  return deduped;
}
/**
 * Serializes the trained classifier to the given file path.
 * Uses try-with-resources so both streams are closed even when writing fails (the original
 * leaked the FileOutputStream if the ObjectOutputStream constructor threw, and never closed
 * it on any exception path). I/O failures are printed rather than propagated, preserving the
 * original best-effort contract.
 *
 * @param fileName path of the model file to write
 * @param clf      classifier to serialize
 */
public static void outputModel(String fileName, Classifier<String, String> clf) {
  try (ObjectOutputStream so = new ObjectOutputStream(new FileOutputStream(fileName))) {
    so.writeObject(clf);
    so.flush();
  } catch (IOException e) {
    // FileNotFoundException is an IOException subclass; both were handled identically before.
    e.printStackTrace();
  }
}
/**
 * Trains the supervised quote-attribution classifier on the gold data and serializes the
 * resulting model to the path in props' "modelPath".
 * Requires props: "charactersPath" (character list file), "booknlpCoref" (book-nlp coref
 * output) and "modelPath" (output file).
 */
public static void train(XMLToAnnotation.Data data, Properties props) {
Map<String, List<Person>> characterMap = QuoteAttributionUtils.readPersonMap(props.getProperty("charactersPath"));
Map<Integer, String> pronounCorefMap =
QuoteAttributionUtils.setupCoref(props.getProperty("booknlpCoref"), characterMap, data.doc);
Set<String> animacyList = QuoteAttributionUtils.readAnimacyList(QuoteAttributionAnnotator.ANIMACY_WORD_LIST);
// isTraining=true: datums get gold isMention/isNotMention labels from data.goldList
FeaturesData fd = featurize(new SieveData(data.doc, characterMap, pronounCorefMap, animacyList), data.goldList, true);
ExtractQuotesClassifier quotesClassifier = new ExtractQuotesClassifier(fd.dataset);
outputModel(props.getProperty("modelPath"), quotesClassifier.getClassifier());
}
/**
 * Entry point for training from the XML gold format.
 * Reads a props file (args[0] = optional base directory, args[1] = optional props file name),
 * annotates the document with paragraphs, quote attribution and chapters, then trains and
 * serializes the model via {@code train}.
 * NOTE(review): the QuoteAttributionAnnotator runs before training here — presumably to add
 * annotations featurize depends on; confirm the intended ordering with ChapterAnnotator.
 */
public static void main(String[] args) throws Exception {
String home = "/home/mjfang/action_grammars/";
// make the first argument one for a base directory
String specificFile = "1PPDevUncollapsed.props";
if (args.length >= 1) {
home = args[0];
}
if (args.length >= 2) {
specificFile = args[1];
}
System.out.println("Base directory: " + home);
Properties props = StringUtils.propFileToProperties(home + "ExtractQuotesXMLScripts/" + specificFile);
XMLToAnnotation.Data data = XMLToAnnotation.readXMLFormat(props.getProperty("file"));
// annotate paragraph indices ("one" = single newline breaks paragraphs)
Properties propsPara = new Properties();
propsPara.setProperty("paragraphBreak", "one");
ParagraphAnnotator pa = new ParagraphAnnotator(propsPara, false);
pa.annotate(data.doc);
// forward the paths the QuoteAttributionAnnotator and train() need
Properties annotatorProps = new Properties();
annotatorProps.setProperty("charactersPath", props.getProperty("charactersPath")); //"characterList.txt"
annotatorProps.setProperty("booknlpCoref", props.getProperty("booknlpCoref"));
annotatorProps.setProperty("modelPath", props.getProperty("modelPath"));//"model.ser");
QuoteAttributionAnnotator qaa = new QuoteAttributionAnnotator(annotatorProps);
qaa.annotate(data.doc);
ChapterAnnotator ca = new ChapterAnnotator();
ca.annotate(data.doc);
train(data, annotatorProps);
}
}