HeaderNameTagger.java example

Explorer
MinorThird-master
import edu.cmu.minorthird.text.*;
import edu.cmu.minorthird.text.mixup.*;
import edu.cmu.minorthird.text.learn.*;

import java.util.*;

/**
 * Finds all tokens in a designated set of "header fields" for a
 * document, and marks every occurrence of these tokens in a document
 * as with the "headerName" property.
 */

public class HeaderNameTagger extends AbstractAnnotator
{
	static private final String HEADER_NAME_ANNOTATION = "headerNames";
	static private final String HEADER_NAME_PROP = "headerName";

	protected void doAnnotate(MonotonicTextLabels labels)
	{

		// first find header areas
		try {
			MixupProgram prog = new MixupProgram(	new String[] 
				{ "defSpanType _header =top~ re '\\n(From|To|Cc):\\s*\\S+\\s+([^\\n]+)',2" } );
                        MixupInterpreter interp = new MixupInterpreter(prog);
			interp.eval(labels);
		} catch (Mixup.ParseException e) {
			throw new IllegalStateException("mixup error: "+e);
		}

		for (Span.Looper i=labels.getTextBase().documentSpanIterator(); i.hasNext(); ) {

			Span doc = i.nextSpan();

			// collect tokens appearing in the HEADER_FIELDS of this document
			Set headerNames = new HashSet();
			for (Span.Looper k=labels.instanceIterator("_header", doc.getDocumentId()); k.hasNext(); ) {
				Span span = k.nextSpan();
				for (int h=0; h<span.size(); h++) {
					headerNames.add( span.getToken(h).getValue().toLowerCase() );
				}
			}

			// mark all occurrences of these words in this document 
			for (int j=0; j<doc.size(); j++) {
				Token token = doc.getTextToken(j);
				if (headerNames.contains( token.getValue().toLowerCase() )) {
					labels.setProperty( token, HEADER_NAME_PROP, "t" );
				}
			}
		}

		// provide the annotation
		labels.setAnnotatedBy( HEADER_NAME_ANNOTATION );
	}

	public String explainAnnotation(TextLabels e, Span s)
	{
		return "not implemented";
	}
}