package info.ephyra.questionanalysis;

import info.ephyra.answerselection.filters.TruncationFilter;
import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.OpenNLP;
import info.ephyra.util.Dictionary;
import info.ephyra.util.StringUtils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Extracts single- and multi-token terms from a sentence. Multi-token terms
 * are named entities or compound terms found in dictionaries.
 * 
 * @author Nico Schlaefer
 * @version 2007-05-28
 */
public class TermExtractor {
	/** Maximum length of a term in tokens. */
	private static final int MAX_TERM_LENGTH = 4;
	
	/**
	 * Checks if the given term is among the named entities and returns the
	 * types of the entities that match it.
	 * 
	 * @param term a term, potentially a named entity
	 * @param nes named entities, indexed by tagger ID
	 * @return types of matching entities (may be empty, never <code>null</code>)
	 */
	private static String[] getNeTypes(String term, String[][] nes) {
		List<String> neTypes = new ArrayList<String>();
		Set<String> neTypesSet = new HashSet<String>();
		
		for (int neId = 0; neId < nes.length; neId++) {
			for (String ne : nes[neId]) {
				if (term.equals(ne)) {
					String neType = NETagger.getNeType(neId);
					// there may be multiple taggers (IDs) for one type,
					// so the set suppresses duplicate type names
					if (neTypesSet.add(neType)) neTypes.add(neType);
					break;  // done with this tagger, check the next one
				}
			}
		}
		
		return neTypes.toArray(new String[neTypes.size()]);
	}
	
	/**
	 * Extracts named entities from the given sentence.
	 * 
	 * @param sentence sentence to analyze
	 * @return named entities in the sentence, indexed by tagger ID
	 */
	public static String[][] getNes(String sentence) {
		String[] tokens = NETagger.tokenize(sentence);
		String[][] nes = NETagger.extractNes(new String[][] {tokens})[0];
		
		// untokenize named entities so they match the original sentence text
		for (int i = 0; i < nes.length; i++)
			for (int j = 0; j < nes[i].length; j++)
				nes[i][j] = OpenNLP.untokenize(nes[i][j], sentence);
		
		return nes;
	}
	
	/**
	 * Extracts named entities from the given sentence and context string.
	 * 
	 * @param sentence sentence to analyze
	 * @param context context string (may be <code>null</code> or empty)
	 * @return named entities in the sentence and context string
	 */
	public static String[][] getNes(String sentence, String context) {
		// extract NEs from sentence
		String[][] sentenceNes = getNes(sentence);
		if (context == null || context.length() == 0) return sentenceNes;
		
		// extract NEs from context string
		String[][] contextNes = getNes(context);
		
		// merge NEs per tagger ID, avoiding copies when one side is empty
		String[][] nes = new String[sentenceNes.length][];
		for (int i = 0; i < nes.length; i++) {
			if (sentenceNes[i].length == 0) {
				nes[i] = contextNes[i];
			} else if (contextNes[i].length == 0) {
				nes[i] = sentenceNes[i];
			} else {
				ArrayList<String> nesL = new ArrayList<String>();
				for (String ne : sentenceNes[i]) nesL.add(ne);
				for (String ne : contextNes[i]) nesL.add(ne);
				nes[i] = nesL.toArray(new String[nesL.size()]);
			}
		}
		
		return nes;
	}
	
	/**
	 * Extracts terms from the given sentence.
	 * 
	 * @param sentence sentence to analyze
	 * @param dicts dictionaries with compound terms
	 * @return terms in the sentence
	 */
	public static Term[] getTerms(String sentence, Dictionary[] dicts) {
		String[][] nes = getNes(sentence);
		
		return getTerms(sentence, nes, dicts);
	}
	
	/**
	 * Extracts terms from the given sentence, reusing named entities that have
	 * been extracted before.
	 * 
	 * @param sentence sentence to analyze
	 * @param nes named entities in the sentence
	 * @param dicts dictionaries with compound terms
	 * @return terms in the sentence
	 */
	public static Term[] getTerms(String sentence, String[][] nes,
			Dictionary[] dicts) {
		// extract tokens
		String[] tokens = OpenNLP.tokenize(sentence);
		// tag part of speech
		String[] pos = OpenNLP.tagPos(tokens);
		// tag phrase chunks
		String[] chunks = OpenNLP.tagChunks(tokens, pos);
		
		// tokens not yet assigned to a term
		// (boolean array elements default to false, no explicit fill needed)
		boolean[] assigned = new boolean[tokens.length];
		// for each token a term that starts at that token, or 'null'
		Term[] terms = new Term[tokens.length];
		// normalized terms (to identify duplicates)
		Set<String> termSet = new HashSet<String>();
		
		// construct multi-token terms, longest first, so that a shorter term
		// nested inside a longer one is suppressed
		for (int length = MAX_TERM_LENGTH; length > 1; length--) {
			for (int id = 0; id < tokens.length - length + 1; id++) {
				// one of the tokens is already assigned to a term?
				boolean skip = false;
				for (int offset = 0; offset < length; offset++) {
					if (assigned[id + offset]) {
						skip = true;
						break;  // FIX: was 'continue', which kept scanning
					}
				}
				if (skip) continue;
				
				// get phrase spanning the tokens
				StringBuilder phrase = new StringBuilder(tokens[id]);
				for (int offset = 1; offset < length; offset++)
					phrase.append(" ").append(tokens[id + offset]);
				String text = OpenNLP.untokenize(phrase.toString(), sentence);
				
				// phrase is a duplicate?
				if (!termSet.add(StringUtils.normalize(text))) continue;
				// phrase does not contain keywords?
				if (KeywordExtractor.getKeywords(text).length == 0) continue;
				
				// phrase is a named entity?
				String[] neTypes = getNeTypes(text, nes);
				if (neTypes.length > 0) {
					// construct term
					terms[id] = new Term(text, Term.COMPOUND, neTypes);
					
					// mark tokens as assigned
					for (int offset = 0; offset < length; offset++)
						assigned[id + offset] = true;
					
					continue;
				}
				
				// the following checks do not depend on the dictionary, so
				// they are performed once instead of once per dictionary
				// (hoisted out of the loop below)
				// phrase is not a noun phrase? (only noun phrases are looked
				// up; verb phrase lookup is deliberately disabled)
				if (!(chunks[id].endsWith("NP") &&
						chunks[id + length - 1].endsWith("NP")))
					continue;
				// phrase contains special characters other than '.'?
				if (text.matches(".*?[^\\w\\s\\.].*+")) continue;
				// phrase can be truncated?
				if (!text.equals(TruncationFilter.truncate(text))) continue;
				
				// phrase is in one of the dictionaries?
				for (Dictionary dict : dicts) {
					if (dict.contains(text)) {
						// construct term
						terms[id] = new Term(text, Term.COMPOUND);
						
						// mark tokens as assigned
						for (int offset = 0; offset < length; offset++)
							assigned[id + offset] = true;
						
						// FIX: was 'continue', which rebuilt the identical
						// term for every remaining dictionary
						break;
					}
				}
			}
		}
		
		// construct single-token terms
		for (int id = 0; id < tokens.length; id++) {
			// token is part of a multi-token term?
			if (assigned[id]) continue;
			// token is a duplicate?
			if (!termSet.add(StringUtils.normalize(tokens[id]))) continue;
			// token does not contain keywords?
			if (KeywordExtractor.getKeywords(tokens[id]).length == 0) continue;
			
			// get named entity types and construct term
			String[] neTypes = getNeTypes(tokens[id], nes);
			terms[id] = new Term(tokens[id], pos[id], neTypes);
		}
		
		// get ordered list of terms
		List<Term> termsL = new ArrayList<Term>();
		for (Term term : terms)
			if (term != null) termsL.add(term);
		
		return termsL.toArray(new Term[termsL.size()]);
	}
	
	/**
	 * Extracts terms from the given sentence and context string.
	 * 
	 * @param sentence sentence to analyze
	 * @param context context string (may be <code>null</code> or empty)
	 * @param nes named entities in the sentence and context string
	 * @param dicts dictionaries with compound terms
	 * @return terms in the sentence and context string
	 */
	public static Term[] getTerms(String sentence, String context,
			String[][] nes, Dictionary[] dicts) {
		// extract terms from sentence
		Term[] sentenceTerms = getTerms(sentence, nes, dicts);
		if (context == null || context.length() == 0) return sentenceTerms;
		
		// extract terms from context string
		Term[] contextTerms = getTerms(context, nes, dicts);
		if (sentenceTerms.length == 0) return contextTerms;
		if (contextTerms.length == 0) return sentenceTerms;
		
		// merge terms, eliminate duplicates (sentence terms take precedence)
		List<Term> terms = new ArrayList<Term>();
		Set<String> termSet = new HashSet<String>();
		for (Term sentenceTerm : sentenceTerms)
			if (termSet.add(StringUtils.normalize(sentenceTerm.getText())))
				terms.add(sentenceTerm);
		for (Term contextTerm : contextTerms)
			if (termSet.add(StringUtils.normalize(contextTerm.getText())))
				terms.add(contextTerm);
		
		return terms.toArray(new Term[terms.size()]);
	}
	
	/**
	 * Extracts single-token terms from the given sentence.
	 * 
	 * @param sentence sentence to analyze
	 * @return single-token terms in the sentence
	 */
	public static Term[] getSingleTokenTerms(String sentence) {
		// extract tokens
		String[] tokens = OpenNLP.tokenize(sentence);
		// tag part of speech
		String[] pos = OpenNLP.tagPos(tokens);
		
		// extracted terms
		ArrayList<Term> terms = new ArrayList<Term>();
		// normalized terms (to identify duplicates)
		Set<String> termSet = new HashSet<String>();
		
		// construct single-token terms
		for (int id = 0; id < tokens.length; id++) {
			// token is a duplicate?
			if (!termSet.add(StringUtils.normalize(tokens[id]))) continue;
			// token does not contain keywords?
			if (KeywordExtractor.getKeywords(tokens[id]).length == 0) continue;
			
			// construct term
			terms.add(new Term(tokens[id], pos[id]));
		}
		
		return terms.toArray(new Term[terms.size()]);
	}
}