package sensim;

import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.N;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.NN;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.NP;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.pig.data.*;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.jgrapht.alg.ConnectivityInspector;
import org.jgrapht.alg.KShortestPaths;
import org.jgrapht.graph.UndirectedSubgraph;
import org.xml.sax.SAXException;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.builtin.OutputSchema;
import org.apache.pig.tools.pigstats.PigStatusReporter;
import org.apache.uima.UIMAException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.jgrapht.GraphPath;
import org.jgrapht.Graphs;
import org.jgrapht.UndirectedGraph;
import org.jgrapht.alg.BronKerboschCliqueFinder;
import org.jgrapht.alg.FloydWarshallShortestPaths;
import org.jgrapht.graph.SimpleGraph;

import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import dima.UIMAXMLConverterHelper;

/**
 * Date: 4/10/13
 * Time: 3:35 PM
 *
 * @author Priska Herger, Johannes Kirschnick
 * <p/>
 * Description: A Pig script that takes a JCas as input and extracts
 * its annotations in terms of the shortest path between
 * all recognized entities.
 */
@OutputSchema("features:bag {datum:tuple (noun1:chararray, noun2:chararray, feature:chararray, sentence:chararray)}")
public class FeatureExtractor extends EvalFunc<DataBag> {

    // Reusable CAS instance; deserialize(...) overwrites its content on every exec() call.
    private final JCas jCas;
    private UIMAXMLConverterHelper uimaXMLConverterHelper;
    private BagFactory bagFactory = BagFactory.getInstance();
    private TupleFactory tupleFactory = TupleFactory.getInstance();
    // Maximum number of vertices in an extracted subtree / path (default 5).
    private int numMaxHops = 5;
    // Number of allowed skips when extracting subtrees (default 0; only used by getKShortestPathes).
    private int numSkipsInSubtree = 0;

    /** Which POS/NER annotation type counts as a "word of interest". */
    enum SelectionType {
        NOUN, COMMONNOUN, PROPERNOUN, NAMEDENTITY
    }

    private final Class<? extends Annotation> selectionType;

    /** Hadoop counter names emitted via PigStatusReporter. */
    enum Counters {
        FEATURES
    }

    /**
     * Default constructor: selects all nouns ({@code N}).
     *
     * @throws UIMAException if the JCas cannot be created
     */
    public FeatureExtractor() throws UIMAException {
        this("NOUN");
    }

    /**
     * @param selectionType one of {@link SelectionType} as a string (Pig passes UDF args as strings)
     * @throws UIMAException if the JCas cannot be created
     * @throws IllegalArgumentException if {@code selectionType} is not a valid enum constant
     */
    public FeatureExtractor(String selectionType) throws UIMAException {
        uimaXMLConverterHelper = new UIMAXMLConverterHelper(false);
        jCas = JCasFactory.createJCas();
        switch (SelectionType.valueOf(selectionType)) {
            case NOUN:
                this.selectionType = N.class;
                break;
            case COMMONNOUN:
                this.selectionType = NN.class;
                break;
            case PROPERNOUN:
                this.selectionType = NP.class;
                break;
            case NAMEDENTITY:
                this.selectionType = NamedEntity.class;
                break;
            default:
                // Unreachable: valueOf throws for unknown constants; kept so the final field
                // is definitely assigned on every path.
                this.selectionType = N.class;
        }
    }

    /**
     * @param selectionType     see {@link #FeatureExtractor(String)}
     * @param numMaxHops        maximum subtree/path size, parsed as int
     * @param numSkipsInSubtree allowed skips, parsed as int
     * @throws UIMAException         if the JCas cannot be created
     * @throws NumberFormatException if either numeric argument is not an int
     */
    public FeatureExtractor(String selectionType, String numMaxHops, String numSkipsInSubtree) throws UIMAException {
        this(selectionType);
        this.numMaxHops = Integer.parseInt(numMaxHops);
        this.numSkipsInSubtree = Integer.parseInt(numSkipsInSubtree);
    }

    /**
     * UDF entry point. Expects a 3-tuple (serialized CAS XML, noun1, noun2) and emits a bag of
     * (noun1, noun2, pattern, sentence) tuples built from all dependency subtrees connecting the pair.
     *
     * @param input (chararray CAS, chararray noun1, chararray noun2)
     * @return bag of feature tuples, or {@code null} on malformed input or when any sentence has
     *         fewer than two words of the selected type (see NOTE below)
     */
    @Override
    public DataBag exec(Tuple input) throws IOException {
        if (input == null || input.size() == 0 || input.get(0) == null) {
            return null;
        }
        DataBag patternBag = bagFactory.newDefaultBag();
        try {
            CharSequence charseq = (CharSequence) input.get(0);
            InputStream stream = IOUtils.toInputStream(charseq, Charsets.UTF_8.name());
            // note that jCas is changed in deserialize(...) and contains different data upon return!
            // design decision in favor of speed at the expense of readability
            uimaXMLConverterHelper.deserialize(stream, jCas);
            // for every sentence in jCas, do
            Iterator<Sentence> sentences = JCasUtil.iterator(jCas, Sentence.class);
            while (sentences.hasNext()) {
                Sentence sentence = sentences.next();
                // insert dependencies into a graph and extract shortest path
                UndirectedGraph<Token, DependencyEdge> graph = makeDependencyGraph(sentence);
                if (graph == null) {
                    // NOTE(review): this aborts the WHOLE document as soon as one sentence has fewer
                    // than two selected words — 'continue' may have been intended; confirm with callers.
                    return null;
                }
                if (input.size() == 3) {
                    ArrayList<String> nounPair = Lists.newArrayList();
                    nounPair.add((String) input.get(1));
                    nounPair.add((String) input.get(2));
                    patternBag.addAll(getAllSubtrees(sentence, nounPair, graph));
                } else {
                    return null;
                }
            }
            // emit some stats
            PigStatusReporter pigStatusReporter = PigStatusReporter.getInstance();
            if (pigStatusReporter != null) {
                pigStatusReporter.getCounter(Counters.FEATURES).increment(patternBag.size());
            }
        } catch (UIMAException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // FIX: restore the interrupt flag so the surrounding Hadoop task can observe cancellation.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        }
        return patternBag;
    }

    /**
     * Inserts all dependencies of an annotated sentence into a graph
     * with governor and dependent as vertices and dependency type as
     * edge.
     *
     * @param sentence the sentence whose dependency annotations to convert
     * @return UndirectedGraph of dependencies, or {@code null} if the sentence contains fewer
     *         than two words of the selected type
     */
    public UndirectedGraph<Token, DependencyEdge> makeDependencyGraph(Sentence sentence) {
        UndirectedGraph<Token, DependencyEdge> graph =
                new SimpleGraph<Token, DependencyEdge>(DependencyEdge.class);
        // skip sentences with less than two words of the selected type
        List<? extends Annotation> nouns = JCasUtil.selectCovered(selectionType, sentence);
        if (nouns.size() < 2) {
            return null;
        }
        List<Dependency> dependencies = JCasUtil.selectCovered(Dependency.class, sentence);
        for (Dependency dependency : dependencies) {
            Token governor = dependency.getGovernor();
            Token dependent = dependency.getDependent();
            String dependencyType = dependency.getDependencyType();
            // FIX: was bitwise '&'; use short-circuit '&&' for the null guard.
            if (governor != null && dependent != null) { //TODO: why would it be null !?
                graph.addVertex(governor);
                graph.addVertex(dependent);
                graph.addEdge(governor, dependent, new DependencyEdge(governor, dependent, dependencyType));
            }
        }
        return graph;
    }

    /**
     * Extract all subtrees involving the specified word pair up to a certain length as features.
     * Enumerates the power set of the sentence's tokens (hence the hard cap of 30 tokens), keeps
     * only connected subgraphs of size 3..numMaxHops containing both words, and emits each as a
     * space-joined, lower-cased pattern with the pair replaced by X and Y.
     *
     * @param sentence the sentence to extract from
     * @param wordPair the two lemmas to connect
     * @param graph    the sentence's dependency graph
     * @return dataBag of (noun1, noun2, pattern, sentence) tuples; empty when the pair is not found
     */
    public DataBag getAllSubtrees(Sentence sentence, ArrayList<String> wordPair,
                                  final UndirectedGraph<Token, DependencyEdge> graph) {
        DataBag dataBag = bagFactory.newDefaultBag();
        // make each word in the pair a Token
        List<Token> tokens = JCasUtil.selectCovered(Token.class, sentence);
        List<Token> pair = Lists.newArrayListWithExpectedSize(wordPair.size());
        for (Token token : tokens) {
            // matched by lemma rather than covered text
            if (token.getLemma().getValue().equals(wordPair.get(0))
                    || token.getLemma().getValue().equals(wordPair.get(1))) {
                pair.add(token);
            }
        }
        if (pair.size() != wordPair.size()) {
            return dataBag;
        }
        Token w1 = pair.get(0);
        Token w2 = pair.get(1);
        if (w1 == null || w2 == null) {
            return dataBag;
        }
        // get powerset of tokens
        Set<Token> tokenSet = new HashSet<Token>(tokens);
        // Beware: magic number! dictated by the powerSet function which only works for N <= 30
        if (tokenSet.size() > 30) {
            return dataBag;
        }
        Set<Set<Token>> tokenPowerSet = Sets.powerSet(tokenSet);
        // make a subgraph for each set in the powerset of tokens
        for (Set<Token> tSet : tokenPowerSet) {
            try {
                // if tSet contains more than specified max number of tokens, continue
                if (tSet.size() < 3 || tSet.size() > this.numMaxHops) {
                    continue;
                }
                // if tSet doesn't contain w1 _and_ w2 continue
                if (!tSet.contains(w1) || !tSet.contains(w2)) {
                    continue;
                }
                // make subgraph
                UndirectedSubgraph<Token, DependencyEdge> subgraph =
                        new UndirectedSubgraph<Token, DependencyEdge>(graph, tSet, graph.edgeSet());
                // if the subgraph is not connected, dismiss this set
                ConnectivityInspector<Token, DependencyEdge> inspector =
                        new ConnectivityInspector<Token, DependencyEdge>(subgraph);
                if (!inspector.isGraphConnected()) {
                    continue;
                }
                Set<DependencyEdge> edgeSet = subgraph.edgeSet();
                Set<Token> vertexSet = Sets.newHashSet();
                // subgraph.vertexSet() is unmodifiable but sometimes an 'and' Token will be added manually,
                // hence the following copy action
                for (Token tkn : subgraph.vertexSet()) {
                    vertexSet.add(tkn);
                }
                // if edgeSet contains the dependency 'conj', manually add the 'and' Token to vertexSet
                for (DependencyEdge edge : edgeSet) {
                    if (edge.dependency.equals("conj")) {
                        List<Token> tmpTokens = JCasUtil.selectBetween(jCas, Token.class, edge.from, edge.to);
                        for (Token tkn : tmpTokens) {
                            if (tkn.getCoveredText().equals("and")) {
                                vertexSet.add(tkn);
                            }
                        }
                    }
                }
                if (vertexSet.size() < 3) {
                    continue;
                }
                // if subgraph contains an extra noun or a full stop, dismiss it
                boolean skipThisSet = false;
                for (Token vertex : vertexSet) {
                    // if this set contains nouns other than w1 and w2, skip it
                    if (vertex.getPos().getPosValue().startsWith("N") && vertex != w1 && vertex != w2) {
                        skipThisSet = true;
                    }
                    // if any token is a fullstop, skip the set ( because the same pattern will appear without )
                    //TODO: would suffice to check on the last token
                    if (vertex.getCoveredText().equals(".")) {
                        skipThisSet = true;
                    }
                }
                if (skipThisSet) {
                    continue;
                }
                // generate patterns
                // join all vertices in correct (linear) positional order
                SortedMap<Integer, Token> vertexMap = new TreeMap<Integer, Token>();
                for (Token vertex : vertexSet) {
                    vertexMap.put(vertex.getBegin(), vertex);
                }
                List<Token> vertexList = Lists.newArrayListWithExpectedSize(vertexSet.size());
                int i = 0;
                for (Integer pos : vertexMap.keySet()) {
                    // Note: is sorted ascendingly by key set
                    vertexList.add(i, vertexMap.get(pos));
                    i += 1;
                }
                // join lower-cased covered text on space
                String pattern = Joiner.on(" ").join(Iterables.transform(vertexList, new Function<Token, String>() {
                    @Override
                    public String apply(Token token) {
                        return token.getCoveredText().replaceAll("\n", " ").toLowerCase();
                    }
                }));
                // use regular expressions to include word boundaries in match; important in particular with very
                // short w1 and w2, e.g. 's'
                // FIX: quote the covered text — raw tokens may contain regex metacharacters
                // ('(', '+', '.', ...) which previously threw PatternSyntaxException or mis-matched.
                Pattern regex1 = Pattern.compile("\\b" + Pattern.quote(w1.getCoveredText().toLowerCase()) + "\\b");
                Pattern regex2 = Pattern.compile("\\b" + Pattern.quote(w2.getCoveredText().toLowerCase()) + "\\b");
                Matcher matcher1 = regex1.matcher(pattern);
                int idx1 = matcher1.find() ? matcher1.start() : -1;
                Matcher matcher2 = regex2.matcher(pattern);
                int idx2 = matcher2.find() ? matcher2.start() : -1;
                // figure out their order here as well, s. shortest path method
                // NOTE(review): when a match fails (idx == -1) the order defaults to w2-first for idx2 == -1
                // and w1-first for idx1 == -1; confirm this is intended.
                boolean assumedOrder = idx1 < idx2;
                // replace w1 with X and w2 with Y
                Matcher matcher = regex1.matcher(pattern);
                pattern = matcher.replaceAll("X");
                matcher = regex2.matcher(pattern);
                pattern = matcher.replaceAll("Y");
                //TODO: generate patterns with skips
                Tuple tuple = tupleFactory.newTuple(4);
                // return nouns in order observed in sentence rather than incoming order
                tuple.set(0, (assumedOrder ? pair.get(0).getLemma().getValue() : pair.get(1).getLemma().getValue()));
                tuple.set(1, (assumedOrder ? pair.get(1).getLemma().getValue() : pair.get(0).getLemma().getValue()));
                tuple.set(2, pattern.trim());
                tuple.set(3, sentence.getCoveredText());
                dataBag.add(tuple);
            } catch (ExecException e) {
                e.printStackTrace();
            }
        }
        return dataBag;
    }

    /**
     * Extract the shortest path along the dependency parse
     * between the two provided nouns.
     *
     * @param nounPair the two lemmas to connect
     * @param graph    the dependency graph to search
     * @return DataBag dataBag a data bag of ( entity_pair, pattern ) tuples
     *         with the shortest path as pattern
     */
    public DataBag getShortestPath(ArrayList<String> nounPair,
                                   final UndirectedGraph<Token, DependencyEdge> graph) throws ExecException {
        DataBag dataBag = bagFactory.newDefaultBag();
        ArrayList<Token> tokens = Lists.newArrayList(JCasUtil.select(jCas, Token.class));
        List<Token> pair = Lists.newArrayListWithExpectedSize(nounPair.size());
        for (Token token : tokens) {
            //TODO: FIXME: matching by use of strings is unreliable, e.g., wrt multiple occurrence
            if (token.getLemma().getValue().equals(nounPair.get(0))
                    || token.getLemma().getValue().equals(nounPair.get(1))) {
                pair.add(token);
            }
        }
        if (pair.size() != nounPair.size()) {
            return dataBag;
        }
        // FIX: parameterized the raw type and dropped the always-false '== null' check after 'new'.
        FloydWarshallShortestPaths<Token, DependencyEdge> shortestPaths =
                new FloydWarshallShortestPaths<Token, DependencyEdge>(graph);
        // extract the shortest path between the two nouns
        int shortestPathsCount = shortestPaths.getShortestPathsCount();
        Token n1 = pair.get(0);
        Token n2 = pair.get(1);
        try {
            // if e1 or e2 is NULL we might not have information about the dependencies in the sentence
            // which contains the NER
            // this is just an error as we are searching for links in the whole document, not just sentence wise
            if (n1 == null || n2 == null) {
                return dataBag;
            }
            GraphPath<Token, DependencyEdge> shortestPath = shortestPaths.getShortestPath(n1, n2);
            if ((shortestPathsCount == 0) || (shortestPath == null)) {
                return dataBag;
            }
            // retrieve vertices on the shortest path
            List<Token> vertexPath = Graphs.getPathVertexList(shortestPath);
            // the first and last entry are the start and end vertex -> skip them
            List<Token> vertices = vertexPath.subList(1, vertexPath.size() - 1);
            String vertexLabels = Joiner.on(" ").join(Iterables.transform(vertices, new Function<Token, String>() {
                @Override
                public String apply(Token token) {
                    return token.getCoveredText().replaceAll("\n", " ").toLowerCase();
                }
            }));
            // retrieve edges / dependencies on the shortest path (currently only used in the
            // commented-out "[...]" pattern suffix below)
            List<DependencyEdge> edges = shortestPath.getEdgeList();
            String edgeLabels = Joiner.on(",").join(Iterables.transform(edges, new Function<DependencyEdge, String>() {
                @Override
                public String apply(DependencyEdge edge) {
                    return edge.dependency;
                }
            }));
            // attempt to include preceding prepositions and ensuing ccs
            Iterator<Sentence> sentences = JCasUtil.iterator(jCas, Sentence.class);
            String prepender = "";
            String postpender = "";
            while (sentences.hasNext()) {
                Sentence sentence = sentences.next();
                List<Dependency> dependencies = JCasUtil.selectCovered(Dependency.class, sentence);
                for (Dependency dependency : dependencies) {
                    Token governor = dependency.getGovernor();
                    Token dependent = dependency.getDependent();
                    String dependencyType = dependency.getDependencyType();
                    // include preceding pobj ( hopefully prepositions / IN mostly )
                    if (dependencyType.equals("pobj") && dependent.equals(n1)) {
                        // if dependencyType is in shortest path already, ignore it here
                        if (!shortestPath.getEdgeList().contains(
                                new DependencyEdge(governor, dependent, dependencyType))) {
                            prepender = governor.getCoveredText() + " ";
                        }
                    }
                    // include actual cc string in case of conj dependency
                    if (dependencyType.equals("cc") && governor.equals(n1)) {
                        postpender = dependent.getCoveredText();
                    }
                }
            }
            // skip patterns where both vertexLabels and postpender are empty
            if (vertexLabels.isEmpty() && postpender.isEmpty()) {
                return dataBag;
            } else if (!vertexLabels.isEmpty() && !postpender.isEmpty()) {
                postpender += " ";
            }
            // concatenate pattern parts
            final String pattern;
            final boolean normalOrder;
            // now we need to find the order of the entities
            if (pair.get(0).getBegin() < pair.get(1).getBegin()) {
                // X -> Y
                pattern = prepender + "X " + postpender + vertexLabels + " Y"; //[" + edgeLabels + "]";
                normalOrder = true;
            } else {
                // Y -> X
                //TODO: add pre- and postpenders here, too ?
                pattern = "Y " + vertexLabels + " X"; //[" + edgeLabels + "]";
                normalOrder = false;
            }
            Tuple tuple = tupleFactory.newTuple(4);
            // output: first noun \t second noun \t pattern \t original sentence
            tuple.set(0, (normalOrder ? pair.get(0) : pair.get(1)).getLemma().getValue());
            tuple.set(1, (normalOrder ? pair.get(1) : pair.get(0)).getLemma().getValue());
            tuple.set(2, pattern);
            String coveredText = JCasUtil.selectCovering(jCas, Sentence.class, n1.getBegin(),
                    n1.getEnd()).get(0).getCoveredText();
            tuple.set(3, coveredText);
            dataBag.add(tuple);
        } catch (ArrayIndexOutOfBoundsException e) {
            // NOTE(review): catching runtime exceptions here papers over empty selectCovering results
            // and missing lemmas — consider explicit guards instead.
            e.printStackTrace();
        } catch (NullPointerException e) {
            e.printStackTrace();
        }
        return dataBag;
    }

    /**
     * Extract the k shortest paths between two given words along the dependency parse of a sentence.
     * Currently unfinished: only validates the word pair and constructs the path finder.
     *
     * @param nounPair the two lemmas to connect
     * @param graph    the dependency graph to search
     * @return DataBag a data bag of tuples of length 4 containing the word pair, the extracted pattern,
     *         and the underlying sentence (currently always empty).
     * @throws ExecException on tuple access errors
     */
    public DataBag getKShortestPathes(ArrayList<String> nounPair,
                                      final UndirectedGraph<Token, DependencyEdge> graph) throws ExecException {
        DataBag dataBag = bagFactory.newDefaultBag();
        // get tokens for each of the two words
        ArrayList<Token> tokens = Lists.newArrayList(JCasUtil.select(jCas, Token.class));
        List<Token> pair = Lists.newArrayListWithExpectedSize(nounPair.size());
        for (Token token : tokens) {
            if (token.getLemma().getValue().equals(nounPair.get(0))
                    || token.getLemma().getValue().equals(nounPair.get(1))) {
                pair.add(token);
            }
        }
        if (pair.size() != nounPair.size()) {
            return dataBag;
        }
        // for each word get its k shortest pathes to other words
        // FIX: parameterized the raw type and dropped the always-false '== null' check after 'new'.
        // NOTE(review): numSkipsInSubtree is passed as 'k' here — looks suspicious, confirm intent.
        KShortestPaths<Token, DependencyEdge> kShortestPaths =
                new KShortestPaths<Token, DependencyEdge>(graph, pair.get(0), this.numSkipsInSubtree, this.numMaxHops);
        Token w1 = pair.get(0);
        Token w2 = pair.get(1);
        if (w1 == null || w2 == null) {
            return dataBag;
        }
        //TODO: finish writing this feature variant
        return dataBag;
    }

    /**
     * Find all cliques in the given graph — well, untractable and such.
     * Each maximal clique spanning both words is rendered as a lexical pattern plus, for short
     * cliques, abstracted variants (&lt;noun&gt;/&lt;adj&gt; placeholders).
     *
     * @param wordPair the two lemmas that must both appear in a clique's covered span
     * @param graph    a graph
     * @return DataBag dataBag a data bag of tuples
     * @throws ExecException on tuple access errors
     */
    public DataBag findAllMaximalCliques(ArrayList<String> wordPair,
                                         final UndirectedGraph<Token, DependencyEdge> graph) throws ExecException {
        DataBag dataBag = bagFactory.newDefaultBag();
        // get tokens for each of the two words and check if they're contained in the sentence
        ArrayList<Token> tokens = Lists.newArrayList(JCasUtil.select(jCas, Token.class));
        List<Token> pair = Lists.newArrayListWithExpectedSize(wordPair.size());
        for (Token token : tokens) {
            if (token.getLemma().getValue().equals(wordPair.get(0))
                    || token.getLemma().getValue().equals(wordPair.get(1))) {
                pair.add(token);
            }
        }
        if (pair.size() != wordPair.size()) {
            return dataBag;
        }
        // FIX: parameterized the raw BronKerboschCliqueFinder / Collection / Iterator types.
        BronKerboschCliqueFinder<Token, DependencyEdge> cliqueFinder =
                new BronKerboschCliqueFinder<Token, DependencyEdge>(graph);
        Collection<Set<Token>> cliques = cliqueFinder.getAllMaximalCliques();
        if (cliques == null || cliques.size() == 0) {
            return dataBag;
        }
        Token w1 = pair.get(0);
        Token w2 = pair.get(1);
        for (Set<Token> hset : cliques) {
            // NOTE(review): assumes every clique has >= 2 vertices and that set iteration order yields
            // source before target positionally — confirm; a 1-element clique would throw here.
            Iterator<Token> it2 = hset.iterator();
            Token source = it2.next();
            Token target = it2.next();
            // FIX: use the List interface instead of an unchecked cast to ArrayList.
            List<Token> coveredTokens = JCasUtil.selectCovered(jCas, Token.class, source.getBegin(), target.getEnd());
            ArrayList<String> pattern = Lists.newArrayList();
            ArrayList<String> pattern_variant = Lists.newArrayList();
            boolean foundW1 = false;
            boolean foundW2 = false;
            for (Token token : coveredTokens) {
                // check if both words are contained
                if (!foundW1 && token.equals(w1)) {
                    foundW1 = true;
                    pattern.add("X");
                    pattern_variant.add("X");
                } else if (!foundW2 && token.equals(w2)) {
                    foundW2 = true;
                    pattern.add("Y");
                    pattern_variant.add("Y");
                } else if (token.getPos().getPosValue().equals("JJ")) {
                    pattern.add(token.getCoveredText().toLowerCase());
                    pattern_variant.add("<adj>");
                } else if (token.getPos().getPosValue().startsWith("N")) {
                    pattern.add(token.getCoveredText().toLowerCase());
                    pattern_variant.add("<noun>");
                } else {
                    pattern.add(token.getCoveredText().toLowerCase());
                    pattern_variant.add(token.getCoveredText().toLowerCase());
                }
            }
            if (pattern.size() == 0 || !(foundW1 && foundW2)) {
                // NOTE(review): this aborts the remaining cliques instead of skipping this one
                // ('continue' may have been intended) — kept as-is to preserve behavior.
                return dataBag;
            }
            Tuple tuple = tupleFactory.newTuple(4);
            tuple.set(0, w1.getLemma().getValue());
            tuple.set(1, w2.getLemma().getValue());
            tuple.set(2, StringUtils.join(pattern, " "));
            String sentence = JCasUtil.selectCovering(jCas, Sentence.class, source.getBegin(),
                    target.getEnd()).get(0).getCoveredText();
            tuple.set(3, sentence);
            dataBag.add(tuple);
            // add a more abstracted version of the pattern, too, but only for short patterns, else long patterns get
            // unnecessary frequencies from abstraction
            if (coveredTokens.size() < 7) {
                Tuple tuple2 = tupleFactory.newTuple(4);
                tuple2.set(0, w1.getLemma().getValue());
                tuple2.set(1, w2.getLemma().getValue());
                tuple2.set(2, StringUtils.join(pattern_variant, " "));
                tuple2.set(3, sentence);
                dataBag.add(tuple2);
            }
            // add a another abstracted version of the pattern
            if (coveredTokens.size() < 10) {
                Tuple tuple3 = tupleFactory.newTuple(4);
                tuple3.set(0, w1.getLemma().getValue());
                tuple3.set(1, w2.getLemma().getValue());
                tuple3.set(2, StringUtils.join(pattern_variant, " ").replace("<noun>", "").replace("<adj>", ""));
                tuple3.set(3, sentence);
                dataBag.add(tuple3);
            }
        }
        return dataBag;
    }

    /**
     * Extract the head of a named entity - for now pretend the head
     * is the token with the highest degree in the dependency graph.
     *
     * @param entity the annotation (named entity, compound noun) whose head to get
     * @param graph  the dependency graph
     * @return the head token, or {@code null} if no covered token is a vertex of the graph
     */
    public Token getEntityHead(Annotation entity, UndirectedGraph<Token, DependencyEdge> graph) {
        // get all tokens
        List<Token> tokens = JCasUtil.selectCovered(Token.class, entity);
        Token head = null;
        int degree = -1;
        for (Token token : tokens) {
            if (graph.containsVertex(token)) {
                int tmp = graph.degreeOf(token);
                // '>=' keeps the LAST token among ties (rightmost head heuristic)
                if (tmp >= degree) {
                    degree = tmp;
                    head = token;
                }
            }
        }
        return head;
    }

    /**
     * Undirected dependency edge between two tokens, labelled with the dependency type.
     * Value semantics (equals/hashCode over all three fields) are relied on by
     * {@code getEdgeList().contains(...)} in getShortestPath.
     */
    private static class DependencyEdge {
        // FIX: fields made final — instances are used as hash-based collection elements,
        // so mutation after construction would corrupt those collections.
        private final Token from;
        private final Token to;
        private final String dependency;

        private DependencyEdge(Token from, Token to, String dependency) {
            this.from = from;
            this.to = to;
            this.dependency = dependency;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            DependencyEdge that = (DependencyEdge) o;
            if (dependency != null ? !dependency.equals(that.dependency) : that.dependency != null) return false;
            if (from != null ? !from.equals(that.from) : that.from != null) return false;
            if (to != null ? !to.equals(that.to) : that.to != null) return false;
            return true;
        }

        @Override
        public int hashCode() {
            int result = from != null ? from.hashCode() : 0;
            result = 31 * result + (to != null ? to.hashCode() : 0);
            result = 31 * result + (dependency != null ? dependency.hashCode() : 0);
            return result;
        }
    }
}