SKOSTaggerImpl.java example

Explorer

hive-mrc-master
- doc
  - sampleCode
- hive-core
  - src
  - test
    - edu
      - unc
        ils
        mrc
        hive
        api
        SKOSSchemeTest.java
        SearcherTest.java
        TaggerTest.java
        ir
        lucene
        search
        AutocompleteTest
        AutocompleteTest.java
        tagging
        KEATaggerTest.java
        util
        SimpleCrawlerTest.java
        hive2
        api
        impl
        test
        HiveH2IndexImplTest.java
        HiveLuceneIndexImplTest.java
        HiveVocabularyImplTest.java
- hive-rs
  - src
    - org
      - unc
        hive
        services
        rs
        ConceptsResource.java
        ConfigurationListener.java
        SchemesResource.java
  - test
    - org
      - unc
        hive
        services
        rs
        ConceptsResourceTest.java
        FileIO.java
        SchemesResourceTest.java
- hive-web
  - src
    - org
      - unc
        hive
        client
        ClosablePanel.java
        ConceptBrowser.java
        ConceptBrowserService.java
        ConceptBrowserServiceAsync.java
        ConceptLink.java
        ConceptProxy.java
        HIVEMessages.java
        HomePage.java
        Indexer.java
        IndexerService.java
        IndexerServiceAsync.java
        RecordFormatter.java
        TestVis.java
        server
        ConceptBrowserServiceImpl.java
        FileUpload.java
        IndexerServiceImpl.java
        VocabularyService.java
        services
        ConceptListResource.java
        Main.java
        SKOSResourceApplication.java
        servlet
        AutocompleteServlet.java
        TermSuggestionServlet.java
        sync
        SyncJob.java

/**
 * Copyright (c) 2010, UNC-Chapel Hill and Nescent

All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided 
that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice, this list of conditions and 
 * the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the 
 * following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the UNC-Chapel Hill or Nescent nor the names of its contributors may be used to endorse or promote 
 * products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, 
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

@author Jose R. Perez-Aguera
 */

package edu.unc.ils.mrc.hive.api.impl.elmo;

import java.io.BufferedReader;


import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.UUID;

import org.apache.commons.configuration.Configuration;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.perf4j.StopWatch;
import org.perf4j.log4j.Log4JStopWatch;

import edu.unc.ils.mrc.hive.api.ConceptNode;
import edu.unc.ils.mrc.hive.api.ConceptTreeBuilder;
import edu.unc.ils.mrc.hive.api.SKOSConcept;
import edu.unc.ils.mrc.hive.api.SKOSScheme;
import edu.unc.ils.mrc.hive.api.SKOSSearcher;
import edu.unc.ils.mrc.hive.api.SKOSTagger;
import edu.unc.ils.mrc.hive.ir.tagging.Tagger;
import edu.unc.ils.mrc.hive.ir.tagging.TaggerFactory;
import edu.unc.ils.mrc.hive.util.TextManager;

/**
 * This class implements the SKOSTagger interface, supporting 
 * automatic subject term extraction from one or more 
 * thesauri.
 */
public class SKOSTaggerImpl implements SKOSTagger 
{
    private static final Log logger = LogFactory.getLog(SKOSTaggerImpl.class);
	
	private static final int LIMIT = 10;

	private TreeMap<String, Tagger> taggers;
	private TreeMap<String, SKOSScheme> vocabularies;
	private String algorithm;
	private Configuration config;

	/**
	 * Constructs a tagger based on the specified vocabularies
	 * and algorithm.
	 * 
	 * @param vocabularies	Vocabularies to be used for term extraction
	 * @param algorithm		Algorithm to be used for term extraction
	 */
	public SKOSTaggerImpl(TreeMap<String, SKOSScheme> vocabularies,
			String algorithm) 
	{
		this.algorithm = algorithm;
		this.vocabularies = vocabularies;
		this.taggers = new TreeMap<String, Tagger>();
		Set<String> set = vocabularies.keySet();
		Iterator<String> it = set.iterator();
				
		if (this.algorithm.equals("kea")) {
			while (it.hasNext()) {
				String vocName = it.next();
				SKOSScheme schema = vocabularies.get(vocName);
				TaggerFactory.selectTagger(TaggerFactory.KEATAGGER);
				Tagger tagger = TaggerFactory.getTagger(schema
						.getKEAtestSetDir(), schema.getKEAModelPath(), schema
						.getStopwordsPath(), schema);
				this.taggers.put(vocName, tagger);
			}
		} 
		else if (this.algorithm.equals("maui")) {
			while (it.hasNext()) {
				String vocName = it.next();
				SKOSScheme schema = vocabularies.get(vocName);
				TaggerFactory.selectTagger(TaggerFactory.MAUITAGGER);
				Tagger tagger = TaggerFactory.getTagger(schema
						.getKEAtestSetDir(), schema.getMauiModelPath(), schema
						.getStopwordsPath(), schema);
				this.taggers.put(vocName, tagger);
			}
		}
		else if (this.algorithm.equals("dummy")) {
			SKOSScheme schema = vocabularies.get(vocabularies.firstKey());
			TaggerFactory.selectTagger(TaggerFactory.DUMMYTAGGER);
			Tagger tagger = TaggerFactory.getTagger("", schema
					.getLingpipeModel(), "", null);
			this.taggers.put("Dummytagger", tagger);
		} else {
		    logger.fatal(this.algorithm + " algorithm is not supported");
		}
		logger.debug("NUMBER OF TAGGERS: " + this.taggers.size());
		for (Tagger tag : this.taggers.values()) {
		    logger.info("Tagger: " + tag.getVocabulary());
		}
	}

	/**
	 * Returns a list of SKOSConcept objects for the specified URL
	 * using the specified vocabularies and SKOSSearcher implementation. 
	 * The maximum number of hops indicates the number of levels of links
	 * to be crawled/traversed when indexing the site.
	 * 
	 * This method uses the TextManager utility to extract text from the 
	 * URL.
	 * 
	 * @param url			URL of desired web site
	 * @param vocabularies  List of vocabularies
	 * @param searcher		Searcher implementation
	 * @param maxHops		Maximum number of links to be traversed (hops)
	 * @param numTerms		Number of terms to be returned
	 * @return
	 */
	public List<SKOSConcept> getTags(URL url, List<String> vocabulary, 
			SKOSSearcher searcher, int maxHops, int numTerms, boolean diff, int minOccur)
	{
		try
		{
			String proxyHost = config.getString("hive.http.proxyHost", null);
			int proxyPort = config.getInt("hive.http.proxyPort", -1);
			String[] ignorePrefixes = config.getStringArray("hive.ignorePrefix");
			TextManager tm = new TextManager();
			tm.setProxy(proxyHost, proxyPort);
			tm.setIgnorePrefixes(ignorePrefixes);
			String text = tm.getPlainText(url, maxHops, diff);
			return getTagsInternal(text, vocabulary, searcher, numTerms, minOccur);
		} catch (Exception e) {
			logger.error(e);
		}
		return null; 			
	}
	
	/**
	 * Returns a list of SKOSConcept objects for the specified file
	 * using the specified vocabularies and SKOSSearcher implementation.
	 * 
	 * @param path			Path to the file
	 * @param vocabularies	List of vocabularies
	 * @param searcher		Searcher implementation
	 * @param numTerms		Number of terms to be returned
	 * @return
	 */
	public List<SKOSConcept> getTags(String filePath, List<String> vocabularies, 
			SKOSSearcher searcher, int numTerms, int minOccur) 
	{
		TextManager tm = new TextManager();
		String text = tm.getPlainText(filePath);
				
		return getTagsInternal(text, vocabularies, searcher, numTerms, minOccur);          
	}
	
	
	@Override
	public List<SKOSConcept> getTagsFromText(String text,
			List<String> vocabularies, SKOSSearcher searcher, 
			int maxTerms, int minOccur) {
		return getTagsInternal(text, vocabularies, searcher, maxTerms, minOccur);
	}
	
	@Override
	public List<ConceptNode> getTagsAsTree(String text, List<String> vocabularies,
			SKOSSearcher searcher, int maxTerms, int minOccur) 
	{
		List<SKOSConcept> concepts =  getTagsInternal(text, vocabularies, searcher, maxTerms, minOccur);
		ConceptTreeBuilder tree = new ConceptTreeBuilder();
		for (SKOSConcept concept: concepts) {
			tree.add(concept, searcher);
		}
		return tree.getTree();
	}	
	
	
	/**
	 * Returns a list of SKOSConcept objects for the specified text
	 * using the specified vocabularies and SKOSSearcher implementation.
	 * 
	 * @param text			Full-text of document
	 * @param vocabularies	List of vocabularies
	 * @param searcher		Searcher implementation
	 * @param numTerms		Number of terms to be returned
	 * @param minOccur		Minimum number of phrase occurrences
	 * @return
	 */
	private List<SKOSConcept> getTagsInternal(String text, List<String> vocabularies, 
			SKOSSearcher searcher, int numTerms, int minOccur)
	{
		StopWatch stopwatch = new Log4JStopWatch();

		List<SKOSConcept> result = new ArrayList<SKOSConcept>();
		stopwatch.lap("GetPlainText");
		
		if (this.algorithm.equals("kea")) 
		{
			for (String voc : vocabularies) 
			{
				File testDir = new File(this.vocabularies.get(voc).getKEAtestSetDir());
				
				String fileId = UUID.randomUUID().toString();
				
				String tempFileName = fileId;
				File keaInputFile =  new File(testDir + File.separator + tempFileName + ".txt");
				
				logger.debug("Creating " + keaInputFile.getAbsolutePath());
				FileOutputStream fos;
				try {
					fos = new FileOutputStream(keaInputFile);
					PrintWriter pr = new PrintWriter(fos);
					pr.print(text);
					pr.close();
					fos.close();
				} catch (FileNotFoundException e) {
				    logger.error(e);
				} catch (IOException e) {
                    logger.error(e);
				}
				Tagger tagger = this.taggers.get(voc);
				String vocabularyName = tagger.getVocabulary();
				logger.info("Indexing with " + vocabularyName);
				try {
					tagger.extractKeyphrasesFromFile(tempFileName, numTerms, minOccur);
				} catch (RuntimeException e) {
					logger.error(e);
				}
				
				File keaOutputFile =  new File(testDir + File.separator + tempFileName + ".key");
				logger.debug("Reading key file " + keaOutputFile.getAbsolutePath());
				try {
					FileInputStream fis = new FileInputStream(keaOutputFile);
					InputStreamReader isr = new InputStreamReader(fis);
					BufferedReader br = new BufferedReader(isr);
					String line = br.readLine();
					while (line != null) {
						String[] elements = line.split("\t");
						String uri = elements[1];
						String[] uri_elements = uri.split("#");
						SKOSConcept concept = searcher.searchConceptByURI(
								uri_elements[0] + "#", uri_elements[1]);
						concept.setScore(new Double(elements[2]));
						result.add(concept);
						line = br.readLine();
					}
					br.close();
					isr.close();
					fis.close();
				} catch (FileNotFoundException e) {
					logger.error("unable to find file", e);
				} catch (IOException e) {
				    logger.error("file processing problem", e);
				}
				
				// If we do not delete these files, they are re-read during subsequent
				// extractKeyphrases and cause performance degradation.
				logger.debug("Deleting "+ keaInputFile.getAbsolutePath());
				//keaInputFile.delete();
				logger.debug("Deleting "+ keaOutputFile.getAbsolutePath());
				//keaOutputFile.delete();
			}

		} 
		else if (this.algorithm.equals("maui")) {

			for (String voc : vocabularies) 
			{
				File testDir = new File(this.vocabularies.get(voc).getKEAtestSetDir());
				String fileId = UUID.randomUUID().toString();
				String tempFileName = fileId;
				File keaInputFile =  new File(testDir + File.separator + tempFileName + ".txt");
				
				logger.debug("Creating " + keaInputFile.getAbsolutePath());
				FileOutputStream fos;
				try {
					fos = new FileOutputStream(keaInputFile);
					PrintWriter pr = new PrintWriter(fos);
					pr.print(text);
					pr.close();
					fos.close();
				} catch (FileNotFoundException e) {
				    logger.error(e);
				} catch (IOException e) {
                    logger.error(e);
				}
				Tagger tagger = this.taggers.get(voc);
				String vocabularyName = tagger.getVocabulary();
				logger.info("Indexing with " + vocabularyName);
				try {
					tagger.extractKeyphrasesFromFile(tempFileName, numTerms, minOccur);
				} catch (RuntimeException e) {
					logger.error(e);
				}
				
				File keaOutputFile =  new File(testDir + File.separator + tempFileName + ".key");
				logger.debug("Reading key file " + keaOutputFile.getAbsolutePath());
				try {
					FileInputStream fis = new FileInputStream(keaOutputFile);
					InputStreamReader isr = new InputStreamReader(fis);
					BufferedReader br = new BufferedReader(isr);
					String line = br.readLine();
					while (line != null) {
						String[] elements = line.split("\t");
						String uri = elements[1];
						String[] uri_elements = uri.split("#");
						SKOSConcept concept = searcher.searchConceptByURI(
								uri_elements[0] + "#", uri_elements[1]);
						concept.setScore(new Double(elements[2]));
						result.add(concept);
						line = br.readLine();
						
						/*
						List<SKOSConcept> concepts = searcher
								.searchConceptByKeyword(concept);
						if (concepts.size() > 0) {
						    concepts.get(0).setScore(new Double(elements[2]));
					        result.add(concepts.get(0));
					        //logger.debug("concept QName = " + concepts.get(0).getQName());
						}
				        */
					}
					br.close();
					isr.close();
					fis.close();
				} catch (FileNotFoundException e) {
					logger.error("unable to find file", e);
				} catch (IOException e) {
				    logger.error("file processing problem", e);
				}
				
				// If we do not delete these files, they are re-read during subsequent
				// extractKeyphrases and cause performance degradation.
				logger.debug("Deleting "+ keaInputFile.getAbsolutePath());
				//keaInputFile.delete();
				logger.debug("Deleting "+ keaOutputFile.getAbsolutePath());
				//keaOutputFile.delete();
			}
		}
		else if (this.algorithm.equals("dummy")) {
			Tagger tagger = this.taggers.get("Dummytagger");
			logger.info("Dummy indexing with " + tagger.getVocabulary());
			logger.debug("extracting keyphrases");
			List<String> keywords = tagger.extractKeyphrases(text);
			logger.info("Number of keyphrases: " + keywords.size());
			int limit = numTerms;
			if (limit > keywords.size()) {
				limit = keywords.size();
			}
			logger.debug("searching for keyphrases in index");
			for (int i = 0; i < limit; i++) {
				List<SKOSConcept> concepts = searcher
						.searchConceptByKeyword(keywords.get(i));
				if (concepts.size() > 0)
					result.add(concepts.get(0));
				if (concepts.size() > 1)
					result.add(concepts.get(1));
				if (concepts.size() > 2)
					result.add(concepts.get(2));
			}
			logger.debug("tagging complete");
		}
		stopwatch.lap("GetTags");


		return result;
	}
	
	public void setConfig(Configuration config)
	{
		this.config = config;
	}
}