AnnotatorManager.java example

Explorer

tml-master
- tml
  - src
    - main
      - java
        SemanticSpace.java
        TmlCommandLine.java
        tml
        Configuration.java
        annotators
        AbstractAnnotator.java
        Annotator.java
        AnnotatorManager.java
        PennTreeAnnotator.java
        corpus
        Corpus.java
        CorpusParameters.java
        Dictionary.java
        ParagraphCorpus.java
        RepositoryCorpus.java
        SearchResultsCorpus.java
        SentenceCorpus.java
        SimpleCorpus.java
        Term.java
        TextDocument.java
        TextPassage.java
        sql
        DbConnection.java
        storage
        DocumentAnnotator.java
        DocumentCleanup.java
        Repository.java
        RepositoryEvent.java
        RepositoryListener.java
        TmlAnnotatorTask.java
        TmlCleanupTask.java
        TmlIndexerTask.java
        importers
        AbstractImporter.java
        HtmlImporter.java
        Importer.java
        PdfImporter.java
        TextImporter.java
        test
        AbstractTmlIndexingTest.java
        utils
        DBUtils.java
        DistanceLib.java
        Highlighting.java
        JDBCUtils.java
        LanczosSVDLIBCUtils.java
        LanczosSVDPACKCUtils.java
        LuceneUtils.java
        MatrixUtils.java
        RegexUtils.java
        StanfordUtils.java
        Stats.java
        WordNetUtils.java
        vectorspace
        EmptyTextPassageException.java
        NoDocumentsInCorpusException.java
        NotEnoughTermsInCorpusException.java
        SVD.java
        SemanticSpace.java
        TermWeighting.java
        TermWeightingException.java
        factorisation
        MatrixFactorisation.java
        MultiDimensionalScalingNR.java
        NonnegativeMatrixFactorisationED.java
        NonnegativeMatrixFactorisationKL.java
        PrincipalCoordinateAnalysis.java
        ProbabilisticLatentSemanticAnalysis.java
        SingularValueDecomposition.java
        SpaceDecomposition.java
        operations
        AbstractOperation.java
        ClassDiscovery.java
        CompoundNounsSummarized.java
        ConceptExtraction.java
        FactorAnalysisPlot.java
        LastPassage.java
        LexiconAnalysis.java
        Operation.java
        OperationEvent.java
        OperationListener.java
        ParagraphCoherenceIndex.java
        PassageDistances.java
        PassageExtractionSummarization.java
        PassagesSimilarity.java
        RapidAutomaticKeywordExtraction.java
        Readability.java
        RelationshipExtraction.java
        Summary.java
        TagClouds.java
        TermExtractionSummarization.java
        results
        AbstractResult.java
        FactorAnalysisPlotResult.java
        LastPassageResult.java
        LexiconAnalysisResult.java
        NullResult.java
        ParagraphCoherenceIndexResult.java
        PassageClusteringLingoResult.java
        PassageDistancesResult.java
        PassageExtractionSummarizationResult.java
        PassageSimilarityResult.java
        RapidAutomaticKeywordExtractionResult.java
        ReadabilityResult.java
        RelationshipExtractionResult.java
        Summary.java
        SummaryResult.java
        TagCloudsResult.java
        TermRankedResult.java
        TermsExtractionSummarizationResult.java
        summarization
        AbstractSummarizationOperation.java
        LatentSemanticAnalysisSummarization.java
        SummarizationOperation.java
        VectorLengthSummarization.java
        visualizations
        AbstractVisualization.java
        TagClouds.java
        Visualization.java
    - test
      - java
        tml
        test
        DbConnectionTest.java
        FactorAnalysisPlotTest.java
        IndexingDocumentsTest.java
        IndexingHtmlTest.java
        IndexingInvalidDocumentsTest.java
        IndexingPlainTextTest.java
        LanczosTest.java
        LuceneSearchTest.java
        NonNegativeMatrixFactorizationTest.java
        RapidAutomaticKeywordExtractionTest.java
        ReadabilityTest.java
        SimpleCorpusTest.java
        StemmingTest.java
        TagCloudsTest.java
        ValidateBerryDumaisTest.java
        ValidateDistancesTest.java
        ValidateHandbookOfLSATest.java
        ValidateIntroToLSATest.java
        ValidateSameDistancesAllDimensions.java

/*******************************************************************************
 *  Copyright 2007, 2009 Ming Liu
 *  
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License. 
 *  You may obtain a copy of the License at 
 *  
 *  	http://www.apache.org/licenses/LICENSE-2.0 
 *  	
 *  Unless required by applicable law or agreed to in writing, software 
 *  distributed under the License is distributed on an "AS IS" BASIS, 
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 *  See the License for the specific language governing permissions and 
 *  limitations under the License.
 *******************************************************************************/
package tml.annotators;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
/*******************************************************************************
 * Copyright (C) 2001, 2009 University of Sydney
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
 * USA
 * 
 * http://www.gnu.org/licenses/gpl.txt
 *******************************************************************************/
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;

import org.apache.log4j.Logger;
import org.apache.lucene.queryParser.ParseException;

import tml.Configuration;
import tml.corpus.SentenceCorpus;
import tml.corpus.TextDocument;
import tml.corpus.CorpusParameters.TermSelection;
import tml.storage.Repository;
import tml.utils.DBUtils;
import tml.vectorspace.NoDocumentsInCorpusException;
import tml.vectorspace.NotEnoughTermsInCorpusException;
import tml.vectorspace.TermWeightingException;

/**
 * Manages sentence-level meta information (annotations) for documents.
 * <p>
 * For every document flagged as <code>Unprocessed</code> in the <code>docs</code>
 * table, the manager loads the document's sentences from the Lucene
 * repository, runs every configured {@link Annotator} over each sentence and
 * stores the result in the <code>metainfo</code> table. The database settings,
 * the Lucene index path and the list of annotators are all read from the TML
 * properties.
 * <p>
 * NOTE(review): every SQL statement in this class is built by string
 * concatenation because {@link DBUtils} only exposes string based
 * <code>sendQuery</code>/<code>sendUpdate</code> calls here. Values are passed
 * through {@link #escapeSql(String)} (single quotes doubled), which is a
 * mitigation and not a replacement for prepared statements.
 *
 * @author Ming Liu
 */
public class AnnotatorManager {

	// General attributes
	/** The logger for log4j */
	private static final Logger logger = Logger.getLogger(AnnotatorManager.class);

	/** Factor that converts a {@link System#nanoTime()} difference into seconds. */
	private static final double NANOS_TO_SECONDS = 1e-9;

	/** Status of a document that still has to be annotated. */
	private static final String STATUS_UNPROCESSED = "Unprocessed";
	/** Status of a document whose sentences were annotated and stored. */
	private static final String STATUS_PROCESSED = "processed";
	/** Status of a document for which at least one annotator returned nothing. */
	private static final String STATUS_FAILURE = "failure";
	/** Status of a document whose sentences could not be loaded from Lucene. */
	private static final String STATUS_UNAVAILABLE = "Unavailable in Lucene";

	private final String driver;
	private final String url;
	private final String username;
	private final String password;
	private final String indexpath;
	/** Annotators currently loaded; rebuilt by {@link #getAnnotators()}. */
	private final List<Annotator> annotators = new ArrayList<Annotator>();
	// Kept package-private and non-final for backward compatibility.
	DBUtils dbutil = null;
	private final Repository repository;

	/**
	 * Creates a manager configured from the TML properties: the database
	 * driver, URL, user name and password, and the Lucene index path.
	 *
	 * @throws Exception if the properties cannot be read or the database
	 *                   helper or the repository cannot be created
	 */
	public AnnotatorManager() throws Exception {
		// Read default properties and initialize database connection parameters
		driver = Configuration.getTmlProperties().getProperty("tml.database.driver");
		url = Configuration.getTmlProperties().getProperty("tml.database.url");
		username = Configuration.getTmlProperties().getProperty("tml.database.username");
		password = Configuration.getTmlProperties().getProperty("tml.database.password");
		// TODO: Analyze if storing the indexpath in the properties file violates having
		// one repository per JVM. It should be a different properties file.
		indexpath = Configuration.getTmlProperties().getProperty("tml.lucene.indexpath");
		dbutil = new DBUtils(driver, url, username, password);
		repository = new Repository(indexpath);
	}

	/**
	 * Annotates every unprocessed document and inserts the annotated text into
	 * the database.
	 * <p>
	 * The annotators are (re)loaded first, then each document returned by
	 * {@link #searchDocTable()} is annotated sentence by sentence. The database
	 * connection is closed when the run ends, whether it ends normally or with
	 * an exception.
	 */
	public void insertMetainfoToDB() {
		dbutil.setConnection();
		try {
			getAnnotators();
			ArrayList<String> unprocessedList = searchDocTable();
			if (unprocessedList == null) {
				// Defensive: nothing to do when the query produced no list.
				return;
			}
			for (String documentid : unprocessedList) {
				annotateDocument(documentid);
			}
		} finally {
			dbutil.closeConnection();
		}
	}

	/**
	 * Runs every loaded annotator over every sentence of one document, stores
	 * the results and records the final status of the document.
	 * <p>
	 * The status becomes "Unavailable in Lucene" when the sentences cannot be
	 * loaded, "failure" when at least one annotator returned <code>null</code>,
	 * and "processed" when at least one annotation was stored without any
	 * failure. A document without sentences (or with no annotators loaded)
	 * keeps its current status.
	 *
	 * @param documentid id of the document to annotate
	 */
	private void annotateDocument(String documentid) {
		HashMap<String, String> sentencesandid = getSentenceFromLucene(documentid);
		if (sentencesandid == null) {
			updateDocTable(documentid, STATUS_UNAVAILABLE);
			return;
		}

		boolean failed = false;
		boolean stored = false;
		for (Map.Entry<String, String> sentence : sentencesandid.entrySet()) {
			for (Annotator annotator : annotators) {
				long start = System.nanoTime();
				String annotatedText = annotator.getAnnotations(sentence.getValue());
				double time = (System.nanoTime() - start) * NANOS_TO_SECONDS;

				if (annotatedText == null) {
					// Nothing to store for this sentence/annotator pair; remember
					// the failure so it is not hidden by later successes.
					failed = true;
					continue;
				}
				// dbinsert() stores the annotated text verbatim, so escape it here.
				dbinsert(sentence.getKey(), documentid, escapeSql(annotatedText),
						annotator.getFieldName(), time);
				stored = true;
			}
		}

		if (failed) {
			updateDocTable(documentid, STATUS_FAILURE);
		} else if (stored) {
			updateDocTable(documentid, STATUS_PROCESSED);
		}
	}

	/**
	 * Retrieves the sentences of a document from the Lucene repository.
	 *
	 * @param documentid id of the document whose sentences are requested
	 * @return a map from sentence id to the value of the sentence's
	 *         "contents" field, or <code>null</code> if the sentences could not
	 *         be loaded for any reason (the cause is logged)
	 */
	public HashMap<String, String> getSentenceFromLucene(String documentid) {
		try {
			TextDocument document = repository.getTextDocument(documentid);
			SentenceCorpus corpus = new SentenceCorpus(document);
			corpus.getParameters().setTermSelectionCriterion(TermSelection.TF);
			corpus.getParameters().setTermSelectionThreshold(0);
			corpus.load(repository);

			HashMap<String, String> sentenceContent = new HashMap<String, String>();
			for (String sentenceId : corpus.getPassages()) {
				sentenceContent.put(sentenceId, repository.getDocumentField(sentenceId, "contents"));
			}
			return sentenceContent;
		} catch (Exception e) {
			// Callers rely on null to mean "not available"; keep that contract
			// but report the cause through the logger instead of stderr.
			logger.error("Could not load sentences of document " + documentid, e);
			return null;
		}
	}

	/**
	 * Retrieves the stored meta information of a document for one annotation
	 * type from the <code>metainfo</code> table.
	 *
	 * @param docid id of the document
	 * @param type  annotator name the rows were stored under
	 * @return the values of the <code>metadata</code> column of the matching
	 *         rows, as returned by the database helper
	 */
	public ArrayList<String> getMetaInfoBydocId(String docid, String type) {
		dbutil.setConnection();
		return dbutil.sendQuery("select metadata from metainfo where docid='" + escapeSql(docid)
				+ "'and annotator='" + escapeSql(type) + "';", "metadata");
	}

	/**
	 * Inserts one row of meta information into the <code>metainfo</code> table.
	 * A failed insert is logged, not thrown.
	 *
	 * @param sentenceid    id of the annotated sentence
	 * @param docid         id of the document the sentence belongs to
	 * @param annotatedtext annotation to store; it is inserted verbatim, so the
	 *                      caller must already have escaped single quotes (as
	 *                      {@link #insertMetainfoToDB()} does)
	 * @param type          name of the annotator that produced the annotation
	 * @param time          time the annotation took, in seconds
	 */
	public void dbinsert(String sentenceid, String docid, String annotatedtext, String type, double time) {
		dbutil.setConnection();
		int result = dbutil.sendUpdate("insert into metainfo values('" + escapeSql(sentenceid) + "','"
				+ escapeSql(docid) + "','" + annotatedtext + "','" + escapeSql(type) + "','" + time + "');");
		if (result == -1) {
			logger.error("fail to insert to metainfo table");
		}
	}

	/**
	 * Loads the annotators listed (comma separated) in the
	 * <code>tml.annotators</code> property.
	 * <p>
	 * Each name is resolved to a class in the <code>tml.annotators</code>
	 * package, instantiated and initialised. Annotators that cannot be created
	 * or initialised are logged and skipped. The list of loaded annotators is
	 * replaced on every call, so calling this method repeatedly does not
	 * register an annotator twice. If the property cannot be read or is not
	 * set, the current list is left untouched.
	 */
	public void getAnnotators() {
		String annotatorNames = null;
		try {
			annotatorNames = Configuration.getTmlProperties().getProperty("tml.annotators");
		} catch (IOException e1) {
			logger.error("Could not read the TML properties to load the annotators", e1);
		}
		if (annotatorNames == null) {
			logger.warn("No annotators configured in tml.annotators");
			return;
		}

		List<Annotator> loaded = new ArrayList<Annotator>();
		for (String rawName : annotatorNames.split(",")) {
			String annotatorName = rawName.trim();
			if (annotatorName.length() == 0) {
				continue;
			}
			try {
				Annotator annotator = Class.forName("tml.annotators." + annotatorName)
						.asSubclass(Annotator.class)
						.getDeclaredConstructor()
						.newInstance();
				// Initialise before registering so a broken annotator is never used.
				annotator.init();
				loaded.add(annotator);
			} catch (Exception e) {
				logger.error("Default annotator not found! " + annotatorName, e);
			}
		}
		this.annotators.clear();
		this.annotators.addAll(loaded);
	}

	/**
	 * Registers a document in the <code>docs</code> table with status
	 * "Unprocessed" and the current date and time (24-hour clock). A failed
	 * insert is logged, not thrown.
	 *
	 * @param docid id of the document to register
	 */
	public void insertDocTable(String docid) {
		dbutil.setConnection();
		// HH (0-23) instead of hh (1-12): without an AM/PM marker the 12-hour
		// form cannot distinguish morning from afternoon.
		SimpleDateFormat tempDate = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
		String datetime = tempDate.format(new java.util.Date());

		int result = dbutil.sendUpdate("insert into docs values('" + escapeSql(docid) + "','"
				+ STATUS_UNPROCESSED + "','" + datetime + "');");
		if (result == -1) {
			logger.error("fail to insert to docs table");
		}
	}

	/**
	 * Looks up the documents that still have to be annotated.
	 *
	 * @return the ids of all rows of the <code>docs</code> table whose status
	 *         is "Unprocessed", as returned by the database helper
	 */
	public ArrayList<String> searchDocTable() {
		dbutil.setConnection();
		return dbutil.sendQuery("select docid from docs where status='" + STATUS_UNPROCESSED + "';", "docid");
	}

	/**
	 * Sets the status of a document in the <code>docs</code> table. A failed
	 * update is logged, not thrown.
	 *
	 * @param docid  id of the document to update
	 * @param status new status value
	 */
	public void updateDocTable(String docid, String status) {
		dbutil.setConnection();
		int result = dbutil.sendUpdate("update docs set status='" + escapeSql(status)
				+ "' where docid='" + escapeSql(docid) + "'");
		if (result == -1) {
			logger.error("fail to update to doc table");
		}
	}

	/**
	 * Doubles single quotes so a value cannot terminate the SQL string literal
	 * it is embedded in.
	 * <p>
	 * NOTE(review): this does not handle backslash escapes, which some
	 * database configurations also interpret; prepared statements in
	 * {@link DBUtils} would be the proper fix.
	 *
	 * @param value text to embed between single quotes, may be <code>null</code>
	 * @return the escaped text, or <code>null</code> if <code>value</code> is
	 *         <code>null</code>
	 */
	private static String escapeSql(String value) {
		return value == null ? null : value.replace("'", "''");
	}

}