/******************************************************************************* * Copyright 2007, 2009 Ming Liu * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ package tml.annotators; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; /******************************************************************************* * Copyright (C) 2001, 2009 University of Sydney * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
 * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 * USA * * http://www.gnu.org/licenses/gpl.txt *******************************************************************************/ import java.util.List; import java.util.Map; import java.util.Set; import java.util.Map.Entry; import org.apache.log4j.Logger; import org.apache.lucene.queryParser.ParseException; import tml.Configuration; import tml.corpus.SentenceCorpus; import tml.corpus.TextDocument; import tml.corpus.CorpusParameters.TermSelection; import tml.storage.Repository; import tml.utils.DBUtils; import tml.vectorspace.NoDocumentsInCorpusException; import tml.vectorspace.NotEnoughTermsInCorpusException; import tml.vectorspace.TermWeightingException; /** * This class implements the management of meta information at the sentence level. * It searches the Lucene index where the sentences are stored, then parses each * sentence and inserts the annotated sentence into the MySQL database. 
* The settings for the Lucene index path and the MySQL database are read from
* the TML properties file.
*
* @author Ming Liu
*
*/
public class AnnotatorManager {

	// General attributes
	/** The logger for log4j */
	private static Logger logger = Logger.getLogger(AnnotatorManager.class);

	/** Value of the <code>tml.database.driver</code> property. */
	private String driver;
	/** Value of the <code>tml.database.url</code> property. */
	private String url;
	/** Value of the <code>tml.database.username</code> property. */
	private String username;
	/** Value of the <code>tml.database.password</code> property. */
	private String password;
	/** Value of the <code>tml.lucene.indexpath</code> property. */
	private String indexpath;
	/** Annotators loaded by {@link #getAnnotators()}. */
	private List<Annotator> annotators = new ArrayList<Annotator>();
	/** Database helper through which every query and update is sent. */
	DBUtils dbutil = null;
	/** Repository the sentences are read from. */
	private Repository repository = null;

	/**
	 * Reads the database and index settings from the TML properties and
	 * creates the database helper and the repository.
	 *
	 * @throws Exception if the properties cannot be read or the helper
	 *             objects cannot be created
	 */
	public AnnotatorManager() throws Exception {
		// Read default properties and initialize database connection parameters
		driver = Configuration.getTmlProperties().getProperty("tml.database.driver");
		url = Configuration.getTmlProperties().getProperty("tml.database.url");
		username = Configuration.getTmlProperties().getProperty("tml.database.username");
		password = Configuration.getTmlProperties().getProperty("tml.database.password");
		// TODO: Analyze if storing the indexpath in the properties file violates having
		// one repository per JVM. It should be a different properties file.
		indexpath = Configuration.getTmlProperties().getProperty("tml.lucene.indexpath");
		dbutil = new DBUtils(driver, url, username, password);
		repository = new Repository(indexpath);
	}

	/**
	 * Annotates every document whose status is 'Unprocessed' and inserts the
	 * annotated text into the database.
	 * <p>
	 * A document whose sentences cannot be loaded is marked
	 * "Unavailable in Lucene". Otherwise it is marked "failure" if any
	 * annotator returned <code>null</code> for any sentence, or "processed"
	 * once all of its annotations have been inserted.
	 */
	public void insertMetainfoToDB() {
		dbutil.setConnection();
		try {
			getAnnotators();
			ArrayList<String> unprocessedList = searchDocTable();
			if (unprocessedList == null) {
				return;
			}
			for (String documentid : unprocessedList) {
				HashMap<String, String> sentencesandid = getSentenceFromLucene(documentid);
				if (sentencesandid == null) {
					updateDocTable(documentid, "Unavailable in Lucene");
					continue;
				}
				annotateDocument(documentid, sentencesandid);
			}
		} finally {
			// Always release the connection, even if an annotator or the
			// database helper throws.
			dbutil.closeConnection();
		}
	}

	/**
	 * Runs every loaded annotator over every sentence of one document, stores
	 * the results and records the final status of the document once.
	 *
	 * @param documentid the id of the document being annotated
	 * @param sentences sentence id mapped to sentence content
	 */
	private void annotateDocument(String documentid, Map<String, String> sentences) {
		boolean failed = false;
		boolean inserted = false;
		for (Map.Entry<String, String> sentence : sentences.entrySet()) {
			for (Annotator annotator : annotators) {
				long start = System.nanoTime();
				String annotatedText = annotator.getAnnotations(sentence.getValue());
				// Nanoseconds to seconds. The previous factor 10E-9 equals 1e-8
				// and overstated the elapsed time tenfold.
				double time = (System.nanoTime() - start) * 1E-9;
				if (annotatedText == null) {
					// Nothing to store; remember the failure instead of
					// dereferencing null below.
					failed = true;
					continue;
				}
				// avoid sql injection, particularly in single quote problem
				annotatedText = annotatedText.replace("'", "''");
				dbinsert(sentence.getKey(), documentid, annotatedText, annotator.getFieldName(), time);
				inserted = true;
			}
		}
		if (failed) {
			updateDocTable(documentid, "failure");
		} else if (inserted) {
			updateDocTable(documentid, "processed");
		}
		// If nothing was attempted (no sentences or no annotators) the status
		// is left untouched.
	}

	/**
	 * Retrieves sentence ids and their contents from the Lucene index for a
	 * document.
	 *
	 * @param documentid the id of the document whose sentences are wanted
	 * @return a map whose keys are sentence ids and whose values are the
	 *         sentence contents, or <code>null</code> if the sentences could
	 *         not be loaded (callers rely on <code>null</code> to detect this)
	 */
	public HashMap<String, String> getSentenceFromLucene(String documentid) {
		try {
			TextDocument document = repository.getTextDocument(documentid);
			SentenceCorpus corpus = new SentenceCorpus(document);
			corpus.getParameters().setTermSelectionCriterion(TermSelection.TF);
			corpus.getParameters().setTermSelectionThreshold(0);
			corpus.load(repository);
			String[] sentences = corpus.getPassages();
			HashMap<String, String> sentenceContent = new HashMap<String, String>();
			for (String sentenceId : sentences) {
				sentenceContent.put(sentenceId, repository.getDocumentField(sentenceId, "contents"));
			}
			return sentenceContent;
		} catch (Exception e) {
			// Every failure is reported through the logger (with stack trace)
			// rather than printed to stderr.
			logger.error("Could not load the sentences of document " + documentid, e);
			return null;
		}
	}

	/**
	 * Retrieves meta information by document id and annotation type from the
	 * database.
	 *
	 * @param docid the document id
	 * @param type the annotator name stored in the <code>annotator</code> column
	 * @return the <code>metadata</code> values returned by the query
	 */
	public ArrayList<String> getMetaInfoBydocId(String docid, String type) {
		// NOTE(review): the connection opened here is not closed by this
		// method - confirm how DBUtils manages repeated setConnection() calls.
		dbutil.setConnection();
		String query = "select metadata from metainfo where docid='" + escapeSql(docid)
				+ "' and annotator='" + escapeSql(type) + "';";
		return dbutil.sendQuery(query, "metadata");
	}

	/**
	 * Inserts one row of meta information into the <code>metainfo</code> table.
	 *
	 * @param sentenceid the sentence id
	 * @param docid the document id
	 * @param annotatedtext the annotated text; single quotes must already be
	 *            doubled by the caller, as {@link #insertMetainfoToDB()} does,
	 *            so it is not escaped again here
	 * @param type the annotator's field name
	 * @param time the time the annotation took, as measured by the caller
	 */
	public void dbinsert(String sentenceid, String docid, String annotatedtext, String type, double time) {
		dbutil.setConnection();
		int result = dbutil.sendUpdate("insert into metainfo values('" + escapeSql(sentenceid) + "','"
				+ escapeSql(docid) + "','" + annotatedtext + "','" + escapeSql(type) + "','" + time + "');");
		if (result == -1) {
			logger.error("fail to insert to metainfo table");
		}
	}

	/**
	 * Loads all the annotators configured in the <code>tml.annotators</code>
	 * property (a comma separated list of class names in the
	 * <code>tml.annotators</code> package).
	 * <p>
	 * Calling this method again reloads the list instead of appending to it.
	 * If the property cannot be read or is missing, the current list is kept.
	 */
	public void getAnnotators() {
		// Loads default annotators
		String configured = null;
		try {
			configured = Configuration.getTmlProperties().getProperty("tml.annotators");
		} catch (IOException e1) {
			logger.error("Could not read the TML properties", e1);
		}
		if (configured == null) {
			logger.warn("No annotators configured in tml.annotators");
			return;
		}

		// Start from scratch so that repeated calls do not register the same
		// annotator several times.
		this.annotators.clear();
		for (String rawName : configured.split(",")) {
			String annotatorName = rawName.trim();
			if (annotatorName.length() == 0) {
				continue;
			}
			try {
				Class<?> classDefinition = Class.forName("tml.annotators." + annotatorName);
				Annotator annotator = (Annotator) classDefinition.newInstance();
				// Initialise first: an annotator whose init() fails must not
				// end up in the list.
				annotator.init();
				this.annotators.add(annotator);
			} catch (Exception e) {
				logger.error("Default annotator not found! " + annotatorName, e);
			}
		}
	}

	/**
	 * Registers a document in the <code>docs</code> table with status
	 * 'Unprocessed' and the current date and time.
	 *
	 * @param docid the document id
	 */
	public void insertDocTable(String docid) {
		dbutil.setConnection();
		// HH = hour of day (0-23). The former "hh" is the 12-hour clock and,
		// without an AM/PM marker, made afternoon timestamps ambiguous.
		SimpleDateFormat tempDate = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
		String status = "Unprocessed";
		String datetime = tempDate.format(new java.util.Date());
		int result = dbutil.sendUpdate("insert into docs values('" + escapeSql(docid) + "','"
				+ status + "','" + datetime + "');");
		if (result == -1) {
			logger.error("fail to insert to docs table");
		}
	}

	/**
	 * Looks up the documents that still have to be annotated.
	 *
	 * @return the ids of the documents whose status is 'Unprocessed'
	 */
	public ArrayList<String> searchDocTable() {
		dbutil.setConnection();
		return dbutil.sendQuery("select docid from docs where status='Unprocessed';", "docid");
	}

	/**
	 * Sets the status of a document in the <code>docs</code> table.
	 *
	 * @param docid the document id
	 * @param status the new status
	 */
	public void updateDocTable(String docid, String status) {
		dbutil.setConnection();
		int result = dbutil.sendUpdate("update docs set status='" + escapeSql(status)
				+ "' where docid='" + escapeSql(docid) + "'");
		if (result == -1) {
			logger.error("fail to update to doc table");
		}
	}

	/**
	 * Doubles single quotes so that a value can be embedded in a quoted SQL
	 * string literal.
	 * <p>
	 * NOTE(review): this only handles single quotes; parameterised statements
	 * would be safer if DBUtils offers them - confirm.
	 *
	 * @param value the raw value, may be <code>null</code>
	 * @return the value with every <code>'</code> replaced by <code>''</code>,
	 *         or <code>null</code> if the value was <code>null</code>
	 */
	private static String escapeSql(String value) {
		return value == null ? null : value.replace("'", "''");
	}
}