/** * */ package tml.storage; import java.util.List; import org.apache.log4j.Logger; import org.apache.lucene.document.Document; import tml.corpus.CorpusParameters; import tml.corpus.TextDocument; import tml.corpus.CorpusParameters.DimensionalityReduction; import tml.corpus.CorpusParameters.TermSelection; import tml.vectorspace.TermWeighting.GlobalWeight; import tml.vectorspace.TermWeighting.LocalWeight; /** * @author Jorge Villalon * */ public class DocumentCleanup implements Runnable { private static Logger logger = Logger.getLogger(DocumentCleanup.class); private Repository repository; private CorpusParameters params; public DocumentCleanup(Repository repo) { this.repository = repo; this.params = new CorpusParameters(); this.params.setDimensionalityReduction(DimensionalityReduction.NO); this.params.setDimensionalityReductionThreshold(0); this.params.setLanczosSVD(false); this.params.setNormalizeDocuments(false); this.params.setTermSelectionCriterion(TermSelection.DF); this.params.setTermSelectionThreshold(0); this.params.setTermWeightGlobal(GlobalWeight.None); this.params.setTermWeightLocal(LocalWeight.TF); } /* (non-Javadoc) * @see java.lang.Runnable#run() */ @Override public void run() { logger.debug("Document cleanup started"); int total = 0; List<TextDocument> docs; try { docs = this.repository.getAllTextDocuments(); } catch (Exception e) { logger.error(e.getMessage()); return; } if(docs == null) { logger.debug("No documents to cleanup"); return; } for(TextDocument doc : docs) { try { String[][] subs = this.repository.getDbConnection().getSubDocuments(doc.getExternalId()); if(subs.length <= 1) { logger.debug("Inserting document in the database:" + doc.getExternalId()); Document document = repository.getIndexReader().document(doc.getLuceneId()); this.repository.getDbConnection().insertDocument(repository, document); doc.setParameters(this.params); doc.load(repository); for(int id : doc.getSentenceCorpus().getPassagesLuceneIds()) { Document sentence = repository.getIndexReader().document(id); this.repository.getDbConnection().insertDocument(repository, sentence); } for(int id : doc.getParagraphCorpus().getPassagesLuceneIds()) { Document sentence = repository.getIndexReader().document(id); this.repository.getDbConnection().insertDocument(repository, sentence); } total++; } } catch (Exception e) { logger.error(e.getMessage()); continue; } } if(total > 0) logger.info("Cleaned " + total + " documents"); else logger.debug("Nothing to clean!"); } }