package org.bbaw.wsp.cms.lucene;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.SetBasedFieldSelector;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.search.similar.MoreLikeThis;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.bbaw.wsp.cms.collections.Collection;
import org.bbaw.wsp.cms.collections.CollectionReader;
import org.bbaw.wsp.cms.dochandler.DocumentHandler;
import org.bbaw.wsp.cms.document.Hits;
import org.bbaw.wsp.cms.document.MetadataRecord;
import org.bbaw.wsp.cms.document.Token;
import org.bbaw.wsp.cms.document.XQuery;
import org.bbaw.wsp.cms.general.Constants;
import org.bbaw.wsp.cms.scheduler.CmsDocOperation;
import org.bbaw.wsp.cms.translator.MicrosoftTranslator;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Form;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler;
import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;
import de.mpg.mpiwg.berlin.mpdl.util.Util;
/**
 * Central access point to the Lucene indexes of the CMS: one index of whole
 * documents and one index of single XML nodes. Obtained as a singleton via
 * {@link #getInstance()}; resources are opened in init() and released in end().
 */
public class IndexHandler {
private static IndexHandler instance; // lazily created singleton, see getInstance()
private IndexWriter documentsIndexWriter; // writer for the per-document index
private IndexWriter nodesIndexWriter; // writer for the per-XML-node index
private SearcherManager documentsSearcherManager; // hands out searchers over the documents index
private SearcherManager nodesSearcherManager; // hands out searchers over the nodes index
private IndexReader documentsIndexReader; // reader used for term enumeration and term frequency vectors
private PerFieldAnalyzerWrapper documentsPerFieldAnalyzer; // per-field analyzers used when parsing document queries
private PerFieldAnalyzerWrapper nodesPerFieldAnalyzer; // per-field analyzers used when parsing node queries
// private TaxonomyWriter taxonomyWriter; // TODO facet
// private TaxonomyReader taxonomyReader; // TODO facet
/**
 * Returns the singleton IndexHandler, creating and initializing it on first use.
 * <p>
 * Synchronized so that concurrent first callers cannot both run init() (which
 * opens index writers). The instance field is only assigned after init()
 * succeeded; previously a failing init() left a broken, half-initialized
 * singleton cached for all later callers.
 *
 * @throws ApplicationException if opening the underlying indexes fails
 */
public static synchronized IndexHandler getInstance() throws ApplicationException {
  if (instance == null) {
    IndexHandler newInstance = new IndexHandler();
    newInstance.init();
    instance = newInstance; // publish only after successful initialization
  }
  return instance;
}
/**
 * Opens all Lucene resources: both index writers, a SearcherManager on top of
 * each writer (so searches see near-real-time updates), and a plain reader over
 * the documents index. The writers must be opened before the searcher managers,
 * which wrap them.
 */
private void init() throws ApplicationException {
documentsIndexWriter = getDocumentsWriter();
documentsIndexWriter.setMaxFieldLength(1000000); // raise the per-field token limit so long documents are indexed completely
nodesIndexWriter = getNodesWriter();
nodesIndexWriter.setMaxFieldLength(1000000);
documentsSearcherManager = getNewSearcherManager(documentsIndexWriter);
nodesSearcherManager = getNewSearcherManager(nodesIndexWriter);
documentsIndexReader = getDocumentsReader();
// taxonomyWriter = getTaxonomyWriter(); // TODO facet
// taxonomyReader = getTaxonomyReader(); // TODO facet
}
/**
 * Replaces the given document in both the documents index and the nodes index.
 * Delete and re-add are committed together on both writers; on any failure both
 * writers are rolled back and the original cause is rethrown wrapped in an
 * ApplicationException.
 *
 * @param docOperation carries the document id and the metadata record to index
 * @throws ApplicationException if indexing or committing fails
 */
public void indexDocument(CmsDocOperation docOperation) throws ApplicationException {
try {
// first delete document in documentsIndex and nodesIndex
deleteDocumentLocal(docOperation);
indexDocumentLocal(docOperation);
// taxonomyWriter.commit(); // TODO facet
documentsIndexWriter.commit();
nodesIndexWriter.commit();
} catch (Exception e) {
try {
// taxonomyWriter.rollback(); // TODO facet
documentsIndexWriter.rollback();
nodesIndexWriter.rollback();
} catch (Exception ex) {
// best-effort rollback: deliberately ignored, the original failure is rethrown below
}
throw new ApplicationException(e);
}
}
/**
 * Builds the Lucene document for the metadata record and adds it to the
 * documents index; for XML documents, additionally adds one Lucene document per
 * extracted XML element to the nodes index. Does NOT commit — the caller
 * (indexDocument) commits or rolls back both writers.
 * <p>
 * Field conventions used throughout: metadata fields are stored and analyzed;
 * each "...Sorted" twin is lower-cased (where textual) and NOT_ANALYZED so it
 * can be used as a sort key; full-text token fields carry term vectors with
 * positions and offsets for highlighting.
 *
 * @param docOperation carries the metadata record (and fallback source URL)
 * @throws ApplicationException if building or adding any document fails
 */
private void indexDocumentLocal(CmsDocOperation docOperation) throws ApplicationException {
try {
MetadataRecord mdRecord = docOperation.getMdRecord();
String docId = mdRecord.getDocId();
// List<CategoryPath> categories = new ArrayList<CategoryPath>(); // TODO facet
Document doc = new Document();
Field docIdField = new Field("docId", docId, Field.Store.YES, Field.Index.ANALYZED);
doc.add(docIdField);
String docIdSortedStr = docId.toLowerCase(); // so that sorting is lower case
Field docIdFieldSorted = new Field("docIdSorted", docIdSortedStr, Field.Store.YES, Field.Index.NOT_ANALYZED);
doc.add(docIdFieldSorted);
String identifier = mdRecord.getIdentifier();
if (identifier != null) {
Field identifierField = new Field("identifier", identifier, Field.Store.YES, Field.Index.ANALYZED);
doc.add(identifierField);
}
String uri = mdRecord.getUri();
// fall back to the operation's source URL when the record carries no URI
if (uri == null)
uri = docOperation.getSrcUrl();
if (uri != null) {
Field uriField = new Field("uri", uri, Field.Store.YES, Field.Index.ANALYZED);
doc.add(uriField);
}
String collectionNames = mdRecord.getCollectionNames();
if (collectionNames != null) {
Field collectionNamesField = new Field("collectionNames", collectionNames, Field.Store.YES, Field.Index.ANALYZED);
doc.add(collectionNamesField);
}
if (mdRecord.getCreator() != null) {
// categories.add(new CategoryPath("author", mdRecord.getCreator())); // TODO facet
Field authorField = new Field("author", mdRecord.getCreator(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(authorField);
String authorStr = mdRecord.getCreator();
if (authorStr != null)
authorStr = authorStr.toLowerCase(); // so that sorting is lower case
Field authorFieldSorted = new Field("authorSorted", authorStr, Field.Store.YES, Field.Index.NOT_ANALYZED);
doc.add(authorFieldSorted);
}
if (mdRecord.getTitle() != null) {
Field titleField = new Field("title", mdRecord.getTitle(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(titleField);
String titleStr = mdRecord.getTitle();
if (titleStr != null)
titleStr = titleStr.toLowerCase(); // so that sorting is lower case
Field titleFieldSorted = new Field("titleSorted", titleStr, Field.Store.YES, Field.Index.NOT_ANALYZED);
doc.add(titleFieldSorted);
}
if (mdRecord.getPublisher() != null) {
Field publisherField = new Field("publisher", mdRecord.getPublisher(), Field.Store.YES, Field.Index.ANALYZED);
doc.add(publisherField);
String publisherStr = mdRecord.getPublisher();
if (publisherStr != null)
publisherStr = publisherStr.toLowerCase(); // so that sorting is lower case
Field publisherFieldSorted = new Field("publisherSorted", publisherStr, Field.Store.YES, Field.Index.NOT_ANALYZED);
doc.add(publisherFieldSorted);
}
// prefer the record's explicit year; otherwise derive it from the publishing date
String yearStr = mdRecord.getYear();
if (yearStr == null) {
Date pubDate = mdRecord.getPublishingDate();
if (pubDate != null) {
Calendar cal = Calendar.getInstance();
cal.setTime(pubDate);
int year = cal.get(Calendar.YEAR);
yearStr = String.valueOf(year);
}
}
if (yearStr != null) {
Field dateField = new Field("date", yearStr, Field.Store.YES, Field.Index.ANALYZED);
doc.add(dateField);
Field dateFieldSorted = new Field("dateSorted", yearStr, Field.Store.YES, Field.Index.NOT_ANALYZED);
doc.add(dateFieldSorted);
}
if (mdRecord.getSubject() != null) {
Field subjectField = new Field("subject", mdRecord.getSubject(), Field.Store.YES, Field.Index.ANALYZED);
doc.add(subjectField);
}
if (mdRecord.getRights() != null) {
Field rightsField = new Field("rights", mdRecord.getRights(), Field.Store.YES, Field.Index.ANALYZED);
doc.add(rightsField);
}
if (mdRecord.getLicense() != null) {
Field licenseField = new Field("license", mdRecord.getLicense(), Field.Store.YES, Field.Index.ANALYZED);
doc.add(licenseField);
}
if (mdRecord.getAccessRights() != null) {
Field accessRightsField = new Field("accessRights", mdRecord.getAccessRights(), Field.Store.YES, Field.Index.ANALYZED);
doc.add(accessRightsField);
}
if (mdRecord.getLastModified() != null) {
Date lastModified = mdRecord.getLastModified();
String xsDateStr = new Util().toXsDate(lastModified);
Field lastModifiedField = new Field("lastModified", xsDateStr, Field.Store.YES, Field.Index.ANALYZED);
doc.add(lastModifiedField);
// the sort twin stores epoch milliseconds so it sorts chronologically as a number string
long time = lastModified.getTime();
String timeStr = String.valueOf(time);
Field lastModifiedFieldSorted = new Field("lastModifiedSorted", timeStr, Field.Store.YES, Field.Index.NOT_ANALYZED);
doc.add(lastModifiedFieldSorted);
}
if (mdRecord.getSchemaName() != null) {
Field schemaField = new Field("schemaName", mdRecord.getSchemaName(), Field.Store.YES, Field.Index.ANALYZED);
doc.add(schemaField);
String schemaStr = mdRecord.getSchemaName();
if (schemaStr != null)
schemaStr = schemaStr.toLowerCase(); // so that sorting is lower case
Field schemaFieldSorted = new Field("schemaNameSorted", schemaStr, Field.Store.YES, Field.Index.NOT_ANALYZED);
doc.add(schemaFieldSorted);
}
if (mdRecord.getPersons() != null) {
Field personsField = new Field("persons", mdRecord.getPersons(), Field.Store.YES, Field.Index.ANALYZED);
doc.add(personsField);
}
if (mdRecord.getPlaces() != null) {
Field placesField = new Field("places", mdRecord.getPlaces(), Field.Store.YES, Field.Index.ANALYZED);
doc.add(placesField);
}
String language = mdRecord.getLanguage();
if (language != null) {
Field languageField = new Field("language", mdRecord.getLanguage(), Field.Store.YES, Field.Index.ANALYZED);
doc.add(languageField);
String langStr = mdRecord.getLanguage();
if (langStr != null)
langStr = langStr.toLowerCase(); // so that sorting is lower case
Field languageFieldSorted = new Field("languageSorted", langStr, Field.Store.YES, Field.Index.NOT_ANALYZED);
doc.add(languageFieldSorted);
}
int pageCount = mdRecord.getPageCount();
if (pageCount != -1) { // -1 marks "unknown page count"
String pageCountStr = String.valueOf(pageCount);
Field pageCountField = new Field("pageCount", pageCountStr, Field.Store.YES, Field.Index.ANALYZED);
doc.add(pageCountField);
}
// full-text token layers (orig/reg/norm/morph) with term vectors for highlighting
String docTokensOrig = mdRecord.getTokenOrig();
if (docTokensOrig != null) {
Field tokenOrigField = new Field("tokenOrig", docTokensOrig, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(tokenOrigField);
}
String docTokensReg = mdRecord.getTokenReg();
if (docTokensReg != null) {
Field tokenRegField = new Field("tokenReg", docTokensReg, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(tokenRegField);
}
String docTokensNorm = mdRecord.getTokenNorm();
if (docTokensNorm != null) {
Field tokenNormField = new Field("tokenNorm", docTokensNorm, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(tokenNormField);
}
String docTokensMorph = mdRecord.getTokenMorph();
if (docTokensMorph != null) {
Field tokenMorphField = new Field("tokenMorph", docTokensMorph, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(tokenMorphField);
}
String contentXml = mdRecord.getContentXml();
if (contentXml != null) {
Field contentXmlField = new Field("xmlContent", contentXml, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(contentXmlField);
}
String content = mdRecord.getContent();
if (content != null) {
Field contentField = new Field("content", content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(contentField);
}
// save the webUrl field: built from the collection's web base URL plus the
// "webId" XQuery result of the record, when both are available
Hashtable<String, XQuery> xqueriesHashtable = mdRecord.getxQueries();
if (xqueriesHashtable != null) {
String webUri = null;
Enumeration<String> keys = xqueriesHashtable.keys();
if (keys != null && keys.hasMoreElements()) {
XQuery xQueryWebId = xqueriesHashtable.get("webId");
if (xQueryWebId != null) {
String webId = xQueryWebId.getResult();
String collectionName = mdRecord.getCollectionNames();
if (collectionName != null && webId != null) {
webId = webId.trim();
Collection collection = CollectionReader.getInstance().getCollection(collectionName);
String webBaseUrl = collection.getWebBaseUrl();
if (webBaseUrl != null)
webUri = webBaseUrl + "/" + webId;
}
}
}
if (webUri != null) {
Field webUriField = new Field("webUri", webUri, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
doc.add(webUriField);
}
}
// TODO facet
// facet creation
// CategoryDocumentBuilder categoryDocBuilder = new CategoryDocumentBuilder(taxonomyWriter);
// categoryDocBuilder.setCategoryPaths(categories);
// categoryDocBuilder.build(doc);
documentsIndexWriter.addDocument(doc);
DocumentHandler docHandler = new DocumentHandler();
boolean docIsXml = docHandler.isDocXml(docId);
if (docIsXml) {
// add all elements with the specified names of the document to nodesIndex
ArrayList<XmlTokenizerContentHandler.Element> xmlElements = mdRecord.getXmlElements();
for (int i = 0; i < xmlElements.size(); i++) {
XmlTokenizerContentHandler.Element element = xmlElements.get(i);
Document nodeDoc = new Document();
nodeDoc.add(docIdField); // node documents share the docId field of the parent document
String nodeLanguage = element.lang;
// element inherits the document language when it has none of its own
if (nodeLanguage == null)
nodeLanguage = language;
String nodePageNumber = String.valueOf(element.pageNumber);
String nodeLineNumber = String.valueOf(element.lineNumber);
String nodeElementName = String.valueOf(element.name);
String nodeElementDocPosition = String.valueOf(element.docPosition);
String nodeElementAbsolutePosition = String.valueOf(element.position);
String nodeElementPagePosition = String.valueOf(element.pagePosition);
String nodeElementPosition = String.valueOf(element.elemPosition);
String nodeXmlId = element.xmlId;
String nodeXpath = element.xpath;
String nodeXmlContent = element.toXmlString();
String nodeTokensOrig = element.getTokensStr("orig");
String nodeTokensReg = element.getTokensStr("reg");
String nodeTokensNorm = element.getTokensStr("norm");
String nodeTokensMorph = element.getTokensStr("morph");
if (nodeLanguage != null) {
Field nodeLanguageField = new Field("language", nodeLanguage, Field.Store.YES, Field.Index.ANALYZED);
nodeDoc.add(nodeLanguageField);
}
Field nodePageNumberField = new Field("pageNumber", nodePageNumber, Field.Store.YES, Field.Index.ANALYZED);
nodeDoc.add(nodePageNumberField);
Field nodeLineNumberField = new Field("lineNumber", nodeLineNumber, Field.Store.YES, Field.Index.ANALYZED);
nodeDoc.add(nodeLineNumberField);
Field nodeElementNameField = new Field("elementName", nodeElementName, Field.Store.YES, Field.Index.ANALYZED);
nodeDoc.add(nodeElementNameField);
Field nodeElementDocPositionField = new Field("elementDocPosition", nodeElementDocPosition, Field.Store.YES, Field.Index.ANALYZED);
nodeDoc.add(nodeElementDocPositionField);
Field nodeElementDocPositionFieldSorted = new Field("elementDocPositionSorted", nodeElementDocPosition, Field.Store.YES, Field.Index.NOT_ANALYZED);
nodeDoc.add(nodeElementDocPositionFieldSorted);
Field nodeElementAbsolutePositionField = new Field("elementAbsolutePosition", nodeElementAbsolutePosition, Field.Store.YES, Field.Index.ANALYZED);
nodeDoc.add(nodeElementAbsolutePositionField);
Field nodeElementPagePositionField = new Field("elementPagePosition", nodeElementPagePosition, Field.Store.YES, Field.Index.ANALYZED);
nodeDoc.add(nodeElementPagePositionField);
Field nodeElementPositionField = new Field("elementPosition", nodeElementPosition, Field.Store.YES, Field.Index.ANALYZED);
nodeDoc.add(nodeElementPositionField);
if (nodeXmlId != null) {
Field nodeXmlIdField = new Field("xmlId", nodeXmlId, Field.Store.YES, Field.Index.ANALYZED);
nodeDoc.add(nodeXmlIdField);
}
if (nodeXpath != null) {
Field nodeXpathField = new Field("xpath", nodeXpath, Field.Store.YES, Field.Index.ANALYZED);
nodeDoc.add(nodeXpathField);
}
if (nodeXmlContent != null) {
Field nodeXmlContentField = new Field("xmlContent", nodeXmlContent, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
nodeDoc.add(nodeXmlContentField);
}
if (nodeXmlContent != null) {
String nodeXmlContentTokenized = toTokenizedXmlString(nodeXmlContent, nodeLanguage);
Field nodeXmlContentTokenizedField = new Field("xmlContentTokenized", nodeXmlContentTokenized, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
nodeDoc.add(nodeXmlContentTokenizedField);
}
if (nodeTokensOrig != null) {
Field nodeTokenOrigField = new Field("tokenOrig", nodeTokensOrig, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
nodeDoc.add(nodeTokenOrigField);
}
if (nodeTokensReg != null) {
Field nodeTokenRegField = new Field("tokenReg", nodeTokensReg, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
nodeDoc.add(nodeTokenRegField);
}
if (nodeTokensNorm != null) {
Field nodeTokenNormField = new Field("tokenNorm", nodeTokensNorm, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
nodeDoc.add(nodeTokenNormField);
}
if (nodeTokensMorph != null) {
Field nodeTokenMorphField = new Field("tokenMorph", nodeTokensMorph, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
nodeDoc.add(nodeTokenMorphField);
}
nodesIndexWriter.addDocument(nodeDoc);
}
}
} catch (Exception e) {
throw new ApplicationException(e);
}
}
/**
 * Removes the document identified by the operation from both indexes and
 * commits the deletion; on failure both writers are rolled back and the
 * cause is rethrown wrapped in an ApplicationException.
 *
 * @param docOperation carries the id of the document to delete
 * @throws ApplicationException if the deletion or the commit fails
 */
public void deleteDocument(CmsDocOperation docOperation) throws ApplicationException {
  try {
    deleteDocumentLocal(docOperation);
    documentsIndexWriter.commit();
    nodesIndexWriter.commit();
  } catch (Exception cause) {
    try {
      documentsIndexWriter.rollback();
      nodesIndexWriter.rollback();
    } catch (Exception ignored) {
      // best-effort rollback: the failure below is the one worth reporting
    }
    throw new ApplicationException(cause);
  }
}
/**
 * Deletes the document from both the documents index and the nodes index
 * without committing; callers are responsible for commit/rollback.
 *
 * @param docOperation carries the id of the document to delete
 * @throws ApplicationException if either writer fails to delete
 */
private void deleteDocumentLocal(CmsDocOperation docOperation) throws ApplicationException {
  String docId = docOperation.getDocIdentifier();
  try {
    // both indexes key their documents by the shared "docId" field
    Term docIdTerm = new Term("docId", docId);
    documentsIndexWriter.deleteDocuments(docIdTerm);
    nodesIndexWriter.deleteDocuments(docIdTerm);
  } catch (Exception e) {
    throw new ApplicationException(e);
  }
}
/**
 * Searches the documents index.
 *
 * @param queryStr Lucene query string; "*" matches all documents
 * @param sortFieldNames optional sort criteria; null sorts by relevance
 * @param language language used for morphological query expansion (may be null)
 * @param from 0-based index of the first hit to return (inclusive)
 * @param to index of the last hit to return (inclusive; clamped to the result size)
 * @param withHitFragments if true, highlighted content fragments are attached to each hit
 * @param translate if true, query terms are additionally translated during expansion
 * @return the hits in the requested range (never more than 10000 are collected)
 * @throws ApplicationException if parsing or searching fails
 */
public Hits queryDocuments(String queryStr, String[] sortFieldNames, String language, int from, int to, boolean withHitFragments, boolean translate) throws ApplicationException {
  Hits hits = null;
  IndexSearcher searcher = null;
  try {
    makeDocumentsSearcherManagerUpToDate();
    searcher = documentsSearcherManager.acquire();
    String defaultQueryFieldName = "tokenOrig";
    QueryParser queryParser = new QueryParser(Version.LUCENE_35, defaultQueryFieldName, documentsPerFieldAnalyzer);
    Query query = null;
    if (queryStr.equals("*")) {
      query = new MatchAllDocsQuery();
    } else {
      query = queryParser.parse(queryStr);
    }
    Query morphQuery = buildMorphQuery(query, language, false, translate);
    // the highlighter query is expanded with all word forms so every form gets highlighted
    Query highlighterQuery = buildMorphQuery(query, language, true, translate);
    if (query instanceof PhraseQuery || query instanceof PrefixQuery || query instanceof FuzzyQuery || query instanceof TermRangeQuery) {
      highlighterQuery = query; // TODO also handle these query types when they are nested recursively
    }
    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
    QueryScorer queryScorer = new QueryScorer(highlighterQuery);
    Highlighter highlighter = new Highlighter(htmlFormatter, queryScorer);
    // TODO facet
    // TopScoreDocCollector topDocsCollector = TopScoreDocCollector.create(10, true);
    // FacetSearchParams facetSearchParams = new FacetSearchParams();
    // facetSearchParams.addFacetRequest(new CountFacetRequest(new CategoryPath("author"), 10));
    // FacetsCollector facetsCollector = new FacetsCollector(facetSearchParams, documentsIndexReader, taxonomyReader);
    // searcher.search(morphQuery, MultiCollector.wrap(topDocsCollector, facetsCollector));
    // List<FacetResult> facetResult = facetsCollector.getFacetResults();
    TopDocs resultDocs = null;
    if (sortFieldNames != null) {
      Sort sort = buildSort(sortFieldNames, "doc"); // build sort criteria
      resultDocs = searcher.search(morphQuery, 10000, sort);
    } else {
      resultDocs = searcher.search(morphQuery, 10000);
    }
    // searcher.search never returns null, so no null guard is needed on resultDocs
    resultDocs.setMaxScore(1);
    int toTmp = to;
    if (resultDocs.scoreDocs.length <= to)
      toTmp = resultDocs.scoreDocs.length - 1; // clamp the range end to the last available hit
    ArrayList<org.bbaw.wsp.cms.document.Document> docs = new ArrayList<org.bbaw.wsp.cms.document.Document>();
    for (int i = from; i <= toTmp; i++) {
      int docID = resultDocs.scoreDocs[i].doc;
      FieldSelector docFieldSelector = getDocFieldSelector();
      Document luceneDoc = searcher.doc(docID, docFieldSelector);
      org.bbaw.wsp.cms.document.Document doc = new org.bbaw.wsp.cms.document.Document(luceneDoc);
      if (withHitFragments) {
        ArrayList<String> hitFragments = new ArrayList<String>();
        Fieldable docContentField = luceneDoc.getFieldable("content");
        if (docContentField != null) {
          String docContent = docContentField.stringValue();
          TokenStream tokenStream = TokenSources.getAnyTokenStream(this.documentsIndexReader, docID, docContentField.name(), documentsPerFieldAnalyzer);
          // at most 3 highlighted fragments per document
          TextFragment[] textfragments = highlighter.getBestTextFragments(tokenStream, docContent, true, 3);
          for (int j = 0; j < textfragments.length; j++) {
            hitFragments.add(checkHitFragment(textfragments[j].toString()));
          }
        }
        if (! hitFragments.isEmpty())
          doc.setHitFragments(hitFragments);
      }
      docs.add(doc);
    }
    hits = new Hits(docs, from, to);
    hits.setSize(resultDocs.scoreDocs.length); // total number of matches, not just the returned page
    hits.setQuery(morphQuery);
  } catch (Exception e) {
    throw new ApplicationException(e);
  } finally {
    try {
      if (searcher != null)
        documentsSearcherManager.release(searcher);
    } catch (IOException e) {
      // nothing
    }
  }
  // Do not use searcher after this!
  searcher = null;
  return hits;
}
/**
 * Searches the XML nodes of a single document, sorted by node position.
 * <p>
 * Fixes two searcher lifecycle bugs: the searcher, acquired from
 * nodesSearcherManager, was released to documentsSearcherManager (the wrong
 * manager), and searcher.close() was called on a managed searcher — a managed
 * searcher must only be returned via release() on the manager it came from.
 *
 * @param docId id of the document whose nodes are searched
 * @param queryStr Lucene query string over the node token fields
 * @param from 0-based index of the first hit to return (inclusive)
 * @param to index of the last hit to return (inclusive; clamped to the result size)
 * @return the node hits, or null if no document with that docId is indexed
 * @throws ApplicationException if parsing or searching fails
 */
public Hits queryDocument(String docId, String queryStr, int from, int to) throws ApplicationException {
  Hits hits = null;
  IndexSearcher searcher = null;
  MetadataRecord docMetadataRecord = getDocMetadata(docId);
  if (docMetadataRecord == null)
    return null; // no document with that docId is in index
  try {
    makeNodesSearcherManagerUpToDate();
    searcher = nodesSearcherManager.acquire();
    String fieldNameDocId = "docId";
    Query queryDocId = new QueryParser(Version.LUCENE_35, fieldNameDocId, nodesPerFieldAnalyzer).parse(docId);
    String defaultQueryFieldName = "tokenOrig";
    Query query = new QueryParser(Version.LUCENE_35, defaultQueryFieldName, nodesPerFieldAnalyzer).parse(queryStr);
    String language = docMetadataRecord.getLanguage();
    if (language == null || language.equals("")) {
      // fall back to the main language of the document's collection
      String collectionNames = docMetadataRecord.getCollectionNames();
      Collection collection = CollectionReader.getInstance().getCollection(collectionNames);
      if (collection != null) {
        String mainLang = collection.getMainLanguage();
        if (mainLang != null)
          language = mainLang;
      }
    }
    Query morphQuery = buildMorphQuery(query, language);
    // restrict the morphological query to the nodes of this document
    BooleanQuery queryDoc = new BooleanQuery();
    queryDoc.add(queryDocId, BooleanClause.Occur.MUST);
    queryDoc.add(morphQuery, BooleanClause.Occur.MUST);
    Sort sortByPosition = new Sort(new SortField("position", SortField.INT));
    TopDocs topDocs = searcher.search(queryDoc, 100000, sortByPosition);
    topDocs.setMaxScore(1);
    int toTmp = to;
    if (topDocs.scoreDocs.length <= to)
      toTmp = topDocs.scoreDocs.length - 1; // clamp the range end to the last available hit
    ArrayList<org.bbaw.wsp.cms.document.Document> docs = new ArrayList<org.bbaw.wsp.cms.document.Document>();
    for (int i = from; i <= toTmp; i++) {
      int docID = topDocs.scoreDocs[i].doc;
      FieldSelector nodeFieldSelector = getNodeFieldSelector();
      Document luceneDoc = searcher.doc(docID, nodeFieldSelector);
      docs.add(new org.bbaw.wsp.cms.document.Document(luceneDoc));
    }
    hits = new Hits(docs, from, to);
    hits.setSize(topDocs.scoreDocs.length); // total number of matching nodes
  } catch (Exception e) {
    throw new ApplicationException(e);
  } finally {
    try {
      if (searcher != null)
        nodesSearcherManager.release(searcher); // release to the manager it was acquired from
    } catch (IOException e) {
      // nothing
    }
  }
  // Do not use searcher after this!
  searcher = null;
  return hits;
}
/**
 * Reads the stored metadata fields of the indexed document and assembles them
 * into a MetadataRecord. The repeated getFieldable/null-check/stringValue
 * pattern is factored into {@link #getFieldValue(Document, String)}.
 *
 * @param docId id of the document to look up
 * @return the metadata record, or null if no document with that id is indexed
 * @throws ApplicationException if the index lookup fails
 */
public MetadataRecord getDocMetadata(String docId) throws ApplicationException {
  Document doc = getDocument(docId);
  if (doc == null)
    return null;
  String identifier = getFieldValue(doc, "identifier");
  String uri = getFieldValue(doc, "uri");
  String webUri = getFieldValue(doc, "webUri");
  String collectionNames = getFieldValue(doc, "collectionNames");
  String author = getFieldValue(doc, "author");
  String title = getFieldValue(doc, "title");
  String language = getFieldValue(doc, "language");
  if (language == null) {
    // no language stored: fall back to the main language of the document's collection
    Collection collection = CollectionReader.getInstance().getCollection(collectionNames);
    if (collection != null) {
      String mainLang = collection.getMainLanguage();
      if (mainLang != null)
        language = mainLang;
    }
  }
  Date yearDate = null;
  String dateStr = getFieldValue(doc, "date");
  if (dateStr != null && !dateStr.equals("")) {
    dateStr = StringUtils.deresolveXmlEntities(dateStr);
    String yearStr = new Util().toYearStr(dateStr); // test if possible
    // etc
    if (yearStr != null) {
      // the stored value is only a year; expand it to the first instant of that year
      yearDate = new Util().toDate(yearStr + "-01-01T00:00:00.000Z");
    }
  }
  String rights = getFieldValue(doc, "rights");
  String license = getFieldValue(doc, "license");
  String accessRights = getFieldValue(doc, "accessRights");
  int pageCount = -1; // -1 marks "unknown page count"
  String pageCountStr = getFieldValue(doc, "pageCount");
  if (pageCountStr != null)
    pageCount = Integer.parseInt(pageCountStr);
  String personsStr = getFieldValue(doc, "persons");
  String placesStr = getFieldValue(doc, "places");
  String schemaName = getFieldValue(doc, "schemaName");
  Date lastModified = null;
  String lastModifiedXSDateStr = getFieldValue(doc, "lastModified");
  if (lastModifiedXSDateStr != null)
    lastModified = new Util().toDate(lastModifiedXSDateStr);
  MetadataRecord mdRecord = new MetadataRecord();
  mdRecord.setDocId(docId);
  mdRecord.setUri(uri);
  mdRecord.setWebUri(webUri);
  mdRecord.setIdentifier(identifier);
  mdRecord.setCollectionNames(collectionNames);
  mdRecord.setCreator(author);
  mdRecord.setTitle(title);
  mdRecord.setDate(yearDate);
  mdRecord.setLanguage(language);
  mdRecord.setLicense(license);
  mdRecord.setRights(rights);
  mdRecord.setAccessRights(accessRights);
  mdRecord.setPageCount(pageCount);
  mdRecord.setPersons(personsStr);
  mdRecord.setPlaces(placesStr);
  mdRecord.setSchemaName(schemaName);
  mdRecord.setLastModified(lastModified);
  return mdRecord;
}

/**
 * Returns the stored string value of the named field, or null when the
 * document has no such field.
 */
private static String getFieldValue(Document doc, String fieldName) {
  Fieldable field = doc.getFieldable(fieldName);
  if (field == null)
    return null;
  return field.stringValue();
}
/**
 * Returns up to count index terms of the given field, starting at the first
 * term that is >= value.
 * <p>
 * Fix: TermEnum.term() returns null when the enumeration is positioned past
 * the last term of the index (e.g. empty index or value beyond every term);
 * the old condition dereferenced it unconditionally and threw a
 * NullPointerException. A null check now guards the dereference.
 *
 * @param fieldName index field whose terms are enumerated
 * @param value lower bound prefix for the enumeration; null is treated as ""
 * @param count maximum number of tokens to return
 * @return the tokens, or null if no matching term was found
 * @throws ApplicationException if the term enumeration fails
 */
public ArrayList<Token> getToken(String fieldName, String value, int count) throws ApplicationException {
  ArrayList<Token> retToken = null;
  int counter = 0;
  TermEnum terms = null;
  try {
    if (value == null)
      value = "";
    Term term = new Term(fieldName, value);
    makeIndexReaderUpToDate();
    terms = documentsIndexReader.terms(term);
    // stop when the enumeration is exhausted or leaves the requested field
    while (terms != null && fieldName != null && terms.term() != null && fieldName.equals(terms.term().field()) && counter < count) {
      if (retToken == null)
        retToken = new ArrayList<Token>();
      Term termContent = terms.term();
      Token token = new Token(termContent);
      retToken.add(token);
      counter++;
      if (!terms.next())
        break;
    }
  } catch (Exception e) {
    throw new ApplicationException(e);
  } finally {
    if (terms != null) {
      try {
        terms.close();
      } catch (IOException e) {
        // nothing
      }
    }
  }
  return retToken;
}
/**
 * Returns up to count tokens (with frequencies) of one field of one document,
 * starting at the first term of the document's term vector whose text starts
 * with value.
 * <p>
 * Fix: topDocs.scoreDocs[0] threw an ArrayIndexOutOfBoundsException when no
 * document matched the docId — searcher.search never returns null, so the old
 * topDocs != null guard could not catch this. The guard now checks that at
 * least one hit exists.
 *
 * @param docId id of the document whose term vector is read
 * @param fieldName field whose term vector is read
 * @param value prefix at which token collection starts; null is treated as ""
 * @param count maximum number of tokens to return
 * @return the tokens, or null if the document or its term vector is missing
 * @throws ApplicationException if the index lookup fails
 */
public ArrayList<Token> getToken(String docId, String fieldName, String value, int count) throws ApplicationException {
  ArrayList<Token> retToken = null;
  if (value == null)
    value = "";
  int counter = 0;
  IndexSearcher searcher = null;
  try {
    makeDocumentsSearcherManagerUpToDate();
    makeIndexReaderUpToDate();
    searcher = documentsSearcherManager.acquire();
    Query queryDocId = new TermQuery(new Term("docId", docId));
    TopDocs topDocs = searcher.search(queryDocId, 1);
    if (topDocs != null && topDocs.scoreDocs.length > 0) {
      int docIdInt = topDocs.scoreDocs[0].doc;
      TermFreqVector termFreqVector = documentsIndexReader.getTermFreqVector(docIdInt, fieldName);
      if (termFreqVector != null) {
        String[] terms = termFreqVector.getTerms();
        int[] freqs = termFreqVector.getTermFrequencies();
        // once a term starting with value is seen, all following terms are collected
        boolean success = false;
        if (terms != null) {
          retToken = new ArrayList<Token>();
          for (int i = 0; i < terms.length; i++) {
            String termStr = terms[i];
            if (termStr.startsWith(value))
              success = true;
            if (success) {
              counter++;
              int freq = freqs[i];
              Term t = new Term(fieldName, termStr);
              Token tok = new Token(t);
              tok.setFreq(freq);
              retToken.add(tok);
            }
            if (counter >= count)
              break;
          }
        }
      }
    }
  } catch (Exception e) {
    throw new ApplicationException(e);
  } finally {
    try {
      if (searcher != null)
        documentsSearcherManager.release(searcher);
    } catch (IOException e) {
      // nothing
    }
  }
  // Do not use searcher after this!
  searcher = null;
  return retToken;
}
/**
 * Closes every index resource held by this handler.
 * <p>
 * Fix: the old sequential close calls aborted on the first IOException,
 * leaking every resource after the failing one. Now each resource is closed
 * independently and the first failure is rethrown after all close attempts.
 *
 * @throws ApplicationException wrapping the first IOException encountered
 */
public void end() throws ApplicationException {
  IOException firstError = null;
  // TODO facet
  // if (taxonomyWriter != null)
  //   taxonomyWriter.close();
  try {
    if (documentsIndexWriter != null)
      documentsIndexWriter.close();
  } catch (IOException e) {
    firstError = e;
  }
  try {
    if (nodesIndexWriter != null)
      nodesIndexWriter.close();
  } catch (IOException e) {
    if (firstError == null)
      firstError = e;
  }
  try {
    if (documentsSearcherManager != null)
      documentsSearcherManager.close();
  } catch (IOException e) {
    if (firstError == null)
      firstError = e;
  }
  try {
    if (nodesSearcherManager != null)
      nodesSearcherManager.close();
  } catch (IOException e) {
    if (firstError == null)
      firstError = e;
  }
  try {
    if (documentsIndexReader != null)
      documentsIndexReader.close();
  } catch (IOException e) {
    if (firstError == null)
      firstError = e;
  }
  if (firstError != null)
    throw new ApplicationException(firstError);
}
// Convenience overload: morphological expansion without all-forms expansion and without translation.
private Query buildMorphQuery(Query query, String language) throws ApplicationException {
return buildMorphQuery(query, language, false, false);
}
/**
 * Dispatches the morphological expansion by concrete query type: term and
 * boolean queries are expanded recursively by their dedicated overloads; all
 * other query types are passed through unchanged.
 *
 * @param query the parsed user query
 * @param language language used for the morphological lookup (may be null)
 * @param withAllForms if true, all word forms are expanded (for highlighting)
 * @param translate if true, terms are additionally translated
 * @return the (possibly expanded) query
 */
private Query buildMorphQuery(Query query, String language, boolean withAllForms, boolean translate) throws ApplicationException {
  if (query instanceof TermQuery)
    return buildMorphQuery((TermQuery) query, language, withAllForms, translate);
  if (query instanceof BooleanQuery)
    return buildMorphQuery((BooleanQuery) query, language, withAllForms, translate);
  // all other cases: PrefixQuery, PhraseQuery, FuzzyQuery, TermRangeQuery, ...
  return query;
}
/**
 * Expands a single term query morphologically and optionally translates it.
 * Only the field "tokenMorph" is expanded to lemmas (or to all of their word
 * forms); for every other field the term is used as is, optionally translated.
 * All generated alternatives are combined into one boolean OR query.
 *
 * @param inputTermQuery the original term query
 * @param fromLang source language code; if null/empty the language is detected
 *        from the term text via MicrosoftTranslator
 * @param withAllForms if true every word form of each lemma is added (needed in
 *        fragments search so all forms get highlighted); otherwise only the
 *        lemma names are added
 * @param translate if true the collected terms are additionally translated into
 *        the hard-coded target languages
 * @return a boolean SHOULD query over all generated term alternatives
 */
private Query buildMorphQuery(TermQuery inputTermQuery, String fromLang, boolean withAllForms, boolean translate) throws ApplicationException {
String[] toLanguages = {"deu", "eng", "fra"}; // TODO
String fromLanguage = null;
String inputTerm = inputTermQuery.getTerm().text();
// determine the source language: use the given one or detect it from the term text
if (fromLang == null || fromLang.isEmpty()) {
String detectedLang = MicrosoftTranslator.detectLanguageCode(inputTerm);
if (detectedLang != null)
fromLanguage = detectedLang;
} else {
fromLanguage = fromLang;
}
LexHandler lexHandler = LexHandler.getInstance();
String fieldName = inputTermQuery.getTerm().field();
// collects all alternative term queries to be OR-ed together at the end
ArrayList<TermQuery> queryTerms = new ArrayList<TermQuery>();
if (fieldName != null && fieldName.equals("tokenMorph")) {
ArrayList<Lemma> lemmas = lexHandler.getLemmas(inputTerm, "form", fromLanguage, Normalizer.DICTIONARY, true);
if (lemmas == null) { // if no lemmas are found then do a query in tokenOrig TODO should this really be done ?
if (translate) {
// no lemmas: translate the raw input term and query it in tokenOrig
String[] terms = {inputTerm};
ArrayList<String> translatedTerms = MicrosoftTranslator.translate(terms, fromLanguage, toLanguages);
for (int i=0; i<translatedTerms.size(); i++) {
String translatedTerm = translatedTerms.get(i);
Term translatedTermTokenOrig = new Term("tokenOrig", translatedTerm);
TermQuery translatedTermQueryInTokenOrig = new TermQuery(translatedTermTokenOrig);
queryTerms.add(translatedTermQueryInTokenOrig);
}
} else {
// no lemmas and no translation: fall back to the raw term in tokenOrig
Term termTokenOrig = new Term("tokenOrig", inputTerm);
TermQuery termQueryInTokenOrig = new TermQuery(termTokenOrig);
queryTerms.add(termQueryInTokenOrig);
}
} else {
if (translate) {
// collect lemma names (or all word forms) first, then translate them in one call
ArrayList<String> morphTerms = new ArrayList<String>();
for (int i=0; i<lemmas.size(); i++) {
Lemma lemma = lemmas.get(i);
if (withAllForms) { // all word forms are put into the query as boolean or clauses: needed in fragments search when all forms should be highlighted
ArrayList<Form> forms = lemma.getFormsList();
for (int j=0; j<forms.size(); j++) {
Form form = forms.get(j);
String formName = form.getFormName();
morphTerms.add(formName);
}
} else {
String lemmaName = lemma.getLemmaName();
morphTerms.add(lemmaName);
}
}
String[] morphTermsArray = morphTerms.toArray(new String[morphTerms.size()]);
ArrayList<String> translatedMorphTerms = MicrosoftTranslator.translate(morphTermsArray, fromLanguage, toLanguages);
for (int i=0; i<translatedMorphTerms.size(); i++) {
String translatedMorphTermStr = translatedMorphTerms.get(i);
Term translatedMorphTerm = new Term(fieldName, translatedMorphTermStr);
TermQuery translatedMorphTermQuery = new TermQuery(translatedMorphTerm);
queryTerms.add(translatedMorphTermQuery);
}
} else {
// no translation: add lemma names (or all word forms) directly as query terms
for (int i = 0; i < lemmas.size(); i++) {
Lemma lemma = lemmas.get(i);
if (withAllForms) { // all word forms are put into the query as boolean or clauses: needed in fragments search when all forms should be highlighted
ArrayList<Form> forms = lemma.getFormsList();
for (int j=0; j<forms.size(); j++) {
Form form = forms.get(j);
Term formTerm = new Term(fieldName, form.getFormName());
TermQuery morphTermQuery = new TermQuery(formTerm);
queryTerms.add(morphTermQuery);
}
} else {
Term lemmaTerm = new Term(fieldName, lemma.getLemmaName());
TermQuery morphTermQuery = new TermQuery(lemmaTerm);
queryTerms.add(morphTermQuery);
}
}
}
}
} else {
// if it is not the morph field then do a normal query
if (translate) {
String inputTermQueryField = inputTermQuery.getTerm().field();
String inputTermQueryStr = inputTermQuery.getTerm().text();
String[] terms = {inputTermQueryStr};
ArrayList<String> translatedTerms = MicrosoftTranslator.translate(terms, fromLanguage, toLanguages);
for (int i=0; i<translatedTerms.size(); i++) {
String translatedTerm = translatedTerms.get(i);
Term translatedTermTokenOrig = new Term(inputTermQueryField, translatedTerm);
TermQuery translatedTermQueryInTokenOrig = new TermQuery(translatedTermTokenOrig);
queryTerms.add(translatedTermQueryInTokenOrig);
}
} else {
queryTerms.add(inputTermQuery);
}
//TODO ?? perhaps other fields should also be queried morphological e.g. title etc.
}
Query retQuery = buildBooleanShouldQuery(queryTerms);
return retQuery;
}
/** Combines the given term queries into one boolean query with SHOULD (OR) semantics. */
private Query buildBooleanShouldQuery(ArrayList<TermQuery> queryTerms) throws ApplicationException {
  BooleanQuery orQuery = new BooleanQuery();
  for (TermQuery termQuery : queryTerms)
    orQuery.add(termQuery, BooleanClause.Occur.SHOULD);
  return orQuery;
}
/**
 * Recursively expands each clause of a boolean query morphologically while
 * keeping the clause's original occur flag (MUST/SHOULD/MUST_NOT).
 */
private Query buildMorphQuery(BooleanQuery query, String language, boolean withAllForms, boolean translate) throws ApplicationException {
  BooleanQuery expandedQuery = new BooleanQuery();
  for (BooleanClause clause : query.getClauses()) {
    Query expandedSubQuery = buildMorphQuery(clause.getQuery(), language, withAllForms, translate);
    expandedQuery.add(expandedSubQuery, clause.getOccur());
  }
  return expandedQuery;
}
/**
 * Parses the query string (default field "tokenOrig") and returns all terms it
 * contains.
 *
 * @param queryStr the raw query string
 * @return the textual terms of the parsed query
 */
public ArrayList<String> fetchTerms(String queryStr) throws ApplicationException {
  final String defaultQueryFieldName = "tokenOrig";
  try {
    QueryParser parser = new QueryParser(Version.LUCENE_35, defaultQueryFieldName, nodesPerFieldAnalyzer);
    return fetchTerms(parser.parse(queryStr));
  } catch (Exception e) {
    throw new ApplicationException(e);
  }
}
/**
 * Recursively fetch all terms of the query.
 *
 * @param query the (possibly nested) query to inspect
 * @return the textual terms contained in the query
 */
private ArrayList<String> fetchTerms(Query query) throws ApplicationException {
  if (query instanceof BooleanQuery)
    return fetchTerms((BooleanQuery) query);
  ArrayList<String> terms = new ArrayList<String>();
  if (query instanceof TermQuery) {
    terms.add(((TermQuery) query).getTerm().text());
  } else {
    // all other cases: PrefixQuery, PhraseQuery, FuzzyQuery, TermRangeQuery, ...
    terms.add(query.toString());
  }
  return terms;
}
/**
 * Collects the terms of all positive (SHOULD or MUST) clauses of a boolean
 * query; terms of MUST_NOT clauses are discarded.
 */
private ArrayList<String> fetchTerms(BooleanQuery query) throws ApplicationException {
  ArrayList<String> collectedTerms = new ArrayList<String>();
  for (BooleanClause clause : query.getClauses()) {
    ArrayList<String> clauseTerms = fetchTerms(clause.getQuery());
    BooleanClause.Occur occur = clause.getOccur();
    if (occur == BooleanClause.Occur.SHOULD || occur == BooleanClause.Occur.MUST)
      collectedTerms.addAll(clauseTerms);
  }
  return collectedTerms;
}
/**
 * Parses the query string (default field "tokenOrig") and returns all terms it
 * contains, expanding morph-field terms to their word forms for the given
 * language.
 *
 * @param queryStr the raw query string
 * @param language language code used for the morphological expansion
 * @return the textual terms of the parsed query
 */
public ArrayList<String> fetchTerms(String queryStr, String language) throws ApplicationException {
  final String defaultQueryFieldName = "tokenOrig";
  try {
    QueryParser parser = new QueryParser(Version.LUCENE_35, defaultQueryFieldName, nodesPerFieldAnalyzer);
    return fetchTerms(parser.parse(queryStr), language);
  } catch (Exception e) {
    throw new ApplicationException(e);
  }
}
/**
 * Recursively fetch all terms of the query, expanding morph-field terms for the
 * given language.
 *
 * @param query the (possibly nested) query to inspect
 * @param language language code used for the morphological expansion
 * @return the textual terms contained in the query
 */
private ArrayList<String> fetchTerms(Query query, String language) throws ApplicationException {
  if (query instanceof TermQuery)
    return fetchTerms((TermQuery) query, language);
  if (query instanceof BooleanQuery)
    return fetchTerms((BooleanQuery) query, language);
  // all other cases: PrefixQuery, PhraseQuery, FuzzyQuery, TermRangeQuery, ...
  ArrayList<String> terms = new ArrayList<String>();
  terms.add(query.toString());
  return terms;
}
/**
 * Expands a single term query to all word forms of its lemmas when it targets
 * the "tokenMorph" field; for every other field the term text itself is
 * returned. Falls back to "eng" when no language is given, and to the raw term
 * when no lemmas are found.
 */
private ArrayList<String> fetchTerms(TermQuery termQuery, String language) throws ApplicationException {
  String effectiveLanguage = (language == null) ? "eng" : language;
  Term queryTerm = termQuery.getTerm();
  String termText = queryTerm.text();
  ArrayList<String> result = new ArrayList<String>();
  if (!"tokenMorph".equals(queryTerm.field())) {
    result.add(termText);
    return result;
  }
  LexHandler lexHandler = LexHandler.getInstance();
  ArrayList<Lemma> lemmas = lexHandler.getLemmas(termText, "form", effectiveLanguage, Normalizer.DICTIONARY, true);
  // TODO: fetch the language through the translator service
  if (lemmas == null) {
    result.add(termText);
    return result;
  }
  for (Lemma lemma : lemmas) {
    for (Form form : lemma.getFormsList())
      result.add(form.getFormName());
  }
  return result;
}
/**
 * Collects the (morphologically expanded) terms of all positive (SHOULD or
 * MUST) clauses of a boolean query; terms of MUST_NOT clauses are discarded.
 */
private ArrayList<String> fetchTerms(BooleanQuery query, String language) throws ApplicationException {
  ArrayList<String> collectedTerms = new ArrayList<String>();
  for (BooleanClause clause : query.getClauses()) {
    ArrayList<String> clauseTerms = fetchTerms(clause.getQuery(), language);
    BooleanClause.Occur occur = clause.getOccur();
    if (occur == BooleanClause.Occur.SHOULD || occur == BooleanClause.Occur.MUST)
      collectedTerms.addAll(clauseTerms);
  }
  return collectedTerms;
}
/**
 * Loads the Lucene document with the given docId.
 *
 * @param docId document identifier (queried against the "docId" field)
 * @return the document restricted to the fields of getDocFieldSelector(), or
 *         null when no document matches
 */
private Document getDocument(String docId) throws ApplicationException {
  Document doc = null;
  IndexSearcher searcher = null;
  try {
    makeDocumentsSearcherManagerUpToDate();
    searcher = documentsSearcherManager.acquire();
    String fieldNameDocId = "docId";
    Query queryDocId = new QueryParser(Version.LUCENE_35, fieldNameDocId, documentsPerFieldAnalyzer).parse(docId);
    TopDocs topDocs = searcher.search(queryDocId, 100000);
    topDocs.setMaxScore(1);
    if (topDocs != null && topDocs.scoreDocs != null && topDocs.scoreDocs.length > 0) {
      int docID = topDocs.scoreDocs[0].doc;
      FieldSelector docFieldSelector = getDocFieldSelector();
      doc = searcher.doc(docID, docFieldSelector);
    }
    // NOTE: the searcher is owned by the SearcherManager and must only be given
    // back via release() in the finally block; the former searcher.close() call
    // here broke the manager's reference counting for concurrent users.
  } catch (Exception e) {
    throw new ApplicationException(e);
  } finally {
    try {
      if (searcher != null)
        documentsSearcherManager.release(searcher);
    } catch (IOException e) {
      // nothing
    }
  }
  // Do not use searcher after this!
  searcher = null;
  return doc;
}
/**
 * Finds documents similar to the given reference document ("more like this"),
 * comparing on the "content" field.
 *
 * @param docId identifier of the reference document
 * @param from index of the first hit to deliver
 * @param to index of the last hit to deliver
 * @return similar documents as Hits, or null when the reference document does
 *         not exist or nothing similar is found
 */
public Hits moreLikeThis(String docId, int from, int to) throws ApplicationException {
  Hits hits = null;
  ArrayList<org.bbaw.wsp.cms.document.Document> wspDocs = null;
  IndexSearcher searcher1 = null;
  IndexSearcher searcher2 = null;
  try {
    makeDocumentsSearcherManagerUpToDate();
    searcher1 = documentsSearcherManager.acquire();
    String fieldNameDocId = "docId";
    Query queryDocId = new QueryParser(Version.LUCENE_35, fieldNameDocId, documentsPerFieldAnalyzer).parse(docId);
    TopDocs topDocs = searcher1.search(queryDocId, 100000);
    topDocs.setMaxScore(1);
    int docID = -1;
    if (topDocs != null && topDocs.scoreDocs != null && topDocs.scoreDocs.length > 0) {
      docID = topDocs.scoreDocs[0].doc;
    }
    // Guard: without this, a missing reference document left docID at -1 and
    // mlt.like(-1) was called with an invalid Lucene doc number.
    if (docID == -1)
      return null;
    makeDocumentsSearcherManagerUpToDate();
    searcher2 = documentsSearcherManager.acquire();
    MoreLikeThis mlt = new MoreLikeThis(documentsIndexReader); // TODO documentsIndexReader is ok ?
    mlt.setFieldNames(new String[]{"content"}); // similarity function works against these fields
    mlt.setMinWordLen(2);
    mlt.setBoost(true);
    Query queryMoreLikeThis = mlt.like(docID);
    TopDocs moreLikeThisDocs = searcher2.search(queryMoreLikeThis, 10);
    moreLikeThisDocs.setMaxScore(10);
    if (moreLikeThisDocs != null) {
      if (wspDocs == null)
        wspDocs = new ArrayList<org.bbaw.wsp.cms.document.Document>();
      for (int i=0; i<moreLikeThisDocs.scoreDocs.length; i++) {
        int docIdent = moreLikeThisDocs.scoreDocs[i].doc;
        Document luceneDoc = searcher2.doc(docIdent);
        org.bbaw.wsp.cms.document.Document wspDoc = new org.bbaw.wsp.cms.document.Document(luceneDoc);
        wspDocs.add(wspDoc);
      }
    }
    if (wspDocs != null) {
      hits = new Hits(wspDocs, from, to);
      hits.setSize(moreLikeThisDocs.scoreDocs.length);
    }
  } catch (Exception e) {
    throw new ApplicationException(e);
  } finally {
    // the finally block also runs on the early return above, so both searchers
    // are always handed back to the manager
    try {
      if (searcher1 != null)
        documentsSearcherManager.release(searcher1);
      if (searcher2 != null)
        documentsSearcherManager.release(searcher2);
    } catch (IOException e) {
      // nothing
    }
  }
  // Do not use searchers after this!
  searcher1 = null;
  searcher2 = null;
  return hits;
}
/**
 * Opens (or creates) the documents index and returns a writer for it.
 * Exact-match fields are analyzed with a KeywordAnalyzer, free-text fields with
 * a StandardAnalyzer; the per-field wrapper built here is also stored in
 * documentsPerFieldAnalyzer for later query parsing.
 */
private IndexWriter getDocumentsWriter() throws ApplicationException {
  String luceneDocsDirectoryStr = Constants.getInstance().getLuceneDocumentsDir();
  File luceneDocsDirectory = new File(luceneDocsDirectoryStr);
  try {
    Map<String, Analyzer> documentsFieldAnalyzers = new HashMap<String, Analyzer>();
    // fields indexed as single exact tokens (ids, uris, mime type, counts, timestamps)
    String[] keywordFields = {"docId", "identifier", "uri", "webUri", "type", "pageCount", "lastModified"};
    for (String field : keywordFields)
      documentsFieldAnalyzers.put(field, new KeywordAnalyzer());
    // free-text metadata and content fields
    String[] textFields = {"collectionNames", "author", "title", "language", "publisher", "date", "subject", "rights", "license", "accessRights", "schemaName", "tokenOrig", "tokenReg", "tokenNorm", "tokenMorph", "xmlContent", "content"};
    for (String field : textFields)
      documentsFieldAnalyzers.put(field, new StandardAnalyzer(Version.LUCENE_35));
    documentsPerFieldAnalyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_35), documentsFieldAnalyzers);
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, documentsPerFieldAnalyzer);
    conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
    conf.setRAMBufferSizeMB(300); // 300 MB because some documents are big; 16 MB is default
    FSDirectory fsDirectory = FSDirectory.open(luceneDocsDirectory);
    IndexWriter writer = new IndexWriter(fsDirectory, conf);
    writer.commit(); // when directory is empty this creates init files
    return writer;
  } catch (IOException e) {
    throw new ApplicationException(e);
  }
}
/**
 * Opens (or creates) the nodes index and returns a writer for it.
 * Structural fields (ids, positions, xpath, page/line numbers) are analyzed as
 * exact keywords; language and token/content fields as free text. The per-field
 * wrapper is stored in nodesPerFieldAnalyzer for later query parsing.
 */
private IndexWriter getNodesWriter() throws ApplicationException {
  String luceneNodesDirectoryStr = Constants.getInstance().getLuceneNodesDir();
  File luceneNodesDirectory = new File(luceneNodesDirectoryStr);
  try {
    Map<String, Analyzer> nodesFieldAnalyzers = new HashMap<String, Analyzer>();
    // exact-token structural fields, e.g. pageNumber "13", elementName "tei:s",
    // xpath "/echo[1]/text[1]/p[1]/s[5]", xmlId "4711bla"
    String[] keywordFields = {"docId", "pageNumber", "lineNumber", "elementName", "elementDocPosition", "elementPosition", "elementAbsolutePosition", "elementPagePosition", "xmlId", "xpath"};
    for (String field : keywordFields)
      nodesFieldAnalyzers.put(field, new KeywordAnalyzer());
    // free-text fields: language (through xml:id, e.g. "lat") and the token/content fields
    String[] textFields = {"language", "tokenOrig", "tokenReg", "tokenNorm", "tokenMorph", "xmlContent"};
    for (String field : textFields)
      nodesFieldAnalyzers.put(field, new StandardAnalyzer(Version.LUCENE_35));
    nodesPerFieldAnalyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_35), nodesFieldAnalyzers);
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, nodesPerFieldAnalyzer);
    conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
    conf.setRAMBufferSizeMB(300); // 300 MB because some documents are big; 16 MB is default
    FSDirectory fsDirectory = FSDirectory.open(luceneNodesDirectory);
    IndexWriter writer = new IndexWriter(fsDirectory, conf);
    writer.commit(); // when directory is empty this creates init files
    return writer;
  } catch (IOException e) {
    throw new ApplicationException(e);
  }
}
/**
 * Builds a Lucene Sort over the given field names (mapped to their "...Sorted"
 * companion fields with the appropriate sort type for documents or nodes).
 * Fix: the original chain tested sortFieldNames.length == 2 twice, so a third
 * sort field was silently ignored; all given fields are now applied.
 *
 * @param sortFieldNames the public field names to sort by, in priority order
 * @param type "node" selects node sort-field types, anything else document types
 * @return a Sort over all given fields (relevance sort when the array is empty)
 */
private Sort buildSort(String[] sortFieldNames, String type) {
  Sort sort = new Sort();
  ArrayList<SortField> sortFields = new ArrayList<SortField>();
  for (int i = 0; i < sortFieldNames.length; i++) {
    String sortFieldName = sortFieldNames[i];
    int sortFieldType = getDocSortFieldType(sortFieldName);
    if (type.equals("node"))
      sortFieldType = getNodeSortFieldType(sortFieldName);
    String realSortFieldName = getDocSortFieldName(sortFieldName);
    sortFields.add(new SortField(realSortFieldName, sortFieldType));
  }
  if (!sortFields.isEmpty())
    sort.setSort(sortFields.toArray(new SortField[sortFields.size()]));
  return sort;
}
/** Maps a public field name to its sorted companion field, e.g. "author" to "authorSorted". */
private String getDocSortFieldName(String fieldName) {
  return fieldName + "Sorted";
}
/** Sort type for a document field: "lastModified" sorts numerically as long, everything else as string. */
private int getDocSortFieldType(String fieldName) {
  if (fieldName.equals("lastModified"))
    return SortField.LONG;
  return SortField.STRING;
}
/** Sort type for a node field: page/line numbers and document positions sort as int, everything else as string. */
private int getNodeSortFieldType(String fieldName) {
  boolean numeric = fieldName.equals("pageNumber") || fieldName.equals("lineNumber") || fieldName.equals("elementDocPosition");
  return numeric ? SortField.INT : SortField.STRING;
}
/** Field selector restricting loaded document fields to the stored metadata and content fields. */
private FieldSelector getDocFieldSelector() {
  String[] fieldNames = {
    "docId", "identifier", "uri", "webUri", "collectionNames", "author", "title",
    "language", "publisher", "date", "subject", "rights", "license", "type",
    "pageCount", "schemaName", "lastModified", "persons", "places", "content"
  };
  HashSet<String> fields = new HashSet<String>();
  for (String fieldName : fieldNames)
    fields.add(fieldName);
  return new SetBasedFieldSelector(fields, fields);
}
/** Field selector restricting loaded node fields to the stored structural and content fields. */
private FieldSelector getNodeFieldSelector() {
  String[] fieldNames = {
    "docId", "language", "pageNumber", "lineNumber", "elementName",
    "elementDocPosition", "elementPosition", "elementAbsolutePosition",
    "elementPagePosition", "xmlId", "xpath", "xmlContent", "xmlContentTokenized"
  };
  HashSet<String> fields = new HashSet<String>();
  for (String fieldName : fieldNames)
    fields.add(fieldName);
  return new SetBasedFieldSelector(fields, fields);
}
/**
 * Creates a SearcherManager on top of the given writer (second argument true:
 * deletes are applied when searchers are reopened).
 */
private SearcherManager getNewSearcherManager(IndexWriter indexWriter) throws ApplicationException {
  try {
    return new SearcherManager(indexWriter, true, null, null);
  } catch (IOException e) {
    throw new ApplicationException(e);
  }
}
/** Opens a read-only IndexReader on the documents index directory. */
private IndexReader getDocumentsReader() throws ApplicationException {
  File luceneDocsDirectory = new File(Constants.getInstance().getLuceneDocumentsDir());
  try {
    FSDirectory fsDirectory = FSDirectory.open(luceneDocsDirectory);
    return IndexReader.open(fsDirectory, true); // true: read-only
  } catch (IOException e) {
    throw new ApplicationException(e);
  }
}
// TODO facet
/** Opens (or creates) the facet taxonomy index and returns a writer for it. */
private TaxonomyWriter getTaxonomyWriter() throws ApplicationException {
  File taxonomyDirF = new File(Constants.getInstance().getLuceneTaxonomyDir());
  try {
    Directory taxonomyDir = FSDirectory.open(taxonomyDirF);
    TaxonomyWriter writer = new DirectoryTaxonomyWriter(taxonomyDir, OpenMode.CREATE_OR_APPEND);
    writer.commit(); // when the directory is empty this creates the initial files
    return writer;
  } catch (IOException e) {
    throw new ApplicationException(e);
  }
}
// TODO facet
/** Opens a reader on the facet taxonomy index directory. */
private TaxonomyReader getTaxonomyReader() throws ApplicationException {
  File taxonomyDirF = new File(Constants.getInstance().getLuceneTaxonomyDir());
  try {
    Directory taxonomyDir = FSDirectory.open(taxonomyDirF);
    return new DirectoryTaxonomyReader(taxonomyDir);
  } catch (IOException e) {
    throw new ApplicationException(e);
  }
}
/**
 * Replaces documentsIndexReader with a fresh reader when the index has changed.
 * Fixes two issues in the original: the superseded reader is now closed (it was
 * simply overwritten, leaking its file handles), and a null result from
 * openIfChanged (index unchanged after all) no longer nulls out the field.
 */
private void makeIndexReaderUpToDate() throws ApplicationException {
  try {
    boolean isCurrent = documentsIndexReader.isCurrent();
    if (!isCurrent) {
      IndexReader newReader = IndexReader.openIfChanged(documentsIndexReader);
      if (newReader != null) {
        IndexReader oldReader = documentsIndexReader;
        documentsIndexReader = newReader;
        // NOTE(review): assumes no other thread is still searching on the old
        // reader at this point — confirm against the callers' threading model
        oldReader.close();
      }
    }
    // taxonomyReader.refresh(); // TODO facet
  } catch (Exception e) {
    throw new ApplicationException(e);
  }
}
/** Reopens the documents SearcherManager when the underlying index has changed. */
private void makeDocumentsSearcherManagerUpToDate() throws ApplicationException {
  try {
    if (!documentsSearcherManager.isSearcherCurrent())
      documentsSearcherManager.maybeReopen();
    // taxonomyReader.refresh(); // TODO facet
  } catch (Exception e) {
    throw new ApplicationException(e);
  }
}
/** Reopens the nodes SearcherManager when the underlying index has changed. */
private void makeNodesSearcherManagerUpToDate() throws ApplicationException {
  try {
    if (!nodesSearcherManager.isSearcherCurrent())
      nodesSearcherManager.maybeReopen();
  } catch (IOException e) {
    throw new ApplicationException(e);
  }
}
/**
 * Wraps the given XML fragment in a namespace-declaring root element, runs it
 * through the XmlTokenizer (with lemma output enabled) and returns the
 * tokenized XML string.
 */
private String toTokenizedXmlString(String xmlStr, String language) throws ApplicationException {
  String openingTag = "<tokenized xmlns:xhtml=\"http://www.w3.org/1999/xhtml\" xmlns:mml=\"http://www.w3.org/1998/Math/MathML\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">";
  String closingTag = "</tokenized>";
  StringReader wrappedInput = new StringReader(openingTag + xmlStr + closingTag);
  XmlTokenizer tokenizer = new XmlTokenizer(wrappedInput);
  tokenizer.setLanguage(language);
  tokenizer.setOutputOptions(new String[] { "withLemmas" });
  tokenizer.tokenize();
  return tokenizer.getXmlResult();
}
/**
 * Escapes every character the Lucene query parser treats specially so the input
 * is searched literally. Fix: the double quote (") is part of Lucene's special
 * character set (+ - && || ! ( ) { } [ ] ^ " ~ * ? : \) but was missing from
 * the escape list; the two-character operators && and || are covered by
 * escaping each single & and |.
 *
 * @param inputStr the raw term text
 * @return the text with a backslash before every Lucene special character
 */
private String escapeLuceneChars(String inputStr) {
  String luceneCharsStr = "+-&|!(){}[]^\"~*?:\\"; // Lucene escape symbols
  StringBuilder retStrBuilder = new StringBuilder();
  for (int i = 0; i < inputStr.length(); i++) {
    char c = inputStr.charAt(i);
    if (luceneCharsStr.indexOf(c) >= 0)
      retStrBuilder.append('\\');
    retStrBuilder.append(c);
  }
  return retStrBuilder.toString();
}
/**
 * Ensures sensible sentence starts in hit fragments: strips a single leading
 * punctuation character if present.
 *
 * @param fragment the raw hit fragment
 * @return the fragment without its leading punctuation character, if any
 */
private String checkHitFragment(String fragment) {
  String leadingPunctuation = ".:,-;?)!";
  if (!fragment.isEmpty() && leadingPunctuation.indexOf(fragment.charAt(0)) >= 0) {
    // finds first occurrence of a given string out.println("first index of point : " + StringUtils.indexOfAny(fragment, "."));
    fragment = fragment.substring(1);
  }
  return fragment;
}
}