package org.bbaw.wsp.cms.dochandler;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.FileNameMap;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.logging.Logger;
import net.sf.saxon.s9api.Axis;
import net.sf.saxon.s9api.QName;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.s9api.XdmNodeKind;
import net.sf.saxon.s9api.XdmSequenceIterator;
import org.apache.commons.io.FileUtils;
import org.bbaw.wsp.cms.dochandler.parser.document.IDocument;
import org.bbaw.wsp.cms.dochandler.parser.text.parser.DocumentParser;
import org.bbaw.wsp.cms.document.MetadataRecord;
import org.bbaw.wsp.cms.document.XQuery;
import org.bbaw.wsp.cms.general.Constants;
import org.bbaw.wsp.cms.lucene.IndexHandler;
import org.bbaw.wsp.cms.scheduler.CmsDocOperation;
import org.bbaw.wsp.cms.transform.GetFragmentsContentHandler;
import org.bbaw.wsp.cms.transform.XslResourceTransformer;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import com.sun.org.apache.xerces.internal.parsers.SAXParser;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.general.Language;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler;
import de.mpg.mpiwg.berlin.mpdl.util.StringUtils;
import de.mpg.mpiwg.berlin.mpdl.util.Util;
import de.mpg.mpiwg.berlin.mpdl.xml.xquery.XQueryEvaluator;
/**
* Handler for documents (singleton).
*/
public class DocumentHandler {
private static Logger LOGGER = Logger.getLogger(DocumentHandler.class.getName());
public void doOperation(CmsDocOperation docOperation) throws ApplicationException{
String operationName = docOperation.getName();
if (operationName.equals("create")) {
create(docOperation);
} else if (operationName.equals("delete")) {
delete(docOperation);
}
}
private void create(CmsDocOperation docOperation) throws ApplicationException {
try {
String operationName = docOperation.getName();
String srcUrlStr = docOperation.getSrcUrl();
String docId = docOperation.getDocIdentifier();
String mainLanguage = docOperation.getMainLanguage();
String[] elementNames = docOperation.getElementNames();
if (elementNames == null) {
String[] defaultElementNames = {"persName", "placeName", "p", "s", "head"};
docOperation.setElementNames(defaultElementNames); // default
}
String docDirName = getDocDir(docId);
String docDestFileName = getDocFullFileName(docId);
boolean docIsXml = isDocXml(docId);
URL srcUrl = null;
String protocol = null;
if (srcUrlStr != null && ! srcUrlStr.equals("empty")) {
srcUrl = new URL(srcUrlStr);
protocol = srcUrl.getProtocol();
}
File docDestFile = new File(docDestFileName);
// perform operation on file system
if (protocol.equals("file")) {
docOperation.setStatus("upload file: " + srcUrlStr + " to CMS");
} else {
docOperation.setStatus("download file from: " + srcUrlStr + " to CMS");
}
FileUtils.copyURLToFile(srcUrl, docDestFile, 100000, 100000);
MetadataRecord mdRecord = docOperation.getMdRecord();
mdRecord.setLastModified(new Date());
String mimeType = getMimeType(docId);
mdRecord.setType(mimeType);
// document is of type XML
if (docIsXml) {
// parse validation on file
XQueryEvaluator xQueryEvaluator = new XQueryEvaluator();
XdmNode docNode = xQueryEvaluator.parse(srcUrl); // if it is not parseable an exception with a detail message is thrown
String docType = getNodeType(docNode);
docType = docType.trim();
if (docType == null) {
FileUtils.deleteQuietly(docDestFile);
docOperation.setErrorMessage("file type of: " + srcUrlStr + "is not supported");
return;
}
// replace anchor in echo documents and also add the number attribute to figures
String docDestFileNameUpgrade = docDestFileName + ".upgrade";
File docDestFileUpgrade = new File(docDestFileNameUpgrade);
XslResourceTransformer replaceAnchorTransformer = new XslResourceTransformer("replaceAnchor.xsl");
String docDestFileUrlStr = docDestFile.getPath();
String result = replaceAnchorTransformer.transform(docDestFileUrlStr);
FileUtils.writeStringToFile(docDestFileUpgrade, result, "utf-8");
// generate toc file (toc, figure, handwritten, persons, places, pages)
XslResourceTransformer tocTransformer = new XslResourceTransformer("toc.xsl");
File tocFile = new File(docDirName + "/toc.xml");
String tocResult = tocTransformer.transform(docDestFileNameUpgrade);
FileUtils.writeStringToFile(tocFile, tocResult, "utf-8");
String persons = getPersons(tocResult, xQueryEvaluator);
String places = getPlaces(tocResult, xQueryEvaluator);
// Get metadata info out of the xml document
mdRecord.setPersons(persons);
mdRecord.setPlaces(places);
docOperation.setStatus("extract metadata of: " + srcUrlStr + " to CMS");
mdRecord = getMetadataRecord(docDestFileUpgrade, docType, mdRecord, xQueryEvaluator);
String mdRecordLanguage = mdRecord.getLanguage();
String langId = Language.getInstance().getLanguageId(mdRecordLanguage); // test if language code is supported
if (langId == null)
mdRecordLanguage = null;
if (mdRecordLanguage == null && mainLanguage != null)
mdRecord.setLanguage(mainLanguage);
// save all pages as single xml files (untokenized and tokenized)
docOperation.setStatus("extract page fragments of: " + srcUrlStr + " to CMS");
File docDir = new File(docDirName + "/pages");
FileUtils.deleteQuietly(docDir); // first delete pages directory
Hashtable<Integer, StringBuilder> pageFragments = getFragments(docDestFileNameUpgrade, "pb");
int pageCount = pageFragments.size();
if (pageCount == 0) {
// no pb element is found: then the whole document is the first page
String docXmlStr = FileUtils.readFileToString(docDestFileUpgrade, "utf-8");
docXmlStr = docXmlStr.replaceAll("<\\?xml.*?\\?>", ""); // remove the xml declaration if it exists
pageFragments = new Hashtable<Integer, StringBuilder>();
pageFragments.put(new Integer(1), new StringBuilder(docXmlStr));
pageCount = 1;
}
for (int page=1; page<=pageCount; page++) {
String fragment = pageFragments.get(new Integer(page)).toString();
fragment = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + fragment;
String docPageFileName = docDirName + "/pages/page-" + page + ".xml";
File docPageFile = new File(docPageFileName);
FileUtils.writeStringToFile(docPageFile, fragment, "utf-8");
String language = mdRecord.getLanguage();
String tokenizedXmlStr = tokenizeWithLemmas(fragment, language);
tokenizedXmlStr = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + tokenizedXmlStr;
String docPageTokenizedFileName = docDirName + "/pages/page-" + page + "-morph.xml";
File docPageTokenizedFile = new File(docPageTokenizedFileName);
FileUtils.writeStringToFile(docPageTokenizedFile, tokenizedXmlStr, "utf-8");
}
}
// build the documents fulltext fields
buildFulltextFields(docOperation);
// perform operation on Lucene
docOperation.setStatus(operationName + " document: " + docId + " in CMS");
IndexHandler indexHandler = IndexHandler.getInstance();
indexHandler.indexDocument(docOperation);
} catch (IOException e) {
throw new ApplicationException(e);
}
}
private void delete(CmsDocOperation docOperation) throws ApplicationException {
String operationName = docOperation.getName();
String docIdentifier = docOperation.getDocIdentifier();
if (docIdentifier == null || docIdentifier.trim().equals(""))
throw new ApplicationException("Your document identifier is empty. Please specify a document identifier for your document.");
String docDirStr = getDocDir(docIdentifier);
File docDir = new File(docDirStr);
boolean docExists = docDir.exists();
if (! docExists) {
throw new ApplicationException("Document:" + docIdentifier + " does not exists. Please use a name that exists and perform the operation \"Delete\" again.");
}
// perform operation on file system
docOperation.setStatus(operationName + " document: " + docIdentifier + " in CMS");
FileUtils.deleteQuietly(docDir);
// perform operation on Lucene
IndexHandler indexHandler = IndexHandler.getInstance();
indexHandler.deleteDocument(docOperation);
}
private void buildFulltextFields(CmsDocOperation docOperation) throws ApplicationException {
try {
MetadataRecord mdRecord = docOperation.getMdRecord();
String docId = mdRecord.getDocId();
String language = mdRecord.getLanguage();
boolean docIsXml = isDocXml(docId);
String docTokensOrig = null;
String docTokensNorm = null;
String docTokensMorph = null;
String contentXml = null;
String content = null;
String docFileName = getDocFullFileName(docId);
XmlTokenizer docXmlTokenizer = null;
if (docIsXml) {
docFileName = getDocFullFileName(docId) + ".upgrade";
InputStreamReader docFileReader = new InputStreamReader(new FileInputStream(docFileName), "utf-8");
// to guarantee that utf-8 is used (if not done, it does not work on Tomcat which has another default charset)
docXmlTokenizer = new XmlTokenizer(docFileReader);
docXmlTokenizer.setDocIdentifier(docId);
docXmlTokenizer.setLanguage(language);
docXmlTokenizer.setOutputFormat("string");
String[] outputOptionsWithLemmas = { "withLemmas" }; // so all tokens are
// fetched with lemmas (costs performance)
docXmlTokenizer.setOutputOptions(outputOptionsWithLemmas);
String[] normFunctionNone = { "none" };
docXmlTokenizer.setNormFunctions(normFunctionNone);
docXmlTokenizer.tokenize();
int pageCount = docXmlTokenizer.getPageCount();
if (pageCount <= 0)
pageCount = 1; // each document at least has one page
String[] outputOptionsEmpty = {};
docXmlTokenizer.setOutputOptions(outputOptionsEmpty);
// must be set to null so that the normalization function works
docTokensOrig = docXmlTokenizer.getStringResult();
String[] normFunctionNorm = { "norm" };
docXmlTokenizer.setNormFunctions(normFunctionNorm);
docTokensNorm = docXmlTokenizer.getStringResult();
docXmlTokenizer.setOutputOptions(outputOptionsWithLemmas);
docTokensMorph = docXmlTokenizer.getStringResult();
// fetch original xml content of the documents file
File docFile = new File(docFileName);
contentXml = FileUtils.readFileToString(docFile, "utf-8");
// fetch original content of the documents file (without xml tags)
XslResourceTransformer charsTransformer = new XslResourceTransformer("chars.xsl");
content = charsTransformer.transform(docFileName);
// get elements from xml tokenizer
String[] elementNamesArray = docOperation.getElementNames();
String elementNames = "";
for (int i = 0; i < elementNamesArray.length; i++) {
String elemName = elementNamesArray[i];
elementNames = elementNames + elemName + " ";
}
elementNames = elementNames.substring(0, elementNames.length() - 1);
ArrayList<XmlTokenizerContentHandler.Element> xmlElements = docXmlTokenizer.getElements(elementNames);
// fill mdRecord
mdRecord.setTokenOrig(docTokensOrig);
mdRecord.setTokenNorm(docTokensNorm);
mdRecord.setTokenMorph(docTokensMorph);
mdRecord.setContentXml(contentXml);
mdRecord.setContent(content);
mdRecord.setXmlElements(xmlElements);
mdRecord.setPageCount(pageCount);
} else {
DocumentParser tikaParser = new DocumentParser();
try {
IDocument tikaDoc = tikaParser.parse(docFileName);
docTokensOrig = tikaDoc.getTextOrig();
MetadataRecord tikaMDRecord = tikaDoc.getMetadata();
if (tikaMDRecord != null) {
int pageCount = tikaMDRecord.getPageCount();
if (pageCount <= 0)
pageCount = 1; // each document at least has one page
mdRecord.setPageCount(pageCount);
}
} catch (ApplicationException e) {
LOGGER.severe(e.getLocalizedMessage());
}
String mdRecordLanguage = mdRecord.getLanguage();
String mainLanguage = docOperation.getMainLanguage();
if (mdRecordLanguage == null && mainLanguage != null)
mdRecord.setLanguage(mainLanguage);
String lang = mdRecord.getLanguage();
DocumentTokenizer docTokenizer = DocumentTokenizer.getInstance();
String[] normFunctions = {"norm"};
ArrayList<Token> normTokens = docTokenizer.getToken(docTokensOrig, lang, normFunctions);
docTokensNorm = docTokenizer.buildStr(normTokens, lang, "norm");
docTokensMorph = docTokenizer.buildStr(normTokens, lang, "morph");
content = docTokensOrig; // content is the same as docTokensOrig
// fill mdRecord
mdRecord.setTokenOrig(docTokensOrig);
mdRecord.setTokenNorm(docTokensNorm);
mdRecord.setTokenMorph(docTokensMorph);
mdRecord.setContent(content);
}
} catch (Exception e) {
throw new ApplicationException(e);
}
}
private MetadataRecord getMetadataRecord(File xmlFile, String schemaName, MetadataRecord mdRecord, XQueryEvaluator xQueryEvaluator) throws ApplicationException {
if (schemaName == null)
return mdRecord;
try {
URL srcUrl = xmlFile.toURI().toURL();
if (schemaName.equals("TEI"))
mdRecord = getMetadataRecordTei(xQueryEvaluator, srcUrl, mdRecord);
else if (schemaName.equals("html"))
mdRecord = getMetadataRecordHtml(xQueryEvaluator, srcUrl, mdRecord);
else
mdRecord.setSchemaName(schemaName); // all other cases: set docType to schemaName
evaluateXQueries(xQueryEvaluator, srcUrl, mdRecord);
} catch (MalformedURLException e) {
throw new ApplicationException(e);
}
return mdRecord;
}
private MetadataRecord evaluateXQueries(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) {
Hashtable<String, XQuery> xqueriesHashtable = mdRecord.getxQueries();
if (xqueriesHashtable != null) {
Enumeration<String> keys = xqueriesHashtable.keys();
while (keys != null && keys.hasMoreElements()) {
String key = keys.nextElement();
XQuery xQuery = xqueriesHashtable.get(key);
String xQueryCode = xQuery.getCode();
try {
String xQueryResult = xQueryEvaluator.evaluateAsString(srcUrl, xQueryCode);
xQuery.setResult(xQueryResult);
} catch (Exception e) {
// nothing
}
}
}
return mdRecord;
}
private MetadataRecord getMetadataRecordTei(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/*:TEI/*:teiHeader");
if (metadataXmlStr != null) {
String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:idno");
if (identifier != null)
identifier = StringUtils.deresolveXmlEntities(identifier);
String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:titleStmt/*:author");
if (creator != null)
creator = StringUtils.deresolveXmlEntities(creator);
String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:titleStmt/*:title");
if (title != null)
title = StringUtils.deresolveXmlEntities(title);
String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:profileDesc/*:langUsage/*:language[1]/@ident)");
if (language != null && language.isEmpty())
language = null;
if (language != null) {
language = StringUtils.deresolveXmlEntities(language);
language = Language.getInstance().getISO639Code(language);
}
String place = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:pubPlace");
if (place != null)
place = StringUtils.deresolveXmlEntities(place);
String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:date");
Date date = null;
if (yearStr != null && ! yearStr.equals("")) {
yearStr = StringUtils.deresolveXmlEntities(yearStr);
yearStr = new Util().toYearStr(yearStr); // test if possible etc
if (yearStr != null) {
try {
date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z");
} catch (Exception e) {
// nothing
}
}
}
String subject = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:profileDesc/*:textClass/*:keywords/*:term)");
if (subject != null)
subject = StringUtils.deresolveXmlEntities(subject);
String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "/*:teiHeader/*:fileDesc/*:publicationStmt/*:availability");
if (rights == null)
rights = "open access";
rights = StringUtils.deresolveXmlEntities(rights);
String license = "http://echo.mpiwg-berlin.mpg.de/policy/oa_basics/declaration";
String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/*:teiHeader/*:fileDesc/*:publicationStmt/*:availability/@status)");
if (accessRights == null)
accessRights = "free";
accessRights = StringUtils.deresolveXmlEntities(accessRights);
mdRecord.setIdentifier(identifier);
mdRecord.setLanguage(language);
mdRecord.setCreator(creator);
mdRecord.setTitle(title);
mdRecord.setPublisher(place);
mdRecord.setRights(rights);
mdRecord.setDate(date);
mdRecord.setSubject(subject);
mdRecord.setLicense(license);
mdRecord.setAccessRights(accessRights);
}
String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//*:pb)");
int pageCount = Integer.valueOf(pageCountStr);
mdRecord.setPageCount(pageCount);
mdRecord.setSchemaName("TEI");
return mdRecord;
}
private MetadataRecord getMetadataRecordHtml(XQueryEvaluator xQueryEvaluator, URL srcUrl, MetadataRecord mdRecord) throws ApplicationException {
String metadataXmlStr = xQueryEvaluator.evaluateAsString(srcUrl, "/html/head");
if (metadataXmlStr != null) {
String identifier = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.identifier']/@content)");
if (identifier != null && ! identifier.isEmpty())
identifier = StringUtils.deresolveXmlEntities(identifier);
String creator = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.creator']/@content)");
if (creator != null && ! creator.isEmpty())
creator = StringUtils.deresolveXmlEntities(creator);
String title = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.title']/@content)");
if (title != null && ! title.isEmpty())
title = StringUtils.deresolveXmlEntities(title);
String language = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.language']/@content)");
if (language != null && language.isEmpty())
language = null;
if (language != null && ! language.isEmpty()) {
language = StringUtils.deresolveXmlEntities(language);
language = Language.getInstance().getISO639Code(language);
}
String publisher = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.publisher']/@content)");
if (publisher != null)
publisher = StringUtils.deresolveXmlEntities(publisher);
String yearStr = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.date']/@content)");
Date date = null;
if (yearStr != null && ! yearStr.equals("")) {
yearStr = StringUtils.deresolveXmlEntities(yearStr);
yearStr = new Util().toYearStr(yearStr); // test if possible etc
if (yearStr != null) {
try {
date = new Util().toDate(yearStr + "-01-01T00:00:00.000Z");
} catch (Exception e) {
// nothing
}
}
}
String subject = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.subject']/@content)");
if (subject != null)
subject = StringUtils.deresolveXmlEntities(subject);
String rights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.rights']/@content)");
if (rights != null && ! rights.isEmpty())
rights = StringUtils.deresolveXmlEntities(rights);
String license = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.license']/@content)");
if (license != null && ! license.isEmpty())
license = StringUtils.deresolveXmlEntities(license);
String accessRights = xQueryEvaluator.evaluateAsStringValueJoined(metadataXmlStr, "string(/meta[@name = 'DC.accessRights']/@content)");
if (accessRights != null && ! accessRights.isEmpty())
accessRights = StringUtils.deresolveXmlEntities(accessRights);
mdRecord.setIdentifier(identifier);
mdRecord.setLanguage(language);
mdRecord.setCreator(creator);
mdRecord.setTitle(title);
mdRecord.setPublisher(publisher);
mdRecord.setRights(rights);
mdRecord.setDate(date);
mdRecord.setSubject(subject);
mdRecord.setLicense(license);
mdRecord.setAccessRights(accessRights);
}
String pageCountStr = xQueryEvaluator.evaluateAsString(srcUrl, "count(//pb)");
int pageCount = Integer.valueOf(pageCountStr);
mdRecord.setPageCount(pageCount);
mdRecord.setSchemaName("html");
return mdRecord;
}
private String getPersons(String tocString, XQueryEvaluator xQueryEvaluator) throws ApplicationException {
String persons = xQueryEvaluator.evaluateAsStringValueJoined(tocString, "/list/list[@type='persons']/item[not(. = preceding::item)]", "###"); // [not(. = preceding::item)] removes duplicates
return persons;
}
private String getPlaces(String tocString, XQueryEvaluator xQueryEvaluator) throws ApplicationException {
String places = xQueryEvaluator.evaluateAsStringValueJoined(tocString, "/list/list[@type='places']/item[not(. = preceding::item)]", "###");
return places;
}
private String getNodeType(XdmNode node) {
String nodeType = null;
XdmSequenceIterator iter = node.axisIterator(Axis.CHILD);
if (iter != null) {
while (iter.hasNext()) {
XdmNode firstChild = (XdmNode) iter.next();
if (firstChild != null) {
XdmNodeKind nodeKind = firstChild.getNodeKind();
if (nodeKind.ordinal() == XdmNodeKind.ELEMENT.ordinal()) {
QName nodeQName = firstChild.getNodeName();
nodeType = nodeQName.getLocalName();
}
}
}
}
return nodeType;
}
public String getDocFullFileName(String docId) {
if (docId == null || docId.trim().isEmpty())
return null;
String docDir = getDocDir(docId);
String docFileName = getDocFileName(docId);
String docFullFileName = docDir + "/" + docFileName;
return docFullFileName;
}
public boolean isDocXml(String docId) {
boolean isXml = false;
String fileExt = getDocFileExtension(docId);
if (fileExt != null) {
fileExt = fileExt.toLowerCase();
if (fileExt.equals("xml") || fileExt.equals("html") || fileExt.equals("xhtml"))
isXml = true;
}
return isXml;
}
private String getMimeType(String docId) {
String mimeType = null;
String fileName = getDocFileName(docId);
if (fileName != null) {
fileName = fileName.toLowerCase();
FileNameMap fileNameMap = URLConnection.getFileNameMap();
mimeType = fileNameMap.getContentTypeFor(fileName);
}
return mimeType;
}
public String getDocDir(String docId) {
if (docId == null || docId.trim().isEmpty())
return null;
String documentsDirectory = Constants.getInstance().getDocumentsDir();
String docDir = documentsDirectory;
String docFileName = getDocFileName(docId);
String docFilePath = getDocFilePath(docId);
if (docFilePath != null)
docDir = docDir + docFilePath;
if (docFileName != null)
docDir = docDir + "/" + docFileName;
else
docDir = docDir + "/" + "XXXXX";
return docDir;
}
private String getDocFileName(String docId) {
String docFileName = docId.trim();
int index = docId.lastIndexOf("/");
if (index != -1) {
docFileName = docId.substring(index + 1);
}
return docFileName;
}
private String getDocFileExtension(String docId) {
docId = docId.trim();
String fileExt = null;
int index = docId.lastIndexOf(".");
if (index != -1) {
fileExt = docId.substring(index + 1);
}
return fileExt;
}
private String getDocFilePath(String docId) {
String docFilePath = null;
int index = docId.lastIndexOf("/");
if (index >= 0) {
docFilePath = docId.substring(0, index);
}
if (docFilePath != null && ! docFilePath.startsWith("/"))
docFilePath = docFilePath + "/";
return docFilePath;
}
private Hashtable<Integer, StringBuilder> getFragments(String fileName, String milestoneElementName) throws ApplicationException {
try {
GetFragmentsContentHandler getFragmentsContentHandler = new GetFragmentsContentHandler(milestoneElementName);
XMLReader xmlParser = new SAXParser();
xmlParser.setContentHandler(getFragmentsContentHandler);
InputSource inputSource = new InputSource(fileName);
xmlParser.parse(inputSource);
Hashtable<Integer, StringBuilder> resultFragments = getFragmentsContentHandler.getResultPages();
return resultFragments;
} catch (SAXException e) {
throw new ApplicationException(e);
} catch (IOException e) {
throw new ApplicationException(e);
}
}
private String tokenizeWithLemmas(String xmlStr, String language) throws ApplicationException {
StringReader strReader = new StringReader(xmlStr);
XmlTokenizer xmlTokenizer = new XmlTokenizer(strReader);
xmlTokenizer.setLanguage(language);
String[] outputOptionsWithLemmas = {"withLemmas"}; // so all tokens are fetched with lemmas (costs performance)
String[] nwbElements = {"lb", "br", "cb"}; // non word breaking elements // TODO: "hi" cause bug
xmlTokenizer.setNWBElements(nwbElements);
xmlTokenizer.setOutputOptions(outputOptionsWithLemmas);
xmlTokenizer.tokenize();
String retStr = xmlTokenizer.getXmlResult();
return retStr;
}
}