package org.bbaw.wsp.cms.test;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.Hashtable;
import java.util.List;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.URIUtil;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.Term;
import org.bbaw.wsp.cms.collections.Collection;
import org.bbaw.wsp.cms.collections.CollectionManager;
import org.bbaw.wsp.cms.collections.CollectionReader;
import org.bbaw.wsp.cms.dochandler.DocumentHandler;
import org.bbaw.wsp.cms.document.Hits;
import org.bbaw.wsp.cms.lucene.IndexHandler;
import org.bbaw.wsp.cms.scheduler.CmsDocOperation;
import org.bbaw.wsp.cms.transform.GetFragmentsContentHandler;
import org.bbaw.wsp.cms.transform.HighlightContentHandler;
import org.bbaw.wsp.cms.translator.GlosbeTranslator;
import org.bbaw.wsp.cms.translator.MicrosoftTranslator;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import com.memetix.mst.language.Language;
import com.sun.jndi.toolkit.url.Uri;
import com.sun.org.apache.xerces.internal.parsers.SAXParser;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.dict.db.LexHandler;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.WordContentHandler;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizer;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.XmlTokenizerContentHandler;
/**
 * Manual, developer-run harness for exercising pieces of the CMS (indexing,
 * tokenizing, highlighting, translation, collection handling).
 *
 * <p>It is not an automated test: {@link #main(String[])} initializes the
 * index handler, runs whichever checks are currently un-commented and then
 * releases the index handler again. Most checks use hard-coded local paths or
 * remote URLs and only make sense when stepped through in a debugger.
 */
public class TestLocal {
  private IndexHandler indexer;

  /**
   * Runs the currently enabled checks. Exceptions raised by a check are
   * printed, not propagated. The index handler is released in a
   * {@code finally} block so that a failing check does not leave it open.
   *
   * @param args ignored
   * @throws ApplicationException declared for compatibility; failures are printed instead
   */
  public static void main(String[] args) throws ApplicationException {
    TestLocal test = new TestLocal();
    try {
      test.init();
      // test.testXml();
      // test.tokenizeXmlFragment();
      // test.getFragments("/Users/jwillenborg/tmp/writeFragments/Benedetti_1585.xml");
      // File srcFile = new File("/Users/jwillenborg/mpdl/data/xml/documents/echo/la/Benedetti_1585/pages/page-13-morph.xml");
      // String page13 = FileUtils.readFileToString(srcFile, "utf-8");
      // NOTE: highlight(...) takes a sixth argument (language); the call below must be extended before re-enabling it.
      // test.highlight(page13, "s", 6, "reg", "relatiuum");
      // test.queries();
      // test.translator2();
      // test.testCollectionReader();
      // test.createAllDocuments();
      // test.testCalls();
      // test.bla();
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      // Always release the index handler, even when a check above failed.
      try {
        test.end();
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }

  /**
   * Obtains the {@link IndexHandler} used by the query checks.
   *
   * @throws ApplicationException if the index handler cannot be obtained
   */
  private void init() throws ApplicationException {
    indexer = IndexHandler.getInstance();
  }

  /**
   * Releases the index handler. Does nothing when {@link #init()} never
   * succeeded, so it is safe to call from a {@code finally} block.
   *
   * @throws ApplicationException if ending the index handler fails
   */
  private void end() throws ApplicationException {
    if (indexer != null) {
      indexer.end();
    }
  }

  /**
   * Scratch pad for string, regex and URI-encoding experiments. All results
   * are discarded; the locals exist only to be inspected in a debugger.
   */
  private void bla() {
    String blabla = "huhu###haha###";
    // NOTE(review): the separator "xxx" does not occur in the input - possibly "###" was intended; confirm.
    String[] places = blabla.split("xxx");
    String bla = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<html><bla></bla></html>";
    // Strip the XML declaration.
    bla = bla.replaceAll("<\\?xml.*?\\?>", "");
    bla = "<persName name=\"Friedrich Wilhelm III.\">König</persName>";
    // Reduce the element to the value of its name attribute.
    bla = bla.replaceAll("<persName name=\"(.+)\".*>", "$1");
    bla = "<persName name=\"Friedrich Wilhelm III.\">König</persName>";
    // Reduce the element to its text content.
    bla = bla.replaceAll("<persName.*?>(.*)</persName>", "$1");
    String b = "";
    String url = "http://bla.com/test?bla bla";
    try {
      String encodedUri = URIUtil.encodeQuery(url, "utf-8");
      String c = "";
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Tokenizes a local TEI document and fetches its {@code s} elements.
   * Exceptions are printed, not propagated. The file reader is closed in all
   * cases.
   *
   * @throws ApplicationException declared for compatibility; failures are printed instead
   */
  private void testXml() throws ApplicationException {
    FileReader docFileReader = null;
    try {
      File srcFile = new File("/Users/jwillenborg/mpdl/data/xml/documents/tei/de/dt-ptolemaeus-tei-merge2.xml");
      // NOTE(review): FileReader decodes with the platform default charset; confirm that matches the file's encoding.
      docFileReader = new FileReader(srcFile);
      XmlTokenizer docXmlTokenizer = new XmlTokenizer(docFileReader);
      docXmlTokenizer.setDocIdentifier("/tei/de/dt-ptolemaeus-tei-merge2.xml");
      docXmlTokenizer.tokenize();
      ArrayList<XmlTokenizerContentHandler.Element> elements = docXmlTokenizer.getElements("s");
      // No-op statement, presumably a debugger breakpoint anchor.
      String bla = "";
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      // closeQuietly is null-safe, so this also covers a failed open.
      IOUtils.closeQuietly(docFileReader);
    }
  }

  /**
   * Looks up the collections {@code "registres"} and {@code "AvH"} through
   * {@link CollectionReader}. The results are only held in locals.
   *
   * @throws ApplicationException if a collection lookup fails
   */
  private void testCollectionReader() throws ApplicationException {
    CollectionReader confReader = CollectionReader.getInstance();
    Collection registres = confReader.getCollection("registres");
    CollectionReader collReader = CollectionReader.getInstance();
    Collection avh = collReader.getCollection("AvH");
    // No-op statement, presumably a debugger breakpoint anchor.
    String bla = "";
  }

  /**
   * Triggers an update of one collection through {@link CollectionManager}.
   * The commented-out lines are alternative collections to update.
   *
   * @throws ApplicationException if the collection update fails
   */
  private void createAllDocuments() throws ApplicationException {
    CollectionManager collectionManager = CollectionManager.getInstance();
    // collectionManager.updateCollections();
    // collectionManager.updateCollection("test", true);
    // collectionManager.updateCollection("AvH", true);
    // collectionManager.updateCollection("registres", true);
    // collectionManager.updateCollection("mes", true);
    // collectionManager.updateCollection("MEGA", true);
    collectionManager.updateCollection("IG", true);
  }

  /**
   * Builds "create" operations for a number of sample documents. Every
   * {@code docHandler.doOperation(...)} call is currently commented out, so
   * no operation is executed here. At the end the morphology cache and the
   * lexicon handler are ended.
   *
   * @throws ApplicationException if building an operation or ending a handler fails
   */
  private void testCalls() throws ApplicationException {
    Date before = new Date();
    System.out.println("Indexing start: " + before.getTime());
    DocumentHandler docHandler = new DocumentHandler();

    String docIdAvhNew = "/test/Dok280E18xml.xml";
    String docSrcUrlStr = "http://telota.bbaw.de:8085/exist/rest/db/AvHBriefedition/Briefe/Dok280E18xml.xml";
    CmsDocOperation docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdAvhNew);
    docOperation.setCollectionNames("test");
    docOperation.setMainLanguage("deu");
    String[] elemNames = {"p", "s", "head"};
    docOperation.setElementNames(elemNames);
    // docHandler.doOperation(docOperation);

    String docIdMega = "/mega/docs/MEGA_A2_B001-01_ETX.xml";
    docSrcUrlStr = "http://telota.bbaw.de:8085/exist/rest/db/mega/docs/MEGA_A2_B001-01_ETX.xml";
    docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdMega);
    docOperation.setCollectionNames("mega");
    docOperation.setMainLanguage("deu");
    docOperation.setElementNames(elemNames);
    // docHandler.doOperation(docOperation);

    String docIdGoerz = "/tei/de/dt-ptolemaeus-tei-merge2.xml";
    docSrcUrlStr = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/getDoc?doc=" + docIdGoerz;
    docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdGoerz);
    // docHandler.doOperation(docOperation);

    String docIdBenedetti = "/echo/la/Benedetti_1585.xml";
    docSrcUrlStr = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/getDoc?doc=" + docIdBenedetti;
    docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdBenedetti);
    // docHandler.doOperation(docOperation);

    String docIdAdams = "/echo/de/Adams_1785_S7ECRGW8.xml";
    docSrcUrlStr = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/getDoc?doc=" + docIdAdams;
    docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdAdams);
    // docHandler.doOperation(docOperation);

    String docIdMonte = "/archimedes/la/monte_mecha_036_la_1577.xml";
    docSrcUrlStr = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/getDoc?doc=" + docIdMonte;
    docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdMonte);
    // docHandler.doOperation(docOperation);

    String docIdEinstein = "/diverse/de/Einst_Antwo_de_1912.xml";
    docSrcUrlStr = "http://mpdl-system.mpiwg-berlin.mpg.de:30060/mpdl/getDoc?doc=" + docIdEinstein;
    docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdEinstein);
    // docHandler.doOperation(docOperation);

    String docIdAvh = "/tei/de/Dok280E18xml.xml";
    docSrcUrlStr = "http://telota.bbaw.de:8085/exist/rest/db/AvHBriefedition/Briefe/Dok280E18xml.xml";
    docOperation = new CmsDocOperation("create", docSrcUrlStr, null, docIdAvh);
    // docHandler.doOperation(docOperation);

    // indexer.deleteDocument(docIdGoerz);
    // indexer.deleteDocument(docIdBenedetti);
    /*
    Date end = new Date();
    System.out.println("Indexing end: : " + end.getTime());
    String queryStr = "tokenOrig:tempore";
    System.out.println("Query: " + queryStr);
    // ArrayList<Document> docs = indexer.queryDocuments(queryField, queryStr);
    ArrayList<Document> docs = indexer.queryDocument(docIdMonte, queryStr);
    for (int i=0; i<docs.size(); i++) {
    Document doc = docs.get(i);
    Fieldable f = doc.getFieldable("docId");
    if (f != null) {
    String id = f.stringValue();
    System.out.print("<doc>" + id + "</doc>");
    }
    Fieldable fContent = doc.getFieldable("xmlContent");
    if (fContent != null) {
    String content = fContent.stringValue();
    System.out.print("<doc>" + content + "</doc>");
    }
    }
    System.out.println("");
    System.out.print("Browse documents: ");
    ArrayList<Term> terms = indexer.getTerms("docId", "", 1000);
    for (int i=0; i<terms.size(); i++) {
    Term term = terms.get(i);
    System.out.print(term + ", ");
    }
    System.out.print("Tokens in tokenOrig: ");
    ArrayList<Term> tokenTerms = indexer.getTerms("tokenOrig", "a", 100);
    for (int i=0; i<tokenTerms.size(); i++) {
    Term term = tokenTerms.get(i);
    System.out.print(term + ", ");
    }
    */
    MorphologyCache.getInstance().end();
    LexHandler.getInstance().end();
  }

  /**
   * Parses the given XML source with a {@link GetFragmentsContentHandler} and
   * returns the pages that handler collected.
   *
   * @param fileName system identifier of the XML source, passed to {@link InputSource}
   * @return the handler's result pages, keyed by an {@code Integer}
   * @throws ApplicationException wrapping any SAX or I/O failure
   */
  private Hashtable<Integer, StringBuilder> getFragments(String fileName) throws ApplicationException {
    try {
      GetFragmentsContentHandler getFragmentsContentHandler = new GetFragmentsContentHandler();
      XMLReader xmlParser = new SAXParser();
      xmlParser.setContentHandler(getFragmentsContentHandler);
      InputSource inputSource = new InputSource(fileName);
      xmlParser.parse(inputSource);
      Hashtable<Integer, StringBuilder> resultFragments = getFragmentsContentHandler.getResultPages();
      return resultFragments;
    } catch (SAXException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Downloads one page of a document as XML, tokenizes it as Latin with the
   * {@code "withLemmas"} output option and {@code var} as stop element,
   * prints the tokenizer's XML result and returns it.
   *
   * @return the tokenizer's XML result
   * @throws ApplicationException wrapping any failure while downloading or tokenizing
   */
  private String tokenizeXmlFragment() throws ApplicationException {
    String result = null;
    InputStream in = null;
    try {
      // Alternative input: a local fragment file. It used to be read unconditionally and then
      // immediately overwritten by the download below, so it is disabled here.
      // String xmlFragment = new String(FileUtils.readFileToByteArray(new File("/Users/jwillenborg/tmp/testFragment2.xml")), "utf-8");
      String srcUrlStr = "http://mpdl-system.mpiwg-berlin.mpg.de/mpdl/page-query-result.xql?document=/echo/la/Benedetti_1585.xml&mode=pureXml&pn=13";
      URL srcUrl = new URL(srcUrlStr);
      in = new BufferedInputStream(srcUrl.openStream());
      String xmlFragment = IOUtils.toString(in, "utf-8");
      XmlTokenizer xmlTokenizer = new XmlTokenizer(new StringReader(xmlFragment));
      xmlTokenizer.setLanguage("lat");
      String[] stopElements = {"var"};
      // xmlTokenizer.setOutputFormat("string");
      String[] outputOptions = {"withLemmas"};
      xmlTokenizer.setOutputOptions(outputOptions);
      xmlTokenizer.setStopElements(stopElements);
      xmlTokenizer.tokenize();
      result = xmlTokenizer.getXmlResult();
      System.out.println(result);
    } catch (Exception e) {
      throw new ApplicationException(e);
    } finally {
      // Close the download stream on success and on failure; null-safe.
      IOUtils.closeQuietly(in);
    }
    return result;
  }

  /**
   * Runs the given XML string through a {@link WordContentHandler} created
   * with the mode {@code "norm"} and returns the handler's result.
   *
   * @param xmlStr XML text to parse
   * @return the content handler's result
   * @throws ApplicationException wrapping any SAX or I/O failure
   */
  private String normalizeWords(String xmlStr) throws ApplicationException {
    try {
      WordContentHandler wordContentHandler = new WordContentHandler("norm");
      XMLReader xmlParser = new SAXParser();
      xmlParser.setContentHandler(wordContentHandler);
      StringReader strReader = new StringReader(xmlStr);
      InputSource inputSource = new InputSource(strReader);
      xmlParser.parse(inputSource);
      String result = wordContentHandler.getResult();
      return result;
    } catch (SAXException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Normalizes the words of the given XML via {@link #normalizeWords(String)}
   * and then parses the result with a {@link HighlightContentHandler}
   * configured from the remaining arguments.
   *
   * @param xmlStr XML text to highlight
   * @param highlightElem passed through to the highlight handler
   * @param highlightElemPos passed through to the highlight handler
   * @param highlightQueryType passed through to the highlight handler
   * @param highlightQuery passed through to the highlight handler
   * @param language passed through to the highlight handler
   * @return the string form of the highlight handler's result
   * @throws ApplicationException wrapping any SAX or I/O failure
   */
  private String highlight(String xmlStr, String highlightElem, int highlightElemPos, String highlightQueryType, String highlightQuery, String language) throws ApplicationException {
    String result = null;
    try {
      xmlStr = normalizeWords(xmlStr);
      HighlightContentHandler highlightContentHandler = new HighlightContentHandler(highlightElem, highlightElemPos, highlightQueryType, highlightQuery, language);
      highlightContentHandler.setFirstPageBreakReachedMode(true);
      XMLReader xmlParser = new SAXParser();
      xmlParser.setContentHandler(highlightContentHandler);
      StringReader stringReader = new StringReader(xmlStr);
      InputSource inputSource = new InputSource(stringReader);
      xmlParser.parse(inputSource);
      result = highlightContentHandler.getResult().toString();
    } catch (SAXException e) {
      throw new ApplicationException(e);
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return result;
  }

  /**
   * Calls {@link MicrosoftTranslator#detectLanguageName} on a sample phrase.
   * The result is only held in a local.
   *
   * @throws ApplicationException wrapping any failure of the translator call
   */
  private void translator() throws ApplicationException {
    try {
      Language lang = Language.fromString("de");
      String languageCode = MicrosoftTranslator.detectLanguageName("café car");
      // No-op statement, presumably a debugger breakpoint anchor.
      String bla = "";
    } catch (Exception e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Calls {@link GlosbeTranslator} to translate two sample words into two
   * target languages and to detect the language of several sample words.
   * The results are only held in locals.
   *
   * @throws ApplicationException wrapping any failure of the translator calls
   */
  private void translator2() throws ApplicationException {
    try {
      String[] query = {"haus", "moor"};
      String[] translations = GlosbeTranslator.getInstance().translate(query, "deu", "eng");
      String[] translations2 = GlosbeTranslator.getInstance().translate(query, "deu", "fra");
      String lang = GlosbeTranslator.getInstance().detectLanguageCode("haus");
      lang = GlosbeTranslator.getInstance().detectLanguageCode("house");
      lang = GlosbeTranslator.getInstance().detectLanguageCode("maison");
      lang = GlosbeTranslator.getInstance().detectLanguageCode("ZZZZZZZZ");
      // No-op statement, presumably a debugger breakpoint anchor.
      String bla = "";
    } catch (Exception e) {
      throw new ApplicationException(e);
    }
  }

  /**
   * Runs a few sample queries against the index and, for every
   * {@code persName} hit of one document, reduces the hit's XML content to a
   * plain name string. The results are only held in locals.
   *
   * @throws ApplicationException if a query fails
   */
  private void queries() throws ApplicationException {
    Hits docsss = indexer.queryDocuments("tokenMorph:haus", null, "eng", 1, 10, true, true);
    String docId = "/mega/docs/MEGA_A2_B001-01_ETX.xml";
    // String query = "mod_date:[20020101 TO 20030101]";
    // String query = "tokenOrig:\"Haben beide\"~2";
    // String query = "tokenOrig:Habe~";
    // String query = "tokenOrig:\"Haben Sie beide\"";
    // String query = "+tokenMorph:gebrauchen +tokenMorph:schmutzig";
    // String query = "title:a*";
    // String query = "tokenMorph:gebrauchen AND tokenMorph:schmutzig";
    String query = "tokenMorph:wird";
    // Hits docsss = indexer.queryDocuments("tokenOrig:\"politischen Oekonomie\"", null, 1, 10, false, false);
    // Hits docsss = indexer.queryDocuments("tokenMorph:wird", null, 1, 10, true, false);
    Hits persHits = indexer.queryDocument("/mes/mes/data/MzE_7_2.xml", "elementName:persName", 0, 100);
    ArrayList<org.bbaw.wsp.cms.document.Document> namesList = persHits.getHits();
    for (org.bbaw.wsp.cms.document.Document nameDoc : namesList) {
      Fieldable docPersNameField = nameDoc.getFieldable("xmlContent");
      if (docPersNameField == null) {
        // Skip hits without stored XML content instead of failing with a NullPointerException.
        continue;
      }
      String docPersName = docPersNameField.stringValue();
      docPersName = docPersName.replaceAll("\\n", "");
      String persNameAttribute = docPersName;
      // Each check looks at the value as rewritten by the previous step, while each
      // replacement starts again from the original docPersName.
      if (persNameAttribute.contains("persName nymRef")) {
        persNameAttribute = docPersName.replaceAll("<persName nymRef=\"(.+?)\".+?</persName>", "$1");
      }
      if (persNameAttribute.contains("persName name=")) {
        persNameAttribute = docPersName.replaceAll("<persName name=\"(.+?)\".+?</persName>", "$1");
      }
      if (persNameAttribute.contains("persName key=")) {
        persNameAttribute = docPersName.replaceAll("<persName.*?>(.*?)</persName>", "$1");
      }
      // Fallback: unwrap any persName element that is still present.
      persNameAttribute = persNameAttribute.replaceAll("<persName.*?>(.*?)</persName>", "$1");
      persNameAttribute = persNameAttribute.trim();
    }
    // ArrayList<String> terms = indexer.fetchTerms(query, "de");
    Hits docs = indexer.queryDocument(docId, query, 1, 1000);
    // docs = indexer.queryDocument("/tei/de/Dok280E18xml.xml", "+elementName:persName +tokenOrig:alexander", 0, 1000);
    // No-op statement, presumably a debugger breakpoint anchor.
    String bla = "";
  }
}