package org.bbaw.wsp.cms.dochandler.parser.text.parser;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.bbaw.wsp.cms.dochandler.parser.document.CharCodeManager;
import org.bbaw.wsp.cms.dochandler.parser.document.GeneralDocument;
import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
/**
 * Parses a DOC file. The class is a lazily created singleton; use
 * {@link #getInstance()} to obtain the shared instance.
 * <p>
 * Last change: Improved handling of footnotes. They are now marked by
 * superscripts and appended to the textOrig (fulltext). See
 * {@link OdfParserImpl}.
 *
 * @author Sascha Feldmann (wsp-shk1)
 * @date 20.09.2012
 *
 */
public class DocParserImpl extends ResourceParser {
  /**
   * UTF 16 REPLACEMENT CHARACTER (which TIKA uses to mark footnotes).
   */
  public static final char CODE_FOOTNOTE = 0xFFFD;
  /**
   * Exclusive upper bound for the number of lines since the last footnote
   * that the tolerance check in {@link #parse(String, String)} accepts.
   */
  private static final int FOOTNOTES_TOLERANCE = 2;
  /**
   * Matches a footnote entry: the footnote marker, a tab and then the
   * footnote text (group 1, up to the end of the line). Compiled once
   * instead of once per parsed line.
   */
  private static final Pattern FOOTNOTE_PATTERN = Pattern.compile(CODE_FOOTNOTE + "\t(.*)");
  private static DocParserImpl instance;

  /**
   * Return the only existing instance, creating it on first use. The instance
   * uses an Apache TIKA {@link OfficeParser}.
   * <p>
   * The lazy initialisation is not synchronized.
   *
   * @return the shared parser instance
   */
  public static DocParserImpl getInstance() {
    if (instance == null) {
      // Remember the new parser; otherwise every call would create another one.
      instance = new DocParserImpl();
    }
    return instance;
  }

  private DocParserImpl() {
    super(new OfficeParser());
  }

  /**
   * Parses the resource via the superclass and post-processes the footnotes
   * of the resulting text.
   * <p>
   * Lines in the second half of the text that contain
   * {@link #CODE_FOOTNOTE} followed by a tab are treated as footnote entries.
   * The text is cut off at the first such entry, each remaining
   * {@link #CODE_FOOTNOTE} marker is replaced (in order) by the superscript of
   * its footnote number, and the footnotes are appended at the end as
   * "superscript, space, footnote text", one per line.
   *
   * @see org.bbaw.wsp.cms.dochandler.parser.text.parser.ResourceParser#parse(java
   *      .lang.String, java.lang.String)
   *
   * @param startUri passed through to the superclass parser
   * @param uri passed through to the superclass parser
   * @return the parsed {@link GeneralDocument} with the rewritten textOrig
   * @throws ApplicationException if the superclass parser fails
   */
  public Object parse(final String startUri, final String uri) throws ApplicationException {
    final GeneralDocument doc = (GeneralDocument) super.parse(startUri, uri);
    final String parsedText = doc.getTextOrig();
    if (parsedText == null) {
      // No text available, so there are no footnotes to rearrange.
      return doc;
    }
    final String[] lines = parsedText.split("\n");
    final TreeMap<Integer, String> footnotes = new TreeMap<Integer, String>();
    // Receives the text without the footnote section.
    final StringBuilder body = new StringBuilder();
    // Number of non-footnote lines seen (in the second half) since the last footnote.
    int linesSinceLastNote = -1;
    int lineNumber = 0;
    int footnoteNumber = 0;
    for (final String line : lines) {
      // Offset at which the current line starts inside the body, if it gets appended.
      final int lineStart = body.length();
      if (footnoteNumber == 0) {
        // Lines are only kept until the first footnote has been found.
        body.append(line).append('\n');
      }
      lineNumber++;
      // Identify footnotes only after the first half of the document.
      if (lineNumber > (lines.length / 2)) {
        boolean match = false;
        for (final Matcher m = FOOTNOTE_PATTERN.matcher(line); m.find();) {
          match = true;
          linesSinceLastNote = 0;
          /*
           * Configure tolerance if conditions change (e.g. if the parser
           * separates footnotes by three empty lines instead of two).
           *
           * NOTE(review): the counter is reset right before this check, so the
           * condition is currently always true. Kept as is because the intended
           * tolerance rule is not clear from this class -- confirm.
           */
          if (linesSinceLastNote < FOOTNOTES_TOLERANCE) {
            footnoteNumber++;
            final String footnote = m.group(1);
            if (footnoteNumber == 1) {
              // Cut the body exactly where the first footnote starts on the
              // line that was just appended. Searching the whole text for the
              // footnote string could hit an identical earlier occurrence and
              // remove too much.
              body.setLength(lineStart + m.start());
            }
            footnotes.put(footnoteNumber, footnote);
          }
        }
        if (!match) {
          linesSinceLastNote++;
        }
      }
    }
    doc.setTextOrig(body.toString());
    // Find superscripts and replace them and concatenate to textOrig
    for (final int key : footnotes.keySet()) {
      final String superscript = CharCodeManager.returnNumberSuperscript(key);
      // quoteReplacement keeps '$' and '\' in the replacement literal.
      doc.setTextOrig(doc.getTextOrig().replaceFirst(CODE_FOOTNOTE + "", Matcher.quoteReplacement(superscript)));
      doc.setTextOrig(doc.getTextOrig().concat(superscript + " " + footnotes.get(key)) + "\n");
    }
    return doc;
  }
}