package org.bbaw.wsp.cms.dochandler.parser.text.parser;

import java.util.List;

import org.bbaw.wsp.cms.dochandler.parser.document.GeneralDocument;
import org.bbaw.wsp.cms.dochandler.parser.document.IDocument;
import org.bbaw.wsp.cms.dochandler.parser.document.PdfDocument;

/**
 * Document-model strategy: builds a document model for each parsed document.
 * The created models are accessible through the {@link IDocument} interface.
 * The fulltext of paged documents is assembled with a {@link StringBuilder}.
 *
 * @author Sascha Feldmann (wsp-shk1)
 * @date 16.08.2012
 *
 */
public class DocumentModelStrategy {

  /**
   * Creates a {@link GeneralDocument} holding the URI and fulltext of a parsed
   * document of any kind.
   *
   * @param startURI
   *          - the URI where the parsing job was started (not used by this
   *          method).
   * @param uri
   *          - the URI of the parsed document.
   * @param text
   *          - the fetched fulltext as String.
   * @return the created {@link GeneralDocument}, referenced as an
   *         {@link IDocument}.
   */
  public Object generateDocumentModel(final String startURI, final String uri, final String text) {
    return (IDocument) new GeneralDocument(uri, text);
  }

  /**
   * Creates a {@link PdfDocument} for a parsed PDF document. The fulltext is
   * the concatenation of all pages, each one preceded by a marker line of the
   * form {@code [page=N]}, where N is the 1-based page number.
   *
   * @param startURI
   *          - the URI where the parsing job was started (not used by this
   *          method).
   * @param uri
   *          - the URI of the parsed document.
   * @param textPages
   *          - a list of String. Each entry represents a page of the parsed
   *          document.
   * @return the created {@link PdfDocument}, referenced as an
   *         {@link IDocument}. It is constructed with the URI, the assembled
   *         fulltext and the original page list.
   */
  public Object generateDocumentModel(final String startURI, final String uri, final List<String> textPages) {
    final StringBuilder fulltext = new StringBuilder();
    int pageNumber = 0;
    for (final String pageText : textPages) {
      pageNumber++;
      // Page marker first, then the page content; nothing is inserted after the content.
      fulltext.append("[page=").append(pageNumber).append("]\n").append(pageText);
    }
    return (IDocument) new PdfDocument(uri, fulltext.toString(), textPages);
  }
}