package org.rr.jeborker.converter;

import static org.rr.commons.utils.StringUtil.EMPTY;
import static org.rr.commons.utils.StringUtil.isNotEmpty;
import static org.rr.commons.utils.StringUtil.replace;

import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.rr.commons.io.PrintWriterFilter;
import org.rr.commons.log.LoggerFactory;
import org.rr.commons.mufs.IResourceHandler;
import org.rr.commons.mufs.ResourceHandlerFactory;
import org.rr.commons.utils.BooleanUtils;
import org.rr.commons.utils.MathUtils;
import org.rr.jeborker.app.JeboorkerConstants;
import org.rr.jeborker.app.JeboorkerConstants.SUPPORTED_MIMES;
import org.rr.jeborker.app.preferences.APreferenceStore;
import org.rr.jeborker.app.preferences.PreferenceStoreFactory;
import org.rr.jeborker.gui.ConverterPreferenceController;
import org.rr.jeborker.gui.MainController;
import org.rr.jeborker.metadata.pdf.PDFUtils;

import com.itextpdf.text.Document;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;

/**
 * Converts a PDF resource into a plain text resource by extracting the text of each page.
 * <p>
 * The user chooses the extraction mode and two optional clean-up steps (removal of page numbers and removal of
 * end-of-line hyphenation) in the converter preference dialog. The chosen values are written to the preference
 * store after a successful conversion and are used as the dialog defaults the next time.
 */
public class PdfToTxtConverter implements IEBookConverter {

	private static final String EXTRACTION_MODE_LABEL = Bundle.getString("PdfToTxtConverter.extractionMode.label");

	private static final String SIMPLE_TEXT_EXTRACTION = Bundle.getString("PdfToTxtConverter.extractionMode.simple");

	private static final String LOCATION_BASED_TEXT_EXTRACTION = Bundle.getString("PdfToTxtConverter.extractionMode.location");

	private static final String REMOVE_PAGE_NUMBERS_LABEL = Bundle.getString("PdfToTxtConverter.removePageNumbers.label");

	private static final String REMOVE_HYPHEN_LABEL = Bundle.getString("PdfToTxtConverter.removeHyphen.label");

	/** Matches one complete run of digits, so a number is always handled as a whole token. */
	private static final Pattern NUMBER_PATTERN = Pattern.compile("\\d+");

	/** Matches a hyphen followed by a line break, including the white space around the break. */
	private static final Pattern HYPHENATED_LINE_BREAK_PATTERN = Pattern.compile("-\\s*\\n\\s*");

	private final APreferenceStore preferenceStore = PreferenceStoreFactory.getPreferenceStore(PreferenceStoreFactory.DB_STORE);

	private ConverterPreferenceController converterPreferenceController;

	private final IResourceHandler pdfResource;

	PdfToTxtConverter(IResourceHandler pdfSource) {
		this.pdfResource = pdfSource;
	}

	/**
	 * Extracts the text of the PDF into a new, uniquely named "txt" resource derived from the source resource.
	 *
	 * @return the resource the text was written to, or <code>null</code> if the preference dialog was not confirmed.
	 * @throws IOException if reading the PDF or writing the text fails. Any other failure raised during the
	 *         conversion is wrapped into an {@link IOException} that carries the original cause.
	 */
	@Override
	public IResourceHandler convert() throws IOException {
		ConverterPreferenceController converterPreferenceDialog = getConverterPreferenceController();
		if (!converterPreferenceDialog.isConfirmed()) {
			return null;
		}

		IResourceHandler targetTxtResource = ResourceHandlerFactory.getUniqueResourceHandler(this.pdfResource, "txt");
		PdfReader reader = null;
		try (OutputStream txtOutputStream = targetTxtResource.getContentOutputStream(false)) {
			reader = PDFUtils.getReader(this.pdfResource.toFile());

			// NOTE(review): the PrintWriter encodes with the platform default charset - confirm that this is intended.
			PrintWriterFilter printWriter = new PrintWriterFilter(new PrintWriter(txtOutputStream),
					PrintWriterFilter.getAcceptAllLineFilter());
			if (isRemovePageNumbersEnabled()) {
				printWriter = createNumberFilterPrintWriter(printWriter);
			}
			if (isRemoveHyphenEnabled()) {
				printWriter = createHyphenRemovePrintWriter(printWriter);
			}

			extractTextFromPdf(reader, printWriter);
			// the stream is closed by try-with-resources, so the writer has to be flushed before leaving the block
			printWriter.flush();
		} catch (IOException e) {
			throw e;
		} catch (Throwable e) {
			throw new IOException("Failed to convert PDF " + pdfResource.getName(), e);
		} finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (Exception e) {
					LoggerFactory.getLogger().log(Level.WARNING, "Failed to close pdf reader", e);
				}
			}
		}

		// only reached after a successful conversion
		storeCheckboxValue(REMOVE_PAGE_NUMBERS_LABEL, isRemovePageNumbersEnabled());
		storeCheckboxValue(REMOVE_HYPHEN_LABEL, isRemoveHyphenEnabled());
		storeComboboxValue(EXTRACTION_MODE_LABEL, getExtractionMode());

		return targetTxtResource;
	}

	/**
	 * Wraps the given writer with a filter that removes numbers which look like the number of the current page.
	 *
	 * @param printWriter the writer receiving the filtered text.
	 * @return a new writer which applies {@link #removePageNumbers(String, int)} to the text it gets.
	 */
	private PrintWriterFilter createNumberFilterPrintWriter(PrintWriterFilter printWriter) {
		return new PrintWriterFilter(printWriter, new PrintWriterFilter.LineFilter() {

			@Override
			public String filter(String text, int page) {
				return removePageNumbers(text, page);
			}
		});
	}

	/**
	 * Removes each standalone number from the text which is greater than zero and which
	 * <code>MathUtils.between(number, page - 1, page + 1)</code> accepts.
	 * <p>
	 * Only the matching digit run itself is removed. Digits which are part of a larger number (for example the
	 * "1" in "2019" on page 1) are left untouched and a zero padded number like "007" is removed as a whole.
	 *
	 * @param text the text to be cleaned. May be <code>null</code> or empty, in which case it is returned as it is.
	 * @param page the number of the page the text belongs to.
	 * @return the text without the numbers which are taken for page numbers.
	 */
	private static String removePageNumbers(String text, int page) {
		if (text == null || text.isEmpty()) {
			return text;
		}

		Matcher matcher = NUMBER_PATTERN.matcher(text);
		StringBuffer cleaned = new StringBuffer(text.length());
		while (matcher.find()) {
			String digits = matcher.group();
			// NumberUtils.toInt gives 0 for values that are no valid int (e.g. too many digits), these are kept
			Integer number = NumberUtils.toInt(digits);
			boolean pageNumber = number > 0 && MathUtils.between(number, page - 1, page + 1);
			matcher.appendReplacement(cleaned, pageNumber ? EMPTY : Matcher.quoteReplacement(digits));
		}
		matcher.appendTail(cleaned);
		return cleaned.toString();
	}

	/**
	 * Wraps the given writer with a filter that joins words which were hyphenated at the end of a line. The hyphen,
	 * the line break and the white space around the line break are removed.
	 *
	 * @param printWriter the writer receiving the filtered text.
	 * @return a new writer which removes hyphenated line breaks from the text it gets.
	 */
	private PrintWriterFilter createHyphenRemovePrintWriter(PrintWriterFilter printWriter) {
		return new PrintWriterFilter(printWriter, new PrintWriterFilter.LineFilter() {

			@Override
			public String filter(String text, int page) {
				if (text == null) {
					return null;
				}
				return HYPHENATED_LINE_BREAK_PATTERN.matcher(text).replaceAll("");
			}
		});
	}

	/** @return the state of the "remove page numbers" check box of the preference dialog. */
	private boolean isRemovePageNumbersEnabled() {
		return getConverterPreferenceController().getCommonValueAsBoolean(REMOVE_PAGE_NUMBERS_LABEL);
	}

	/** @return the state of the "remove hyphen" check box of the preference dialog. */
	private boolean isRemoveHyphenEnabled() {
		return getConverterPreferenceController().getCommonValueAsBoolean(REMOVE_HYPHEN_LABEL);
	}

	/** @return the extraction mode selected in the preference dialog. */
	private String getExtractionMode() {
		return getConverterPreferenceController().getCommonValueAsString(EXTRACTION_MODE_LABEL);
	}

	/**
	 * Extracts the text page by page and hands each page's text together with its 1-based page number to the writer.
	 *
	 * @param reader the reader for the PDF to be extracted.
	 * @param out the writer taking the extracted text.
	 * @throws IOException if the text of a page could not be extracted.
	 */
	private void extractTextFromPdf(PdfReader reader, PrintWriterFilter out) throws IOException {
		int pageCount = reader.getNumberOfPages();
		for (int page = 1; page <= pageCount; page++) {
			String textFromPage = PdfTextExtractor.getTextFromPage(reader, page, getExtractionStrategy());
			out.println(textFromPage, page);
		}
	}

	/**
	 * Creates the extraction strategy matching the extraction mode selected in the preference dialog. Each call
	 * creates a new strategy instance, so every page is extracted with its own instance.
	 *
	 * @return a new strategy instance.
	 * @throws IllegalStateException if the selected mode is none of the known modes.
	 */
	private TextExtractionStrategy getExtractionStrategy() {
		String extractionMode = getExtractionMode();
		if (StringUtils.equals(extractionMode, SIMPLE_TEXT_EXTRACTION)) {
			return new SimpleTextExtractionStrategy();
		}
		if (StringUtils.equals(extractionMode, LOCATION_BASED_TEXT_EXTRACTION)) {
			return new LocationTextExtractionStrategy();
		}
		throw new IllegalStateException("Undefined extraction strategy.");
	}

	/**
	 * Gets the {@link ConverterPreferenceController} for this instance. A new {@link ConverterPreferenceController}
	 * is created if none has been created or set before. The preference dialog is shown if the controller reports
	 * that it has not been shown yet.
	 *
	 * @see #createConverterPreferenceController()
	 */
	private ConverterPreferenceController getConverterPreferenceController() {
		if (this.converterPreferenceController == null) {
			this.converterPreferenceController = this.createConverterPreferenceController();
		}

		if (!this.converterPreferenceController.hasShown()) {
			this.converterPreferenceController.showPreferenceDialog();
		}

		return this.converterPreferenceController;
	}

	@Override
	public SUPPORTED_MIMES getConversionSourceType() {
		return JeboorkerConstants.SUPPORTED_MIMES.MIME_PDF;
	}

	@Override
	public SUPPORTED_MIMES getConversionTargetType() {
		return JeboorkerConstants.SUPPORTED_MIMES.MIME_TXT;
	}

	/**
	 * Sets the controller to be used by this converter instead of creating a new one.
	 *
	 * @param controller the controller providing the conversion preferences.
	 */
	public void setConverterPreferenceController(ConverterPreferenceController controller) {
		this.converterPreferenceController = controller;
	}

	/**
	 * Fetches the preference controller from the {@link MainController} and adds the entries of this converter to
	 * it: the extraction mode selection and the check boxes for removing page numbers and hyphens. The initial
	 * values are read from the preference store.
	 *
	 * @return the configured controller.
	 */
	public ConverterPreferenceController createConverterPreferenceController() {
		ConverterPreferenceController controller = MainController.getController().getConverterPreferenceController();
		controller.setShowLandscapePageEntries(false);
		controller.addCommonListSelection(EXTRACTION_MODE_LABEL,
				Arrays.asList(LOCATION_BASED_TEXT_EXTRACTION, SIMPLE_TEXT_EXTRACTION),
				getRestoredComboboxValue(EXTRACTION_MODE_LABEL, LOCATION_BASED_TEXT_EXTRACTION));
		controller.addCommonCheckBox(REMOVE_PAGE_NUMBERS_LABEL, getRestoredCheckboxValue(REMOVE_PAGE_NUMBERS_LABEL));
		controller.addCommonCheckBox(REMOVE_HYPHEN_LABEL, getRestoredCheckboxValue(REMOVE_HYPHEN_LABEL));
		return controller;
	}

	/** Reads the stored combo box value for the given key or gives the default value if nothing is stored. */
	private String getRestoredComboboxValue(String key, String defaultValue) {
		return preferenceStore.getGenericEntryAsString(key, defaultValue);
	}

	/** Writes the combo box value for the given key to the preference store. */
	private void storeComboboxValue(String key, String value) {
		preferenceStore.addGenericEntryAsString(key, value);
	}

	/** Reads the stored check box value for the given key. The check box is selected if nothing is stored. */
	private Boolean getRestoredCheckboxValue(String key) {
		return BooleanUtils.toBoolean(preferenceStore.getGenericEntryAsString(key, Boolean.TRUE.toString()));
	}

	/** Writes the check box value for the given key as string to the preference store. */
	private void storeCheckboxValue(String key, Boolean value) {
		preferenceStore.addGenericEntryAsString(key, value.toString());
	}
}