/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2009 Arno Peters
2015 Aaron Madlon-Kay
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.filters2.pdf;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.omegat.filters2.AbstractFilter;
import org.omegat.filters2.FilterContext;
import org.omegat.filters2.Instance;
import org.omegat.filters2.TranslationException;
import org.omegat.util.OStrings;
/**
 * PDF input filter.
 * <p>
 * Extracts the plain text of a PDF document with PDFBox's
 * {@link PDFTextStripper} and feeds it to the translation pipeline one
 * paragraph at a time, where a paragraph is a run of consecutive non-blank
 * lines. The default target file name pattern appends {@code .txt}, so the
 * output is text rather than a regenerated PDF.
 *
 * @author Arno Peters
 * @author Aaron Madlon-Kay
 */
public class PdfFilter extends AbstractFilter {

    /** Logger used for all diagnostics emitted by this filter. */
    private static final Logger LOGGER = Logger.getLogger(PdfFilter.class.getName());

    /** Matches a line that is empty or contains only whitespace; such a line ends a paragraph. */
    private static final Pattern LINEBREAK_PATTERN = Pattern.compile("^\\s*?$");

    /**
     * Returns the display name of this file format, looked up from the
     * resource bundle under the key {@code PDFFILTER_FILTER_NAME}.
     */
    @Override
    public String getFileFormatName() {
        return OStrings.getString("PDFFILTER_FILTER_NAME");
    }

    /**
     * Returns the single default instance of this filter: source files
     * matching {@code *.pdf}, no explicit source or target encoding, and a
     * target file name pattern of {@code TFP_NAMEONLY + ".txt"}.
     */
    @Override
    public Instance[] getDefaultInstances() {
        // NOTE(review): TFP_NAMEONLY is inherited; presumably it expands to the
        // source file name without its extension - confirm against AbstractFilter.
        return new Instance[] { new Instance("*.pdf", null, null, TFP_NAMEONLY + ".txt") };
    }

    /**
     * Reports that the source encoding is not selectable for this filter.
     *
     * @return always {@code false}
     */
    @Override
    public boolean isSourceEncodingVariable() {
        return false;
    }

    /**
     * Reports that the target encoding is selectable for this filter.
     *
     * @return always {@code true}
     */
    @Override
    public boolean isTargetEncodingVariable() {
        return true;
    }

    /**
     * Loads the PDF, extracts its entire text and returns a reader over that
     * text. The {@code encoding} argument is not used by this implementation.
     * <p>
     * The document is closed before this method returns; the returned reader
     * is backed by an in-memory string.
     *
     * @param infile
     *            the PDF file to read
     * @param encoding
     *            ignored
     * @return a reader over the extracted text, with lines separated by
     *         {@code "\n"}
     * @throws IOException
     *             if the document cannot be loaded or its text extracted
     * @throws TranslationException
     *             if a {@link NoClassDefFoundError} is raised while loading
     *             or extracting; the error is wrapped as the cause
     */
    @Override
    public BufferedReader createReader(File infile, String encoding)
            throws IOException, TranslationException {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setLineSeparator("\n");
        stripper.setSortByPosition(true);

        try (PDDocument document = PDDocument.load(infile)) {
            String text = stripper.getText(document);
            return new BufferedReader(new StringReader(text));
        } catch (NoClassDefFoundError ex) {
            // NOTE(review): judging by the message key, this presumably happens
            // when an encrypted PDF needs an optional library that is not on
            // the classpath - confirm.
            LOGGER.log(Level.WARNING, OStrings.getString("PDFFILTER_ENCRYPTED_FILE"), infile);
            throw new TranslationException(ex);
        }
    }

    /**
     * Splits the extracted text into paragraphs and passes each one through
     * {@code processEntry}, writing the returned string to {@code out}.
     * <p>
     * Consecutive non-blank lines are joined into one entry, each line
     * followed by a single space. A blank (empty or whitespace-only) line
     * flushes the accumulated entry, even when it is empty, and is followed
     * by {@code "\n\n"} in the output. Text remaining at end of input is
     * flushed and followed by {@code "\n"}.
     * <p>
     * An {@link IOException} raised while reading or writing is logged at
     * {@link Level#SEVERE} and processing stops; it is not propagated, which
     * keeps this method's signature free of checked exceptions.
     *
     * @param in
     *            reader over the extracted text
     * @param out
     *            writer receiving the processed text
     * @param fc
     *            filter context; not used by this implementation
     */
    @Override
    public void processFile(BufferedReader in, BufferedWriter out, FilterContext fc) {
        StringBuilder sb = new StringBuilder();
        try {
            String s;
            while ((s = in.readLine()) != null) {
                Matcher m = LINEBREAK_PATTERN.matcher(s);
                if (m.find()) {
                    // Blank line: the paragraph collected so far is complete.
                    out.write(processEntry(sb.toString()));
                    sb.setLength(0);
                    out.write("\n\n");
                } else {
                    sb.append(s);
                    sb.append(" ");
                }
            }
            // Flush a final paragraph that was not terminated by a blank line.
            if (sb.length() > 0) {
                out.write(processEntry(sb.toString()));
                sb.setLength(0);
                out.write("\n");
            }
        } catch (IOException e) {
            // Report through the logging framework rather than stderr so that
            // a truncated result does not go unnoticed.
            LOGGER.log(Level.SEVERE, "I/O error while processing extracted PDF text", e);
        }
    }
}