package org.jabref.logic.pdf;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import org.jabref.model.pdf.FileAnnotation;
import org.jabref.model.pdf.FileAnnotationType;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.util.PDFTextStripperByArea;
public class PdfAnnotationImporter implements AnnotationImporter {
private static final Log LOGGER = LogFactory.getLog(PdfAnnotationImporter.class);
/**
* Imports the comments from a pdf specified by its path
*
* @param path a path to a pdf
* @return a list with the all the annotations found in the file of the path
*/
@Override
public List<FileAnnotation> importAnnotations(final Path path) {
if (!validatePath(path)) {
// Path could not be validated, return default result
return Collections.emptyList();
}
List<FileAnnotation> annotationsList = new LinkedList<>();
try (PDDocument document = PDDocument.load(path.toString())) {
List pdfPages = document.getDocumentCatalog().getAllPages();
for (int pageIndex = 0; pageIndex < pdfPages.size(); pageIndex++) {
PDPage page = (PDPage) pdfPages.get(pageIndex);
for (PDAnnotation annotation : page.getAnnotations()) {
if (!isSupportedAnnotationType(annotation)) {
continue;
}
if (FileAnnotationType.UNDERLINE.toString().equals(annotation.getSubtype()) ||
FileAnnotationType.HIGHLIGHT.toString().equals(annotation.getSubtype())) {
annotationsList.add(createMarkedAnnotations(pageIndex, page, annotation));
} else {
FileAnnotation fileAnnotation = new FileAnnotation(annotation, pageIndex + 1);
if (fileAnnotation.getContent() != null && !fileAnnotation.getContent().isEmpty()) {
annotationsList.add(fileAnnotation);
}
}
}
}
} catch (IOException e) {
LOGGER.error(String.format("Failed to read file '%s'.", path), e);
}
return annotationsList;
}
private boolean isSupportedAnnotationType(PDAnnotation annotation) {
try {
if (!Arrays.asList(FileAnnotationType.values()).contains(FileAnnotationType.valueOf(annotation.getSubtype()))) {
return false;
}
} catch (IllegalArgumentException e) {
LOGGER.debug(String.format("Could not parse the FileAnnotation %s into any known FileAnnotationType. It was %s!", annotation, annotation.getSubtype()));
}
return true;
}
private FileAnnotation createMarkedAnnotations(int pageIndex, PDPage page, PDAnnotation annotation) {
FileAnnotation annotationBelongingToMarking = new FileAnnotation(
annotation.getDictionary().getString(COSName.T), FileAnnotation.extractModifiedTime(annotation.getModifiedDate()),
pageIndex + 1, annotation.getContents(), FileAnnotationType.valueOf(annotation.getSubtype().toUpperCase(Locale.ROOT)), Optional.empty());
try {
if (FileAnnotationType.HIGHLIGHT.toString().equals(annotation.getSubtype()) || FileAnnotationType.UNDERLINE.toString().equals(annotation.getSubtype())) {
annotation.setContents(extractMarkedText(page, annotation));
}
} catch (IOException e) {
annotation.setContents("JabRef: Could not extract any marked text!");
}
//Marked text that has a sticky note on it should be linked to the sticky note
return new FileAnnotation(annotation, pageIndex + 1, annotationBelongingToMarking);
}
private String extractMarkedText(PDPage page, PDAnnotation annotation) throws IOException {
//highlighted or underlined text has to be extracted by the rectangle calculated from the marking
PDFTextStripperByArea stripperByArea = new PDFTextStripperByArea();
COSArray quadsArray = (COSArray) annotation.getDictionary().getDictionaryObject(COSName.getPDFName("QuadPoints"));
String markedText = "";
for (int j = 1,
k = 0;
j <= (quadsArray.size() / 8);
j++) {
COSFloat upperLeftX = (COSFloat) quadsArray.get(k);
COSFloat upperLeftY = (COSFloat) quadsArray.get(1 + k);
COSFloat upperRightX = (COSFloat) quadsArray.get(2 + k);
COSFloat upperRightY = (COSFloat) quadsArray.get(3 + k);
COSFloat lowerLeftX = (COSFloat) quadsArray.get(4 + k);
COSFloat lowerLeftY = (COSFloat) quadsArray.get(5 + k);
k += 8;
float ulx = upperLeftX.floatValue() - 1;
float uly = upperLeftY.floatValue();
float width = upperRightX.floatValue() - lowerLeftX.floatValue();
float height = upperRightY.floatValue() - lowerLeftY.floatValue();
PDRectangle pageSize = page.getMediaBox();
uly = pageSize.getHeight() - uly;
Rectangle2D.Float rectangle = new Rectangle2D.Float(ulx, uly, width, height);
stripperByArea.addRegion("markedRegion", rectangle);
stripperByArea.extractRegions(page);
String markedTextInLine = stripperByArea.getTextForRegion("markedRegion");
if (j > 1) {
markedText = markedText.concat(markedTextInLine);
} else {
markedText = markedTextInLine;
}
}
return markedText.trim();
}
private boolean validatePath(Path path) {
Objects.requireNonNull(path);
if (!path.toString().toLowerCase(Locale.ROOT).endsWith(".pdf")) {
LOGGER.warn(String.format("File %s does not end with .pdf!", path));
return false;
}
if (!Files.exists(path)) {
LOGGER.warn(String.format("File %s does not exist!", path));
return false;
}
if (!Files.isRegularFile(path) || !Files.isReadable(path)) {
LOGGER.warn(String.format("File %s is not readable!", path));
return false;
}
return true;
}
}