package de.uni_koeln.ub.drc.reader;
import java.io.IOException;
import java.util.List;
import org.junit.Assert;
import org.junit.Test;
/**
 * Tests for the {@link PdfContentExtractor} class.
 * <p>
 * Every test inspects the {@link PageInfo} extracted from the sample PDF
 * referenced by {@code PDF_NAME}.
 */
public class PdfExtractionTests {

    /** Location of the sample PDF the tests extract from. */
    private static final String PDF_NAME = "res/tests/PPN345572629_0004 - 0007.pdf"; //$NON-NLS-1$

    /** Extraction result for the sample PDF, created with each test instance. */
    private final PageInfo pi = PdfContentExtractor.extractContentFromPdf(PDF_NAME);

    /**
     * Check word extraction and tokenization: no extracted word may be blank
     * or contain the Unicode replacement character (U+FFFD).
     *
     * @throws IOException
     *             On loading issues.
     */
    @Test
    public void words() throws IOException {
        System.out.println("PageInfo: " + pi); //$NON-NLS-1$
        for (Paragraph paragraph : pi.getParagraphs()) {
            List<ExtractedWord> words = paragraph.getWords();
            for (ExtractedWord word : words) {
                // Only build the (potentially long) message when the check
                // actually fails, not once per word.
                if (word.getText().trim().length() == 0) {
                    Assert.fail("There should be no empty words in: " //$NON-NLS-1$
                            + words);
                }
                // Written as an escape so the check does not depend on the
                // encoding this source file is compiled with.
                Assert.assertFalse("Encoding should be correct", //$NON-NLS-1$
                        word.getText().contains("\uFFFD")); //$NON-NLS-1$
            }
        }
    }

    /**
     * Count extracted text chunks: exactly two words of the sample page are
     * expected to contain the search string.
     */
    @Test
    public void findTextChunks() {
        String toFind = "pievel"; //$NON-NLS-1$
        int counts = 0;
        for (Paragraph para : pi.getParagraphs()) {
            for (ExtractedWord extractedWord : para.getWords()) {
                if (extractedWord.getText().contains(toFind)) {
                    counts++;
                }
            }
        }
        // assertEquals reports both the expected and the actual count.
        Assert.assertEquals(
                String.format("Number of words containing '%s'", toFind), //$NON-NLS-1$
                2, counts);
    }

    /**
     * Test font scaling: for every word of the second paragraph, the size
     * scaled to 1440 must exceed both the size scaled to 900 and the unscaled
     * size.
     */
    @Test
    public void fontSizeScaling() {
        List<ExtractedWord> words = pi.getParagraphs().get(1).getWords();
        for (ExtractedWord extractedWord : words) {
            int fontSize1 = extractedWord.getFontSizeScaled(1440);
            int fontSize2 = extractedWord.getFontSizeScaled(900);
            Assert.assertTrue(String.format(
                    "Font size %s should be larger than %s", fontSize1, //$NON-NLS-1$
                    fontSize2), fontSize1 > fontSize2);
            Assert.assertTrue(
                    String.format(
                            "Font size %s should be larger than unscaled size %s (different measure)", //$NON-NLS-1$
                            fontSize1, extractedWord.getFontSize()),
                    fontSize1 > extractedWord.getFontSize());
        }
    }

    /**
     * Test paragraph detection: the sample page must yield four paragraphs,
     * the first of which starts with the expected word.
     */
    @Test
    public void paragraphs() {
        List<Paragraph> paragraphs = pi.getParagraphs();
        // Check the count before indexing, so a wrong or empty result fails
        // with a readable assertion message.
        Assert.assertEquals("Number of detected paragraphs", //$NON-NLS-1$
                4, paragraphs.size());
        ExtractedWord firstWord = paragraphs.get(0).getWords().get(0);
        Assert.assertTrue("Unexpected first word: " //$NON-NLS-1$
                + firstWord.getText(),
                firstWord.getText().startsWith("DANiEL")); //$NON-NLS-1$
    }

    /**
     * Test coordinates scaling: the start point of the first word of the
     * second paragraph, scaled with the arguments 900 and 1440, must match
     * the expected point.
     */
    @Test
    public void point() {
        ExtractedWord firstWord = pi.getParagraphs().get(1).getWords().get(0);
        // Make sure the expected word is under test before comparing
        // its coordinates.
        Assert.assertTrue("Unexpected first word: " //$NON-NLS-1$
                + firstWord.getText(),
                firstWord.getText().startsWith("(Abgedruckt")); //$NON-NLS-1$
        Point expected = new Point(192, 564);
        Assert.assertEquals(expected, firstWord.getStartPointScaled(900, 1440));
    }
}