/* * ModeShape (http://www.modeshape.org) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.modeshape.extractor.tika; import static org.hamcrest.core.Is.is; import static org.hamcrest.core.IsNull.notNullValue; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.Collections; import java.util.LinkedList; import java.util.List; import java.util.NoSuchElementException; import java.util.Random; import java.util.TreeSet; import org.apache.tika.mime.MediaType; import org.junit.Assert; import org.junit.Before; import org.junit.Ignore; import org.junit.Test; import org.modeshape.common.FixFor; import org.modeshape.common.util.FileUtil; import org.modeshape.common.util.IoUtil; import org.modeshape.jcr.InMemoryTestBinary; import org.modeshape.jcr.LocalEnvironment; import org.modeshape.jcr.mimetype.tika.TikaContentDetector; import org.modeshape.jcr.mimetype.MimeTypeDetector; import org.modeshape.jcr.text.TextExtractorContext; import org.modeshape.jcr.text.TextExtractorOutput; /** * Unit test for {@link TikaTextExtractor} */ public class TikaTextExtractorTest { private static final MimeTypeDetector DETECTOR = new TikaContentDetector(new LocalEnvironment()); private static final int DEFAULT_TIKA_WRITE_LIMIT = 100000; private static final String CHARS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; private static final Random RANDOM = new Random(); private TikaTextExtractor extractor; private LinkedList<String> extracted = null; private LinkedList<String> expected = null; @Before public void beforeEach() { extractor = new TikaTextExtractor(); extractor.initialize(); extracted = new LinkedList<String>(); expected = new LinkedList<String>(); } @Test public void shouldHavePredefinedMimeTypesByDefault() { assertThat(extractor.getIncludedMediaTypes().isEmpty(), is(true)); Assert.assertEquals(new TreeSet<MediaType>(TikaTextExtractor.DEFAULT_EXCLUDED_MIME_TYPES), new TreeSet<MediaType>(extractor.getExcludedMediaTypes())); assertFalse(extractor.getParserSupportedMediaTypes().isEmpty()); } @Test public void shouldSupportExtractingFromTextFiles() throws Exception { assertThat(extractor.supportsMimeType("text/plain"), is(true)); } @Test public void shouldSupportExtractingFromPdfFiles() throws Exception { assertThat(extractor.supportsMimeType("application/pdf"), is(true)); } @Test public void shouldNotSupportExtractingFromPostscriptFiles() throws Exception { assertThat(extractor.supportsMimeType("application/postscript"), is(false)); } @Test public void shouldSupportExtractingFromDocWordFiles() throws Exception { assertThat(extractor.supportsMimeType("application/msword"), is(true)); } @Test public void shouldSupportExtractingFromDocxWordFiles() throws Exception { assertThat(extractor.supportsMimeType("application/vnd.openxmlformats-officedocument.wordprocessingml.document"), is(true)); } @Test public void shouldExtractTextFromTextFile1() throws Exception { extractTermsFrom("modeshape.txt"); loadExpectedFrom("modeshape.txt"); extractedShouldHave(remainingExpectedTerms()); } @Test public void shouldExtractTextFromTextFile2() throws Exception { extractTermsFrom("text-file.txt"); loadExpectedFrom("text-file.txt"); extractedShouldHave(remainingExpectedTerms()); } @Test public void shouldExtractTextFromDocFile() throws Exception { extractTermsFrom("modeshape.doc"); loadExpectedFrom("modeshape.txt"); extractedShouldHave(remainingExpectedTerms()); } @Test public void shouldExtractTextFromDocxFile() throws Exception { extractTermsFrom("modeshape.docx"); loadExpectedFrom("modeshape.txt"); } @Test public void shouldExtractTextFromPdfFileGS() throws Exception { extractTermsFrom("modeshape_gs.pdf"); assertExtractedMatchesExpected(); } @Test @FixFor( "MODE-1561" ) public void shouldExtractUsingWriteLimit() throws Exception { int stringLength = DEFAULT_TIKA_WRITE_LIMIT + 2; String rndString = randomString(stringLength); File tempFile = File.createTempFile("tika_extraction_", ".txt"); try { IoUtil.write(rndString, tempFile); extractor.setWriteLimit(stringLength); TextExtractorOutput output = new TextExtractorOutput(); extractor.extractFrom(new InMemoryTestBinary(new FileInputStream(tempFile)), output, new TextExtractorContext(DETECTOR)); assertEquals(rndString, output.getText()); } finally { FileUtil.delete(tempFile); } } @Test @Ignore( "Exposes the Tika/PDF box bug that characters get duplicated when parsing pdfs produced by PDF Context" ) public void shouldExtractTextFromPdfFilePdfContext() throws Exception { extractTermsFrom("modeshape_pdfcontext.pdf"); assertExtractedMatchesExpected(); } @Test @FixFor( "MODE-1810" ) public void shouldExtractTextFromXlsxFile() throws Exception { extractTermsFrom("sample-file.xlsx"); assertTrue(!extracted.isEmpty()); } public static String randomString(int length) { //write a text only header to make sure Tika Mimetype detector doesn't get confused... String header = "this is a text file "; StringBuilder rndStringBuilder = new StringBuilder(length); rndStringBuilder.append(header); for (int i = 0; i < length - header.length(); i++) { rndStringBuilder.append(CHARS.charAt(RANDOM.nextInt(CHARS.length()))); } return rndStringBuilder.toString(); } private void assertExtractedMatchesExpected() throws IOException { loadExpectedFrom("modeshape.txt"); extractedShouldHave("2011-01-24"); extractedShouldHave("-", "1/2", "-"); extractedShouldHave(expectedTermsThrough("-", "versioning")); extractedShouldHave("2011-01-24"); extractedShouldHave("-", "2/2", "-"); extractedShouldHave(remainingExpectedTerms()); } private List<String> remainingExpectedTerms() { return expected; } private void extractedShouldHave( String... words ) { for (String word : words) { assertThat(extracted.pop(), is(word)); } } private void extractedShouldHave( List<String> words ) { List<String> missingWords = new LinkedList<String>(); for (String word : words) { String extractedWord = null; try { extractedWord = extracted.pop(); } catch (NoSuchElementException e) { missingWords.add(word); continue; } assertThat(extractedWord, is(word)); } assertThat("Missing words: " + missingWords, missingWords.size(), is(0)); } private List<String> expectedTermsThrough( String... words ) { if (words == null || words.length == 0) { return Collections.emptyList(); } LinkedList<String> result = new LinkedList<String>(); String nextWord = words[0]; while (nextWord != null && !expected.isEmpty()) { String word = expected.pop(); result.add(word); if (word.equals(nextWord)) { boolean foundAll = true; for (int i = 1; i != words.length; ++i) { String next = expected.pop(); result.add(next); if (!next.equals(words[i])) { foundAll = false; break; } } if (foundAll) { return result; } } } return result; } private void extractTermsFrom( String resourcePath ) throws Exception { InputStream stream = getClass().getClassLoader().getResourceAsStream(resourcePath); assertThat(stream, is(notNullValue())); TextExtractorOutput output = new TextExtractorOutput(); extractor.extractFrom(new InMemoryTestBinary(stream), output, new TextExtractorContext(DETECTOR)); output.toString(); addWords(extracted, output.getText()); } private void loadExpectedFrom( String resourcePath ) throws IOException { InputStream stream = getClass().getClassLoader().getResourceAsStream(resourcePath); assertThat(stream, is(notNullValue())); try { addWords(expected, IoUtil.read(stream)); } finally { stream.close(); } } private void addWords( List<String> words, String input ) { for (String word : input.split("[\\s\"]+")) { if (word.length() > 0) { words.add(word); } } } }