/*
* ModeShape (http://www.modeshape.org)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.modeshape.extractor.tika;
import static org.junit.Assert.assertEquals;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import javax.jcr.NodeIterator;
import javax.jcr.RepositoryException;
import javax.jcr.query.Query;
import javax.jcr.query.QueryManager;
import javax.jcr.query.QueryResult;
import org.junit.Assert;
import org.junit.Test;
import org.modeshape.common.FixFor;
import org.modeshape.jcr.SingleUseAbstractTest;
import org.modeshape.jcr.api.JcrTools;
import org.modeshape.jcr.query.JcrQuery;
/**
* Integration test which configures a repository to use a Tika-based extractor, creates and saves a node which has a binary value
* from a text file and using the query mechanism tests that the text is extracted from the binary value and stored in the
* indexes.
*
* @author Horia Chiorean
*/
public class TikaTextExtractorRepositoryTest extends SingleUseAbstractTest {
private JcrTools jcrTools = new JcrTools();
@Test
public void shouldExtractAndIndexContentFromPlainTextFile() throws Exception {
startRepositoryWithConfiguration(getResource("repo-config.json"));
uploadFile("text-file.txt");
assertExtractedTextHasBeenIndexed("select [jcr:path] from [nt:resource] as res where contains(res.*, 'The Quick Red Fox Jumps Over the Lazy Brown Dog')");
}
@Test
public void shouldExtractAndIndexContentFromDocFile() throws Exception {
startRepositoryWithConfiguration(getResource("repo-config.json"));
uploadFile("modeshape.doc");
assertExtractedTextHasBeenIndexed("select [jcr:path] from [nt:resource] as res where contains(res.*, 'ModeShape supports')");
}
@Test
public void shouldExtractAndIndexContentFromPdfGSFile() throws Exception {
startRepositoryWithConfiguration(getResource("repo-config.json"));
uploadFile("modeshape_gs.pdf");
assertExtractedTextHasBeenIndexed("select [jcr:path] from [nt:resource] as res where contains(res.*, 'ModeShape supports')");
}
@Test
public void shouldExtractAndIndexContentFromXMLFile() throws Exception {
startRepositoryWithConfiguration(getResource("repo-config.json"));
uploadFile("cars.xml");
assertExtractedTextHasBeenIndexed("select [jcr:path] from [nt:resource] as res where contains(res.*, 'sports')");
}
@Test
@FixFor( "MODE-1561" )
public void shouldExtractPartiallyPastWriteLimit() throws Exception {
startRepositoryWithConfiguration(getResource("repo-config-text-extraction-limit.json"));
// configured in the cfg file
int configuredWriteLimit = 100;
// generate a string the size of the configured limit and check that it's been indexed
String randomString = TikaTextExtractorTest.randomString(configuredWriteLimit);
jcrTools.uploadFile(session, "/testFile", new ByteArrayInputStream(randomString.getBytes()));
session.save();
// test text extraction via querying, since that's where it's actually used
String sql = "select [jcr:path] from [nt:base] where contains([nt:base].*, '" + randomString + "')";
queryAndExpectResults(sql, 1);
// generate a string larger than the limit and check that it hasn't been indexed
randomString = TikaTextExtractorTest.randomString(configuredWriteLimit + 1);
jcrTools.uploadFile(session, "testFile1", new ByteArrayInputStream(randomString.getBytes()));
session.save();
sql = "select [jcr:path] from [nt:base] where contains([nt:base].*, '" + randomString + "')";
queryAndExpectResults(sql, 1);
}
private void queryAndExpectResults( String queryString,
int howMany ) throws RepositoryException {
QueryManager queryManager = ((javax.jcr.Workspace)session.getWorkspace()).getQueryManager();
Query query = queryManager.createQuery(queryString, Query.JCR_SQL2);
NodeIterator nodes = query.execute().getNodes();
Assert.assertEquals(howMany, nodes.getSize());
}
@Test
public void shouldIgnoreMissingTikaDefaultDependendcy() throws Exception {
startRepositoryWithConfiguration(getResource("repo-config.json"));
uploadFile("image_file.jpg");
}
@Test
@FixFor( "MODE-2107" )
public void shouldSupportMimeTypeInclusionsAndExclusions() throws Exception {
startRepositoryWithConfiguration(getResource("repo-config-exclusions-inclusions.json"));
uploadFile("text-file.txt");
assertExtractedTextHasNotBeenIndexed("select [jcr:path] from [nt:resource] as res where contains(res.*, 'The Quick Red Fox Jumps Over the Lazy Brown Dog')");
uploadFile("modeshape_gs.pdf");
assertExtractedTextHasBeenIndexed("select [jcr:path] from [nt:resource] as res where contains(res.*, 'ModeShape supports')");
}
private void assertExtractedTextHasBeenIndexed( String validationQuery ) throws RepositoryException {
Query query = jcrSession().getWorkspace().getQueryManager().createQuery(validationQuery, JcrQuery.JCR_SQL2);
QueryResult result = query.execute();
assertEquals("Node with text content not found", 1, result.getNodes().getSize());
}
private void assertExtractedTextHasNotBeenIndexed( String validationQuery ) throws RepositoryException {
Query query = jcrSession().getWorkspace().getQueryManager().createQuery(validationQuery, JcrQuery.JCR_SQL2);
QueryResult result = query.execute();
assertEquals("Node with text content was found", 0, result.getNodes().getSize());
}
private void uploadFile( String filepath ) throws RepositoryException, IOException, InterruptedException {
// this will create jcr:content of type nt:resource with the jcr:data property
jcrTools.uploadFile(session, "/" + filepath, getResource(filepath));
session.save();
// wait a bit to make sure the text extraction has happened
Thread.sleep(500);
}
private InputStream getResource( String path ) {
return getClass().getClassLoader().getResourceAsStream(path);
}
}