package com.constellio.model.services.parser; import static java.util.Arrays.asList; import static junit.framework.Assert.fail; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.data.MapEntry.entry; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.util.Arrays; import java.util.Date; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.constellio.sdk.tests.annotations.InDevelopmentTest; import org.junit.Before; import org.junit.Test; import com.constellio.data.io.streamFactories.StreamFactory; import com.constellio.model.entities.Language; import com.constellio.model.entities.records.ParsedContent; import com.constellio.model.services.migrations.ConstellioEIMConfigs; import com.constellio.model.services.parser.FileParserException.FileParserException_CannotParse; import com.constellio.model.services.parser.FileParserException.FileParserException_FileSizeExceedLimitForParsing; import com.constellio.sdk.tests.ConstellioTest; //@SlowTest public class FileParserAcceptanceTest extends ConstellioTest { StreamFactory<InputStream> inputStreamFactory; private FileParser fileParser; @Test public void givenStreamOfDOCMimetypeWhenParsingThenValidParsedContentReturned() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("testFile.doc"); long length = getLengthOf("testFile.doc"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getParsedContent()).contains("This is the content of").contains("a doc file"); assertThat(parsedContent.getLanguage()).isEqualTo(Language.English.getCode()); assertThat(parsedContent.getMimeType()).isEqualTo("application/msword"); assertThat(parsedContent.getLength()).isEqualTo(22528L); assertThat(parsedContent.getProperties()).containsEntry("Company", "DocuLibre"); } //@Test public void parse1000times() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("architecture_logicielle.pdf"); long length = getLengthOf("testFile.doc"); long start = new Date().getTime(); for (int i = 0; i < 1000; i++) { System.out.println(i); fileParser.parse(inputStreamFactory, length); } System.out.println(new Date().getTime() - start); } //@Test public void detectMimetype1000times() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("architecture_logicielle.pdf"); long length = getLengthOf("testFile.doc"); long start = new Date().getTime(); for (int i = 0; i < 10000; i++) { System.out.println(i); fileParser.detectMimetype(inputStreamFactory, "test.pdf"); } System.out.println(new Date().getTime() - start); } @Test public void givenStreamOfDOCXMimetypeWhenParsingThenValidParsedContentReturned() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("testFile.docx"); long length = getLengthOf("testFile.docx"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getParsedContent()).contains("This is the content of").contains("a docx file"); assertThat(parsedContent.getLanguage()).isEqualTo(Language.English.getCode()); assertThat(parsedContent.getMimeType()) .isEqualTo("application/vnd.openxmlformats-officedocument.wordprocessingml.document"); assertThat(parsedContent.getLength()).isEqualTo(13771L); assertThat(parsedContent.getProperties()).containsEntry("Title", "Document sans titre.docx"); } @Test public void givenStreamOfHTMLMimetypeWhenParsingThenCorrectContentReturned() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("testFile.html"); long length = getLengthOf("testFile.html"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getParsedContent()).contains("This is the content of").contains("a html file"); assertThat(parsedContent.getLanguage()).isEqualTo(Language.English.getCode()); assertThat(parsedContent.getMimeType()).startsWith("text/html;"); assertThat(parsedContent.getProperties()).isEmpty(); } @Test public void givenStreamOfPDFMimetypeWhenParsingThenCorrectContentReturned() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("testFile.pdf"); long length = getLengthOf("testFile.pdf"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getParsedContent()).contains("This is the content of").contains("a pdf file"); assertThat(parsedContent.getLanguage()).isEqualTo(Language.English.getCode()); assertThat(parsedContent.getMimeType()).isEqualTo("application/pdf"); assertThat(parsedContent.getLength()).isEqualTo(27171L); assertThat(parsedContent.getProperties()).containsEntry("Title", "Untitled"); } @Test public void givenPasswordProtectedPDFFileThenReturnEmptyParsedContentWithUnknownLanguageAndPDFMimetype() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("passwordProtected.pdf"); long length = getLengthOf("passwordProtected.pdf"); try { fileParser.parse(inputStreamFactory, length); fail("Exception expected"); } catch (FileParserException_CannotParse e) { assertThat(e.getDetectedMimetype()).isEqualTo("application/pdf"); } } @Test public void givenStreamOfXLSMimetypeWhenParsingThenCorrectContentReturned() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("testFile.xls"); long length = getLengthOf("testFile.xls"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getParsedContent()).contains("Feuille1").contains("This is the content of") .contains("the xsl file"); assertThat(parsedContent.getLanguage()).isEqualTo(Language.English.getCode()); assertThat(parsedContent.getMimeType()).isEqualTo("application/vnd.ms-excel"); assertThat(parsedContent.getLength()).isEqualTo(23552L); assertThat(parsedContent.getProperties()).containsEntry("Title", "zeTitle"); assertThat(parsedContent.getProperties()).containsEntry("List:Keywords", asList("zeKeywords")); assertThat(parsedContent.getProperties()).containsEntry("Comments", "zeComments"); assertThat(parsedContent.getProperties()).containsEntry("Author", "zeAuthor"); } @Test public void givenStreamOfXLSXMimetypeWhenParsingThenCorrectContentReturned() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("testFile.xlsx"); long length = getLengthOf("testFile.xlsx"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getParsedContent()).contains("Sheet1").contains("This is the content of") .contains("the xslx file"); assertThat(parsedContent.getLanguage()).isEqualTo(Language.English.getCode()); assertThat(parsedContent.getMimeType()).isEqualTo("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); assertThat(parsedContent.getLength()).isEqualTo(8022L); assertThat(parsedContent.getProperties()).isEmpty(); } @Test public void givenStreamOfXMLMimetypeWhenParsingThenCorrectContentReturned() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("testFile.xml"); long length = getLengthOf("testFile.xml"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getParsedContent()).contains("This is the content of").contains("the xml file"); assertThat(parsedContent.getLanguage()).isEqualTo(Language.English.getCode()); assertThat(parsedContent.getMimeType()).startsWith("text/html;"); assertThat(parsedContent.getProperties()).isEmpty(); } @Test public void givenStreamOfDOCWhenParsingThenAllPropertiesAreCatch() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("testFileWithProperties.doc"); long length = getLengthOf("testFileWithProperties.doc"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getProperties()).isNotEmpty(); assertThatAllCommunPropertiesAreCatchIn(parsedContent); assertThatAllLessCommunPropertiesAreCatchIn(parsedContent); } @Test public void givenStreamOfDOCXWhenParsingThenAllPropertiesAreCatch() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("testFileWithProperties.docx"); long length = getLengthOf("testFileWithProperties.docx"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getProperties()).isNotEmpty(); assertThatAllCommunPropertiesAreCatchIn(parsedContent); assertThatAllLessCommunPropertiesAreCatchIn(parsedContent); } @Test public void givenStreamOfPDFWhenParsingThenAllPropertiesAreCatch() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("testFileWithProperties.pdf"); long length = getLengthOf("testFileWithProperties.pdf"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getProperties()).isNotEmpty(); assertThatAllCommunPropertiesAreCatchIn(parsedContent); } @Test public void givenStreamOfPPTWhenParsingThenAllPropertiesAreCatch() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("testFileWithProperties.ppt"); long length = getLengthOf("testFileWithProperties.ppt"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getProperties()).isNotEmpty(); assertThatAllCommunPropertiesAreCatchIn(parsedContent); assertThatAllLessCommunPropertiesAreCatchIn(parsedContent); } @Test public void givenStreamOfPPTXWhenParsingThenAllPropertiesAreCatch() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("testFileWithProperties.pptx"); long length = getLengthOf("testFileWithProperties.pptx"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getProperties()).isNotEmpty(); assertThatAllCommunPropertiesAreCatchIn(parsedContent); assertThatAllLessCommunPropertiesAreCatchIn(parsedContent); } @Test public void givenStreamOfXLSWhenParsingThenAllPropertiesAreCatch() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("testFileWithProperties.xls"); long length = getLengthOf("testFileWithProperties.xls"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getProperties()).isNotEmpty(); assertThatAllCommunPropertiesAreCatchIn(parsedContent); assertThatAllLessCommunPropertiesAreCatchIn(parsedContent); } @Test public void givenStreamOfXLSXWhenParsingThenAllPropertiesAreCatch() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("testFileWithProperties.xlsx"); long length = getLengthOf("testFileWithProperties.xlsx"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getProperties()).isNotEmpty(); assertThatAllCommunPropertiesAreCatchIn(parsedContent); assertThatAllLessCommunPropertiesAreCatchIn(parsedContent); } @Test public void givenMessageWithAttachedDOCAndAttachedTextWhenParsingThenValidParsedContentReturned() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("testMessage.msg"); long length = getLengthOf("testFile.docx"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getParsedContent()).contains("contenu"); assertThat(parsedContent.getParsedContent()).contains("Microsoft word document"); assertThat(parsedContent.getParsedContent()).contains("text document"); assertThat(parsedContent.getProperties().get("Subject")).isEqualTo("objet"); assertThat(parsedContent.getProperties().get("To")) .isEqualTo("a1@doculibre.com; a2@doculibre.com"); assertThat(parsedContent.getProperties().get("CC")) .isEqualTo("c1@doculibre.com; c2@doculibre.com"); assertThat(parsedContent.getProperties().get("BCC")).isEqualTo("b1@doculibre.com; b2@doculibre.com"); assertThat(parsedContent.getMimeType()) .isEqualTo("application/vnd.ms-outlook"); assertThat(parsedContent.getLength()).isEqualTo(13771L); } @Test public void givenWord2003DocumentWithStylesThenStylesExtracted() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("DocumentWithStylesAndProperties.doc"); long length = getLengthOf("DocumentWithStylesAndProperties.doc"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getProperties()).containsOnly( entry("Category", "category2"), entry("Comments", "comments2"), entry("Subject", "subject2"), entry("List:Keywords", asList("zeKeyword2", "anotherKeyword2")), entry("Manager", "manager2"), entry("Author", "author2"), entry("Company", "company2"), entry("Title", "title2") ); assertThat(parsedContent.getMimeType()) .isEqualTo("application/msword"); assertThat(parsedContent.getStyles()).containsOnly( entry("titreofficiel", asList("The ring contract")), entry("nomdelacompagnie", asList("Frodon", "Bilbon")), entry("adressedelacompagnie", asList("Hobbiton, Shire")), entry("nomduclient", asList("Gandalf Leblanc")), entry("adresseduclient", asList("Somewhere, Terre du Milieu")) ); } @Test public void givenWord2007DocumentWithStylesThenStylesExtracted() throws Exception { inputStreamFactory = getTestResourceInputStreamFactory("DocumentWithStylesAndProperties.docx"); long length = getLengthOf("DocumentWithStylesAndProperties.docx"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getProperties()).containsOnly( entry("Category", "category2"), entry("Comments", "comments2"), entry("Subject", "subject2"), entry("List:Keywords", asList("zeKeyword2", "anotherKeyword2")), entry("Manager", "manager2"), entry("Author", "author2"), entry("Company", "company2"), entry("Title", "title2") ); assertThat(parsedContent.getMimeType()) .isEqualTo("application/vnd.openxmlformats-officedocument.wordprocessingml.document"); assertThat(parsedContent.getStyles()).containsOnly( entry("titreofficiel", asList("The ring contract")), entry("nomdelacompagnie", asList("Frodon", "Bilbon")), entry("adressedelacompagnie", asList("Hobbiton, Shire")), entry("nomduclient", asList("Gandalf Leblanc")), entry("adresseduclient", asList("Somewhere, Terre du Milieu")) ); } @Test public void whenParsingLargeFileNotExceedingFileSizeLimitThenParsed() throws Exception { givenConfig(ConstellioEIMConfigs.CONTENT_MAX_LENGTH_FOR_PARSING_IN_MEGAOCTETS, 2); inputStreamFactory = getTestResourceInputStreamFactory("testFileWithLargePictureOfEdouard.pptx"); long length = getLengthOf("testFileWithLargePictureOfEdouard.pptx"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getMimeType()) .isEqualTo("application/vnd.openxmlformats-officedocument.presentationml.presentation"); assertThat(parsedContent.getParsedContent()).contains("history of cats"); } @Test public void whenParsingLargeFileNotExceedingFileWithLargeSizeLimitThenParsed() throws Exception { givenConfig(ConstellioEIMConfigs.CONTENT_MAX_LENGTH_FOR_PARSING_IN_MEGAOCTETS, 3000); inputStreamFactory = getTestResourceInputStreamFactory("testFileWithLargePictureOfEdouard.pptx"); long length = getLengthOf("testFileWithLargePictureOfEdouard.pptx"); ParsedContent parsedContent = fileParser.parse(inputStreamFactory, length); assertThat(parsedContent.getMimeType()) .isEqualTo("application/vnd.openxmlformats-officedocument.presentationml.presentation"); assertThat(parsedContent.getParsedContent()).contains("history of cats"); } @Test public void whenParsingLargeFileExceedingFileSizeLimitThenParsed() throws Exception { givenConfig(ConstellioEIMConfigs.CONTENT_MAX_LENGTH_FOR_PARSING_IN_MEGAOCTETS, 1); inputStreamFactory = getTestResourceInputStreamFactory("testFileWithLargePictureOfEdouard.pptx"); long length = getLengthOf("testFileWithLargePictureOfEdouard.pptx"); try { fileParser.parse(inputStreamFactory, length); fail("FileParserException_FileSizeExceedLimitForParsing expected"); } catch (FileParserException_FileSizeExceedLimitForParsing e) { assertThat(e.getDetectedMimetype()).isNull(); } } @Test @InDevelopmentTest public void givenStrongFileTryToMinimifyIt() { File[] listFile = null; // get every document. for (File f : listFile) { try { FileInputStream is = new FileInputStream(f); ParsedContent rawContent = fileParser.parse(is, false); ParsedContent beautifulContent = fileParser.parseWithoutBeautifying(is, false); String originalContent = rawContent.getParsedContent(); String newContent = beautifulContent.getParsedContent(); float originalLength = originalContent.getBytes("UTF-8").length; float newLenght = newContent.getBytes("UTF-8").length; System.out.println("\n\n\n\n----\n\n\n\n"); System.out.println("Testing file : " + f.getName()); System.out.println("Original file length : " + originalLength); System.out.println("New file length : " + newLenght); System.out.println("Difference : " + (originalLength - newLenght) + " ( " + (100f * (originalLength - newLenght)) / originalLength + " % )"); System.out.println("\n\n\n-----------------------------------\n\n\n"); } catch (Exception e) { e.printStackTrace(); } } } private void assertThatAllCommunPropertiesAreCatchIn(ParsedContent parsedContent) { assertThat(parsedContent.getProperties()).containsEntry("Title", "Ze title"); assertThat(parsedContent.getProperties()) .containsEntry("List:Keywords", asList("Ze keyword1", "Ze keyword2", "Ze keyword 3")); assertThat(parsedContent.getProperties()).containsEntry("Author", "Ze author"); assertThat(parsedContent.getProperties()).containsEntry("Subject", "Ze subject"); } private void assertThatAllLessCommunPropertiesAreCatchIn(ParsedContent parsedContent) { assertThat(parsedContent.getProperties()).containsEntry("Company", "Ze company"); assertThat(parsedContent.getProperties()).containsEntry("Category", "Ze category"); assertThat(parsedContent.getProperties()).containsEntry("Manager", "Ze ultimate manager"); assertThat(parsedContent.getProperties()).containsEntry("Comments", "Ze very useful comments Line2"); } @Before public void setup() { fileParser = getModelLayerFactory().newFileParser(); } private long getLengthOf(String resourceName) { return getTestResourceFile(resourceName).length(); } }