package com.constellio.model.services.parser;

import static com.constellio.model.services.migrations.ConstellioEIMConfigs.CONTENT_MAX_LENGTH_FOR_PARSING_IN_MEGAOCTETS;
import static com.constellio.model.services.migrations.ConstellioEIMConfigs.PARSED_CONTENT_MAX_LENGTH_IN_KILOOCTETS;
import static java.util.Arrays.asList;
import static org.apache.commons.lang.StringUtils.join;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.zip.DeflaterOutputStream;
import java.util.zip.InflaterInputStream;

import org.apache.commons.lang3.StringUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.HWPFOldDocument;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.tika.Tika;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.fork.ForkParser;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;

import com.constellio.data.io.services.facades.IOServices;
import com.constellio.data.io.streamFactories.StreamFactory;
import com.constellio.data.io.streamFactories.StreamFactoryWithFilename;
import com.constellio.data.io.streamFactories.impl.CopyInputStreamFactory;
import com.constellio.data.utils.KeyListMap;
import com.constellio.model.entities.records.ParsedContent;
import com.constellio.model.services.configs.SystemConfigurationsManager;
import com.constellio.model.services.parser.FileParserException.FileParserException_CannotExtractStyles;
import com.constellio.model.services.parser.FileParserException.FileParserException_CannotParse;
import com.constellio.model.services.parser.FileParserException.FileParserException_FileSizeExceedLimitForParsing;

/**
 * Extracts text, metadata properties, an optional detected language and Word paragraph styles
 * from a content stream, using Apache Tika for the text/metadata and Apache POI for the styles.
 */
public class FileParser {

    /**
     * Static helpers that deflate a string to bytes and inflate it back, using UTF-8 for the
     * text encoding. Declared as an element-less enum so it cannot be instantiated.
     */
    enum StringCompressor {
        ;

        /**
         * Deflates the UTF-8 bytes of {@code text}.
         *
         * @param text the text to compress
         * @return the deflated bytes
         */
        public static byte[] compress(String text) {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            // try-with-resources: the deflater stream is closed even when write() fails.
            try (OutputStream out = new DeflaterOutputStream(baos)) {
                out.write(text.getBytes(StandardCharsets.UTF_8));
            } catch (IOException e) {
                throw new AssertionError(e);
            }
            return baos.toByteArray();
        }

        /**
         * Inflates {@code bytes} and decodes the result as UTF-8.
         *
         * @param bytes deflated bytes, as produced by {@link #compress(String)}
         * @return the decoded text
         */
        public static String decompress(byte[] bytes) {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            // The inflater stream used to be left open; it is now always closed.
            try (InputStream in = new InflaterInputStream(new ByteArrayInputStream(bytes))) {
                byte[] buffer = new byte[8192];
                int len;
                while ((len = in.read(buffer)) > 0) {
                    baos.write(buffer, 0, len);
                }
            } catch (IOException e) {
                throw new AssertionError(e);
            }
            return new String(baos.toByteArray(), StandardCharsets.UTF_8);
        }
    }

    private static final String MS_DOC_MIMETYPE = "application/msword";
    private static final String MS_DOCX_MIMETYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    /** Names passed to {@link StreamFactory#create} to identify why a stream is being opened. */
    static final String READ_STREAM_FOR_STYLES_EXTRACTION = "FileParser-ReadStreamForStylesExtraction";
    static final String READ_STREAM_FOR_PARSING_WITH_TIKA = "FileParser-ReadStreamForParsingWithTika";
    static final String READ_STREAM_FOR_MIMETYPE_DETECTION = "FileParser-MimetypeDetection";

    /** Runs of characters outside the U+0000..U+00FF range; removed when beautifying. */
    private static final Pattern NON_LATIN1_RUN = Pattern.compile("([^\u0000-\u00FF]+)");

    /** Three or more newlines, two or more spaces, or a tab; removed when beautifying. */
    private static final Pattern EXTRA_WHITESPACE = Pattern.compile("((\\n{3,})|( ){2,})|(\\t)");

    /** Normalized style names that are never reported by the style extraction. */
    private static final List<String> EXCLUDED_STYLES = asList("normal", "nospacing");

    /** Only the first paragraphs of a Word document are inspected for styles. */
    private static final int MAX_PARAGRAPHS_FOR_STYLES = 20;

    private final IOServices ioServices;
    private final ForkParsers parsers;
    private final boolean forkParserEnabled;
    private final LanguageDetectionManager languageDetectionManager;
    /** One lazily created {@link AutoDetectParser} per thread, used when the fork parser is disabled. */
    private final ThreadLocal<AutoDetectParser> autoDetectParsers = new ThreadLocal<>();
    private final SystemConfigurationsManager systemConfigurationsManager;

    public FileParser(ForkParsers parsers, LanguageDetectionManager languageDetectionManager, IOServices ioServices,
            SystemConfigurationsManager systemConfigurationsManager, boolean forkParserEnabled) {
        this.parsers = parsers;
        this.ioServices = ioServices;
        this.forkParserEnabled = forkParserEnabled;
        this.languageDetectionManager = languageDetectionManager;
        this.systemConfigurationsManager = systemConfigurationsManager;
    }

    /**
     * Parses and beautifies the content, with language detection enabled.
     *
     * @see #parse(StreamFactory, long, boolean)
     */
    public ParsedContent parse(StreamFactory<InputStream> inputStreamFactory, long length)
            throws FileParserException {
        return parse(inputStreamFactory, length, true);
    }

    /**
     * Copies {@code inputStream} into a reusable stream factory, then parses and beautifies it.
     * Both the given stream and the temporary factory are closed before returning.
     *
     * @param inputStream    the content to parse; closed by this method
     * @param detectLanguage whether to run language detection on the extracted text
     * @throws FileParserException if the content is too large, cannot be parsed, or its styles cannot be read
     */
    public ParsedContent parse(InputStream inputStream, boolean detectLanguage)
            throws FileParserException {
        CopyInputStreamFactory inputStreamFactory = null;
        try {
            inputStreamFactory = ioServices.copyToReusableStreamFactory(inputStream, null);
            return parse(inputStreamFactory, inputStreamFactory.length(), detectLanguage);
        } finally {
            ioServices.closeQuietly(inputStream);
            ioServices.closeQuietly(inputStreamFactory);
        }
    }

    /**
     * Parses the content and beautifies the extracted text: characters outside U+0000..U+00FF,
     * runs of three or more newlines, runs of two or more spaces and tabs are removed.
     *
     * @param inputStreamFactory source of the content; opened once for Tika and, for Word documents, once for styles
     * @param length             size of the content in bytes, checked against the configured parsing limit
     * @param detectLanguage     whether to run language detection on the extracted text
     * @throws FileParserException if the content is too large, cannot be parsed, or its styles cannot be read
     */
    public ParsedContent parse(StreamFactory<InputStream> inputStreamFactory, long length, boolean detectLanguage)
            throws FileParserException {
        return parseContent(inputStreamFactory, length, detectLanguage, true);
    }

    /**
     * Parses the content without beautifying, with language detection enabled.
     *
     * @see #parseWithoutBeautifying(StreamFactory, long, boolean)
     */
    public ParsedContent parseWithoutBeautifying(StreamFactory<InputStream> inputStreamFactory, long length)
            throws FileParserException {
        return parseWithoutBeautifying(inputStreamFactory, length, true);
    }

    /**
     * Copies {@code inputStream} into a reusable stream factory, then parses it without beautifying.
     * Both the given stream and the temporary factory are closed before returning.
     *
     * @param inputStream    the content to parse; closed by this method
     * @param detectLanguage whether to run language detection on the extracted text
     * @throws FileParserException if the content is too large, cannot be parsed, or its styles cannot be read
     */
    public ParsedContent parseWithoutBeautifying(InputStream inputStream, boolean detectLanguage)
            throws FileParserException {
        CopyInputStreamFactory inputStreamFactory = null;
        try {
            inputStreamFactory = ioServices.copyToReusableStreamFactory(inputStream, null);
            return parseWithoutBeautifying(inputStreamFactory, inputStreamFactory.length(), detectLanguage);
        } finally {
            ioServices.closeQuietly(inputStream);
            ioServices.closeQuietly(inputStreamFactory);
        }
    }

    /**
     * Parses the content and returns the extracted text trimmed but otherwise untouched.
     *
     * @param inputStreamFactory source of the content; opened once for Tika and, for Word documents, once for styles
     * @param length             size of the content in bytes, checked against the configured parsing limit
     * @param detectLanguage     whether to run language detection on the extracted text
     * @throws FileParserException if the content is too large, cannot be parsed, or its styles cannot be read
     */
    public ParsedContent parseWithoutBeautifying(StreamFactory<InputStream> inputStreamFactory, long length,
            boolean detectLanguage)
            throws FileParserException {
        return parseContent(inputStreamFactory, length, detectLanguage, false);
    }

    /** Shared implementation of {@code parse} and {@code parseWithoutBeautifying}; {@code beautify} selects the text clean-up. */
    private ParsedContent parseContent(StreamFactory<InputStream> inputStreamFactory, long length,
            boolean detectLanguage, boolean beautify)
            throws FileParserException {
        ensureWithinParsingLimit(inputStreamFactory, length);

        int maxParsedContentLengthInKO = systemConfigurationsManager.getValue(PARSED_CONTENT_MAX_LENGTH_IN_KILOOCTETS);
        BodyContentHandler handler = new BodyContentHandler(maxParsedContentLengthInKO * 1000);
        Metadata metadata = new Metadata();
        runTika(inputStreamFactory, handler, metadata);

        String type = metadata.get(Metadata.CONTENT_TYPE);
        String parsedContent = handler.toString().trim();
        if (beautify) {
            parsedContent = NON_LATIN1_RUN.matcher(parsedContent).replaceAll("");
            parsedContent = EXTRA_WHITESPACE.matcher(parsedContent).replaceAll("");
        }

        String language = detectLanguage ? languageDetectionManager.tryDetectLanguage(parsedContent) : null;
        Map<String, Object> properties = getPropertiesHashMap(metadata, type);

        Map<String, List<String>> styles;
        try {
            styles = getStylesDoc(inputStreamFactory, type);
        } catch (Throwable t) {
            throw new FileParserException_CannotExtractStyles(t, type);
        }
        return new ParsedContent(parsedContent, language, type, length, properties, styles);
    }

    /**
     * Throws when {@code length} exceeds the configured maximum (in mebibytes) for parsing.
     * The mimetype reported in the exception is guessed from the filename only, when one is available.
     */
    private void ensureWithinParsingLimit(StreamFactory<InputStream> inputStreamFactory, long length)
            throws FileParserException {
        int contentMaxLengthForParsingInMegaoctets = systemConfigurationsManager
                .getValue(CONTENT_MAX_LENGTH_FOR_PARSING_IN_MEGAOCTETS);
        if (length <= 1024L * 1024 * contentMaxLengthForParsingInMegaoctets) {
            return;
        }

        String detectedMimeType = null;
        if (inputStreamFactory instanceof StreamFactoryWithFilename) {
            String filename = ((StreamFactoryWithFilename) inputStreamFactory).getFilename();
            if (filename != null) {
                detectedMimeType = new Tika().detect(filename);
            }
        }
        throw new FileParserException_FileSizeExceedLimitForParsing(contentMaxLengthForParsingInMegaoctets,
                detectedMimeType);
    }

    /**
     * Feeds the content to Tika (fork parser or per-thread in-process parser), filling
     * {@code handler} and {@code metadata}. Hitting the handler's write limit is not an error:
     * whatever was extracted so far is kept.
     */
    private void runTika(StreamFactory<InputStream> inputStreamFactory, BodyContentHandler handler,
            Metadata metadata)
            throws FileParserException {
        InputStream inputStream = null;
        try {
            inputStream = inputStreamFactory.create(READ_STREAM_FOR_PARSING_WITH_TIKA);
            if (forkParserEnabled) {
                ForkParser forkParser = parsers.getForkParser();
                forkParser.parse(inputStream, handler, metadata, new ParseContext());
            } else {
                AutoDetectParser parser = autoDetectParsers.get();
                if (parser == null) {
                    parser = newAutoDetectParser();
                    autoDetectParsers.set(parser);
                }
                parser.parse(inputStream, handler, metadata);
            }
        } catch (Throwable t) {
            // The write-limit exception is matched by simple name rather than by type.
            if (!"WriteLimitReachedException".equals(t.getClass().getSimpleName())) {
                String detectedMimetype = metadata.get(Metadata.CONTENT_TYPE);
                throw new FileParserException_CannotParse(t, detectedMimetype);
            }
        } finally {
            ioServices.closeQuietly(inputStream);
        }
    }

    /**
     * Builds the property map exposed on the parsed content from Tika metadata.
     *
     * @param metadata the metadata filled by Tika
     * @param mimeType the detected content type; may be {@code null}, in which case the
     *                 non-XML mapping for comments and company is used
     * @return a new mutable map of property name to value (keywords are stored under {@code "List:Keywords"})
     */
    Map<String, Object> getPropertiesHashMap(Metadata metadata, String mimeType) {
        HashMap<String, Object> properties = new HashMap<String, Object>();
        addKeywordsTo(properties, metadata, "Keywords", TikaCoreProperties.KEYWORDS);
        addPropertyTo(properties, metadata, "Title", TikaCoreProperties.TITLE);
        addPropertyTo(properties, metadata, "Comments", TikaCoreProperties.COMMENTS);
        addPropertyTo(properties, metadata, "Author", TikaCoreProperties.CREATOR);
        addPropertyTo(properties, metadata, "Subject", "subject");
        addPropertyTo(properties, metadata, "Category", "Category");
        addPropertyTo(properties, metadata, "Manager", "Manager");
        addPropertyTo(properties, metadata, "BCC", Message.MESSAGE_BCC);
        addPropertyTo(properties, metadata, "CC", Message.MESSAGE_CC);
        addPropertyTo(properties, metadata, "From", Message.MESSAGE_FROM);
        addPropertyTo(properties, metadata, "To", Message.MESSAGE_TO);

        // mimeType comes from metadata.get(CONTENT_TYPE) and may be null; previously that was an NPE here.
        if (mimeType != null && mimeType.contains("xml")) {
            addCommentsTo(properties, metadata, "Comments", TikaCoreProperties.DESCRIPTION, "_x000d_");
            addPropertyTo(properties, metadata, "Company", TikaCoreProperties.PUBLISHER);
        } else {
            addCommentsTo(properties, metadata, "Comments", TikaCoreProperties.COMMENTS, "[\r]");
            addPropertyTo(properties, metadata, "Company", "Company");
        }
        return properties;
    }

    /** Copies a non-empty metadata value, looked up by {@link Property}, under {@code key}. */
    private void addPropertyTo(HashMap<String, Object> properties, Metadata metadata, String key, Property property) {
        String value = metadata.get(property);
        if (value != null && !value.isEmpty()) {
            properties.put(key, value);
        }
    }

    /** Copies a non-empty metadata value, looked up by metadata name, under {@code key}. */
    private void addPropertyTo(HashMap<String, Object> properties, Metadata metadata, String key, String value) {
        String found = metadata.get(value);
        if (found != null && !found.isEmpty()) {
            properties.put(key, found);
        }
    }

    /** Splits the keywords value on {@code ;} then {@code ,} and stores the trimmed pieces under {@code "List:" + key}. */
    private void addKeywordsTo(HashMap<String, Object> properties, Metadata metadata, String key, Property property) {
        String rawKeywords = metadata.get(property);
        if (rawKeywords == null) {
            return;
        }
        List<String> finalKeywordsList = new ArrayList<String>();
        for (String semicolonPart : rawKeywords.split(";")) {
            for (String keyword : semicolonPart.split(",")) {
                finalKeywordsList.add(keyword.trim());
            }
        }
        properties.put("List:" + key, finalKeywordsList);
    }

    /** Splits the comments value on {@code regex} and stores the pieces re-joined with single spaces. */
    private void addCommentsTo(HashMap<String, Object> properties, Metadata metadata, String key, Property property,
            String regex) {
        String rawComments = metadata.get(property);
        if (rawComments != null) {
            String[] commentsListAfterSplit = rawComments.split(regex);
            properties.put(key, join(commentsListAfterSplit, " "));
        }
    }

    /**
     * Extracts paragraph styles for Word documents ({@code .doc} / {@code .docx} mimetypes);
     * any other mimetype, including {@code null}, yields an empty map.
     */
    private Map<String, List<String>> getStylesDoc(StreamFactory<InputStream> inputStreamFactory, String mimeType)
            throws IOException {
        InputStream inputStream = null;
        try {
            if (MS_DOC_MIMETYPE.equals(mimeType)) {
                inputStream = inputStreamFactory.create(READ_STREAM_FOR_STYLES_EXTRACTION);
                return getStylesDoc(inputStream);
            } else if (MS_DOCX_MIMETYPE.equals(mimeType)) {
                inputStream = inputStreamFactory.create(READ_STREAM_FOR_STYLES_EXTRACTION);
                return getStylesDocX(inputStream);
            } else {
                return new HashMap<>();
            }
        } finally {
            ioServices.closeQuietly(inputStream);
        }
    }

    /**
     * Collects, per normalized style name, the distinct non-blank texts of the first
     * {@value #MAX_PARAGRAPHS_FOR_STYLES} paragraphs of a binary Word document.
     */
    private Map<String, List<String>> getStylesDoc(InputStream inputStream)
            throws IOException {
        KeyListMap<String, String> styles = new KeyListMap<>();
        POIFSFileSystem fis = new POIFSFileSystem(inputStream);
        HWPFDocumentCore wdDoc;
        try {
            wdDoc = new HWPFDocument(fis);
        } catch (OldWordFileFormatException e) {
            // Fall back to the reader for the old Word format.
            wdDoc = new HWPFOldDocument(fis);
        }

        StyleSheet styleSheet = wdDoc.getStyleSheet();
        if (styleSheet != null) {
            Range range = wdDoc.getRange();
            int parasSize = Math.min(range.numParagraphs(), MAX_PARAGRAPHS_FOR_STYLES);
            for (int i = 0; i < parasSize; i++) {
                Paragraph p = range.getParagraph(i);
                StyleDescription style = styleSheet.getStyleDescription(p.getStyleIndex());
                // NOTE(review): POI appears to return null for an unused style slot; such a
                // paragraph is skipped instead of failing the whole extraction. Confirm against
                // the POI version in use.
                if (style == null) {
                    continue;
                }
                String styleKey = toRetainedStyleKey(style.getName());
                if (styleKey != null) {
                    addStyledText(styles, styleKey, p.text());
                }
            }
        }
        return styles.getNestedMap();
    }

    /**
     * Collects, per normalized style name, the distinct non-blank texts of the first
     * {@value #MAX_PARAGRAPHS_FOR_STYLES} paragraphs of an OOXML Word document.
     *
     * @param inputStream the {@code .docx} content; not closed by this method
     * @return the map obtained from the underlying {@link KeyListMap}
     * @throws IOException if the document cannot be read
     */
    public static Map<String, List<String>> getStylesDocX(InputStream inputStream)
            throws IOException {
        KeyListMap<String, String> styles = new KeyListMap<>();
        XWPFDocument wdDoc = new XWPFDocument(inputStream);
        List<XWPFParagraph> paras = wdDoc.getParagraphs();
        int parasSize = Math.min(paras.size(), MAX_PARAGRAPHS_FOR_STYLES);
        for (int i = 0; i < parasSize; i++) {
            XWPFParagraph para = paras.get(i);
            String styleKey = toRetainedStyleKey(para.getStyle());
            if (styleKey != null) {
                addStyledText(styles, styleKey, para.getText());
            }
        }
        return styles.getNestedMap();
    }

    /** Lower-cases the style name and strips its spaces; returns {@code null} for a null or excluded style. */
    private static String toRetainedStyleKey(String styleName) {
        if (styleName == null) {
            return null;
        }
        String styleKey = styleName.toLowerCase().replace(" ", "");
        return EXCLUDED_STYLES.contains(styleKey) ? null : styleKey;
    }

    /** Records the trimmed paragraph text under {@code styleKey} unless it is blank or already recorded. */
    private static void addStyledText(KeyListMap<String, String> styles, String styleKey, String rawText) {
        String text = rawText.trim();
        if (StringUtils.isNotBlank(text) && !styles.get(styleKey).contains(text)) {
            styles.add(styleKey, text);
        }
    }

    /** Factory for the in-process Tika parser; package-private so it can be overridden. */
    AutoDetectParser newAutoDetectParser() {
        return new AutoDetectParser();
    }

    /**
     * Detects the mimetype of the content from its bytes and its file name.
     *
     * @param inputStreamFactory source of the content
     * @param fileName           resource name given to the detector as a hint
     * @return the detected media type as a string
     * @throws FileParserException if the stream cannot be opened or read during detection
     */
    public String detectMimetype(StreamFactory<InputStream> inputStreamFactory, String fileName)
            throws FileParserException {
        InputStream inputStream = null;
        try {
            inputStream = inputStreamFactory.create(READ_STREAM_FOR_MIMETYPE_DETECTION);
            return getTikaMediaType(inputStream, fileName).toString();
        } catch (IOException e) {
            throw new FileParserException_CannotParse(e, "application/octet-stream");
        } finally {
            ioServices.closeQuietly(inputStream);
        }
    }

    /**
     * Runs Tika's default detector on the stream, with the file name as a hint.
     * I/O failures are propagated so that {@link #detectMimetype} reports them as a
     * {@link FileParserException} instead of an unchecked exception.
     */
    private MediaType getTikaMediaType(InputStream is, String fileName)
            throws IOException {
        Metadata md = new Metadata();
        md.set(Metadata.RESOURCE_NAME_KEY, fileName);
        Detector detector = new DefaultDetector();
        return detector.detect(is, md);
    }
}