/**
* FreeDesktopSearch - A Search Engine for your Desktop
* Copyright (C) 2013 Mirko Sertic
*
* This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
package de.mirkosertic.desktopsearch;
import org.apache.log4j.Logger;
import org.apache.tika.Tika;
import org.apache.tika.langdetect.OptimaizeLangDetector;
import org.apache.tika.language.detect.LanguageDetector;
import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.utils.DateUtils;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileTime;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts the plain text, the metadata and the detected language of a file
 * using Apache Tika and wraps the result into a {@link Content} instance.
 */
class ContentExtractor {

    private static final Logger LOGGER = Logger.getLogger(ContentExtractor.class);

    /**
     * Matches ISO-8601 like UTC timestamps such as {@code 2013-02-18T15:55:10Z}.
     */
    // TODO: also handle this pattern correctly: Mon Feb 18 15:55:10 CET 2013
    private static final Pattern METADATA_DATE_PATTERN = Pattern.compile("(\\d{4})-(\\d{2})-(\\d{2})T(\\d{2}):(\\d{2}):(\\d{2})Z");

    /** Upper limit handed to {@link Tika#setMaxStringLength(int)}. */
    private static final int MAX_EXTRACTED_STRING_LENGTH = 1024 * 1024 * 5;

    /** Files smaller than this (4 MiB) are read into memory as a whole before parsing. */
    private static final long IN_MEMORY_THRESHOLD_BYTES = 1024 * 1024 * 4;

    private final Tika tika;
    private final Configuration configuration;
    private final LanguageDetector languageDetector;

    /**
     * Creates a new extractor and loads the language detection models.
     *
     * @param aConfiguration the configuration providing the metadata name replacements,
     *                       the enabled languages and the enabled document types
     * @throws RuntimeException if the language detection models cannot be loaded
     */
    public ContentExtractor(Configuration aConfiguration) {
        configuration = aConfiguration;

        tika = new Tika();
        tika.setMaxStringLength(MAX_EXTRACTED_STRING_LENGTH);

        OptimaizeLangDetector theDetector = new OptimaizeLangDetector();
        try {
            theDetector.loadModels();
            languageDetector = theDetector;
        } catch (IOException e) {
            throw new RuntimeException("Unable to load language detection models", e);
        }
    }

    /**
     * Normalizes a metadata name: a namespace prefix (everything up to and including the
     * first colon) is removed, and the remainder is mapped through the configured
     * metadata name replacements if an entry exists for it.
     *
     * @param aName the raw metadata name
     * @return the configured replacement, or the name without prefix if there is none
     */
    private String harmonizeMetaDataName(String aName) {
        int p = aName.indexOf(":");
        if (p > 0) {
            aName = aName.substring(p + 1);
        }

        String theReplacement = configuration.getMetaDataNameReplacement().get(aName);
        if (theReplacement != null) {
            return theReplacement;
        }
        return aName;
    }

    /**
     * Extracts text, language, metadata and file extension of the given file.
     *
     * @param aFile                the file to read
     * @param aBasicFileAttributes the attributes of the file; size and last modification
     *                             time are taken from here
     * @return the extracted content, or {@code null} if anything went wrong; the error
     *         is logged in that case
     */
    public Content extractContentFrom(Path aFile, BasicFileAttributes aBasicFileAttributes) {
        try {
            Metadata theMetaData = new Metadata();

            String theStringData;
            // Small files are read into memory as a whole, larger ones are streamed
            if (aBasicFileAttributes.size() < IN_MEMORY_THRESHOLD_BYTES) {
                byte[] theData = Files.readAllBytes(aFile);
                theStringData = tika.parseToString(new ByteArrayInputStream(theData), theMetaData);
            } else {
                try (InputStream theStream = Files.newInputStream(aFile, StandardOpenOption.READ)) {
                    theStringData = tika.parseToString(new BufferedInputStream(theStream), theMetaData);
                }
            }

            SupportedLanguage theLanguage = detectLanguage(theStringData);
            FileTime theFileTime = aBasicFileAttributes.lastModifiedTime();

            Content theContent = new Content(aFile.toString(), theStringData, aBasicFileAttributes.size(), theFileTime.toMillis(), theLanguage);

            copyMetaData(theMetaData, theContent);
            addExtension(aFile, theContent);

            return theContent;
        } catch (Exception e) {
            LOGGER.error("Error extracting content of " + aFile, e);
        }
        return null;
    }

    /**
     * Detects the language of the text and maps it to a supported, enabled language.
     *
     * @param aText the extracted text
     * @return the detected language if it is supported and enabled, the default language otherwise
     */
    private SupportedLanguage detectLanguage(String aText) {
        LanguageResult theLanguageResult = languageDetector.detect(aText);
        try {
            SupportedLanguage theLanguage = SupportedLanguage.valueOf(theLanguageResult.getLanguage());
            if (configuration.getEnabledLanguages().contains(theLanguage)) {
                return theLanguage;
            }
        } catch (Exception e) {
            // valueOf() rejects every language code that has no SupportedLanguage constant
            LOGGER.info("Language " + theLanguageResult.getLanguage() + " was detected, but is not supported");
        }
        return SupportedLanguage.getDefault();
    }

    /**
     * Copies all metadata entries to the content. Values containing a UTC timestamp
     * are stored as dates, everything else is stored as plain string.
     */
    private void copyMetaData(Metadata aMetaData, Content aContent) {
        for (String theName : aMetaData.names()) {
            String theMetaDataValue = aMetaData.get(theName);
            if (theMetaDataValue == null) {
                // Defensive: skip a name without value instead of failing the whole file
                continue;
            }

            // Locale.ROOT keeps the names stable independent of the default locale
            // (e.g. the Turkish locale lowercases 'I' to a dotless 'i')
            String theHarmonizedName = harmonizeMetaDataName(theName.toLowerCase(Locale.ROOT));

            // Try to detect if this is a date
            Matcher theMatcher = METADATA_DATE_PATTERN.matcher(theMetaDataValue);
            if (theMatcher.find()) {
                aContent.addMetaData(theHarmonizedName, toUtcCalendar(theMatcher).getTime());
            } else {
                aContent.addMetaData(theHarmonizedName, theMetaDataValue);
            }
        }
    }

    /**
     * Converts a successful match of the metadata date pattern to a UTC calendar
     * with second precision.
     */
    private static Calendar toUtcCalendar(Matcher aMatcher) {
        int theYear = Integer.parseInt(aMatcher.group(1));
        int theMonth = Integer.parseInt(aMatcher.group(2));
        int theDay = Integer.parseInt(aMatcher.group(3));
        int theHour = Integer.parseInt(aMatcher.group(4));
        int theMinute = Integer.parseInt(aMatcher.group(5));
        int theSecond = Integer.parseInt(aMatcher.group(6));

        Calendar theCalendar = GregorianCalendar.getInstance(DateUtils.UTC, Locale.US);
        // Calendar months are zero based
        theCalendar.set(theYear, theMonth - 1, theDay, theHour, theMinute, theSecond);
        theCalendar.set(Calendar.MILLISECOND, 0);
        return theCalendar;
    }

    /**
     * Adds the lower-cased file extension as metadata, if the file has one.
     * Only a dot inside the last path element counts, so a dot in a directory
     * name (e.g. {@code /home/user.name/README}) does not produce a bogus extension.
     */
    private static void addExtension(Path aFile, Content aContent) {
        String theFullName = aFile.toString();

        // Index at which the last path element starts within the full path string
        Path theLastElement = aFile.getFileName();
        int theNameStart = theLastElement == null ? 0 : theFullName.length() - theLastElement.toString().length();

        int p = theFullName.lastIndexOf('.');
        if (p > 0 && p >= theNameStart) {
            String theExtension = theFullName.substring(p + 1);
            aContent.addMetaData(IndexFields.EXTENSION, theExtension.toLowerCase(Locale.ROOT));
        }
    }

    /**
     * Checks whether one of the enabled document types supports the given file.
     *
     * @param aFilename the name of the file to check
     * @return {@code true} if at least one enabled document type supports the file
     */
    public boolean supportsFile(String aFilename) {
        for (SupportedDocumentType theType : configuration.getEnabledDocumentTypes()) {
            if (theType.supports(aFilename)) {
                return true;
            }
        }
        return false;
    }
}