package mj.ocraptor.file_handler;
import static mj.ocraptor.MainController.Status.INDEXING_FINISHED;
import static mj.ocraptor.MainController.Status.PAUSED;
import static mj.ocraptor.MainController.Status.STOPPED;
import static mj.ocraptor.database.DBFileStatus.MODIFIED;
import static mj.ocraptor.database.DBFileStatus.NOT_FOUND;
import static mj.ocraptor.database.DBFileStatus.NOT_SUPPORTED;
import static mj.ocraptor.database.DBFileStatus.UP_TO_DATE;
import java.io.File;
import java.sql.Connection;
import java.sql.SQLException;
import mj.ocraptor.MainController;
import mj.ocraptor.MainController.Status;
import mj.ocraptor.configuration.Config;
import mj.ocraptor.console.COF;
import mj.ocraptor.database.DBFileStatus;
import mj.ocraptor.database.DBManager;
import mj.ocraptor.database.dao.FileEntry;
import mj.ocraptor.database.dao.FileEntryDao;
import mj.ocraptor.database.dao.ResultError;
import mj.ocraptor.events.Event;
import mj.ocraptor.events.EventManager;
import mj.ocraptor.file_handler.filter.FileType;
import mj.ocraptor.file_handler.utils.FileTools;
import mj.ocraptor.rmi_server.RMIServerImpl;
import mj.ocraptor.tools.St;
import org.apache.tika.metadata.Metadata;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Worker that processes exactly one file: it registers itself with the
 * {@link MainController}, determines whether the file is already known to the
 * index database, requests text extraction from the extraction server when
 * needed and finally inserts or updates the corresponding database entry.
 *
 * <p>
 * The counters {@code fileCount} and {@code processedCount} are shared by all
 * instances; every access to them goes through {@code static synchronized}
 * methods of this class.
 */
public class TextExtractorThread implements Runnable {
  private static final Logger LOG = LoggerFactory.getLogger(TextExtractorThread.class);

  /** Polling interval in milliseconds while the controller reports PAUSED. */
  private static final long PAUSE_POLL_MILLIS = 200L;

  /** Extracted texts shorter than this are checked for being an error code. */
  private static final int MAX_ERROR_CODE_LENGTH = 50;

  // shared progress counters, guarded by the class lock
  private static Long fileCount;
  private static Long processedCount;

  private DBManager indexDB;
  private File currentFile;
  private Config cfg;
  private DBFileStatus fileStatus;
  private FileEntry currentDBFileEntry;
  private String currentFileMD5Hash;
  private FileType fileType;
  private FileEntryDao fileEntryDao;
  private Connection connection;
  private TextExtractorTools extractorTools;

  /**
   * Creates a worker for a single file.
   *
   * @param indexDB
   *          database manager used for hash lookups and for obtaining the
   *          connection the result is written with
   * @param file
   *          the file to process
   */
  public TextExtractorThread(DBManager indexDB, File file) {
    this.indexDB = indexDB;
    this.currentFile = file;
  }

  /**
   * Processes the file given at construction time. Returns without doing any
   * work if indexing is stopped or finished, or if the thread is interrupted
   * while waiting for a pause to end. The worker registration and the
   * database connection are always released before returning.
   *
   * @see Runnable#run()
   */
  @Override
  public void run() {
    Thread.currentThread().setName(Config.APP_NAME + "TextExtractor");
    final long threadId = Thread.currentThread().getId();
    final MainController controller = MainController.inst();
    try {
      if (!waitUntilRunnable(controller)) {
        return;
      }

      if (!controller.getCurrentFileWorkers().contains(currentFile)) {
        controller.getCurrentFileWorkers().put(threadId, currentFile);
      }

      this.init();
      this.fileType = FileType.get(currentFile);

      if (controller.showInitialCPUList()) {
        Thread.sleep(10);
        // TODO:
        showProgress(null, null);
        controller.setShowInitialCPUList(false);
      }

      // ------------------------------------------------ //
      // -- finally extract text from the file
      // ------------------------------------------------ //
      final FileEntry resultFromTika = extractTextTika(currentFile);

      // Extraction may take a while, so the controller is asked again instead
      // of relying on the status seen when the task started.
      if (resultFromTika == null || isTerminal(controller.getStatus())
          || resultFromTika.getFullText() == null) {
        return;
      }

      final String extractedText = resultFromTika.getFullText().getText();
      boolean fileTypeNotSupported = false;
      boolean killedDuringProcessing = false;
      // only short texts are checked for being an error code
      if (extractedText != null && extractedText.length() < MAX_ERROR_CODE_LENGTH) {
        fileTypeNotSupported = ResultError.NOT_SUPPORTED == ResultError.getByCode(extractedText);
        killedDuringProcessing = ResultError.KILLED == ResultError.getByCode(extractedText);
      }

      if (fileTypeNotSupported || killedDuringProcessing) {
        showProgress(null, null); // update progress percentage
        return;
      }

      showProgress(currentFile, fileStatus);
      if (this.fileStatus == NOT_FOUND || this.fileStatus == MODIFIED) {
        // TODO: find a better way, starting and closing too many threads
        new Thread(new CpuListUpdateWorker()).start();
      }
      this.saveToDatabase(resultFromTika);
    } catch (InterruptedException e) {
      // keep the interrupt visible to whoever manages this thread
      Thread.currentThread().interrupt();
    } catch (Exception e) {
      LOG.error("Processing of file '" + currentFile + "' failed", e);
    } finally {
      controller.getCurrentFileWorkers().remove(threadId);
      if (this.connection != null) {
        try {
          this.connection.close();
        } catch (SQLException e) {
          LOG.error("Could not close database connection", e);
        }
      }
    }
  }

  /**
   * Blocks while the controller reports {@code PAUSED}, re-reading the status
   * on every iteration so that a resume is actually noticed.
   *
   * @param controller
   *          controller whose status is polled
   * @return {@code true} if the file should be processed, {@code false} if
   *         indexing is stopped/finished or the thread was interrupted while
   *         waiting
   */
  private boolean waitUntilRunnable(final MainController controller) {
    try {
      while (controller.getStatus() == PAUSED) {
        if (Thread.currentThread().isInterrupted()) {
          return false;
        }
        Thread.sleep(PAUSE_POLL_MILLIS);
      }
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      return false;
    }
    return !isTerminal(controller.getStatus());
  }

  /**
   * @param status
   *          status to test
   * @return {@code true} for {@code STOPPED} and {@code INDEXING_FINISHED}
   */
  private static boolean isTerminal(final Status status) {
    return status == STOPPED || status == INDEXING_FINISHED;
  }

  /**
   * Triggers a progress refresh after a delay of two seconds. Declared static
   * so that it does not keep a reference to the worker that started it.
   */
  private static class CpuListUpdateWorker implements Runnable {
    @Override
    public void run() {
      Thread.currentThread().setName(Config.APP_NAME + "JavaFX: Cpu list updating");
      try {
        Thread.sleep(2000);
      } catch (InterruptedException e) {
        // woken up early: restore the flag and refresh right away
        Thread.currentThread().interrupt();
      }
      showProgress(null, null);
    }
  }

  /**
   * Sets the total file count, but only if no count is currently stored
   * (i.e. after class loading or after {@link #resetCount()}).
   *
   * @param fileCount
   *          total number of files
   * @param verbose
   *          not used; kept so that existing callers keep compiling
   */
  public static synchronized void setFileCount(long fileCount, boolean verbose) {
    if (TextExtractorThread.fileCount == null) {
      TextExtractorThread.fileCount = fileCount;
    }
  }

  /**
   * Clears both the total file count and the processed-file counter.
   */
  public static synchronized void resetCount() {
    TextExtractorThread.fileCount = null;
    TextExtractorThread.processedCount = null;
  }

  /**
   * Sets the processed-file counter to zero if it has no value yet.
   */
  private static synchronized void initProcessedCount() {
    if (processedCount == null) {
      processedCount = 0L;
    }
  }

  /**
   * Increments the processed-file counter under the class lock; a counter
   * that was cleared by {@link #resetCount()} in the meantime restarts at one.
   */
  private static synchronized void incrementProcessedCount() {
    processedCount = (processedCount == null) ? 1L : processedCount + 1L;
  }

  /**
   * Refreshes {@code fileStatus} for the given file and reports whether the
   * file has to be (re-)indexed.
   *
   * @param file
   *          file to check
   * @return {@code true} if the status is {@code NOT_FOUND} or
   *         {@code MODIFIED}
   */
  private boolean hasUnknownFileHash(File file) {
    fileStatus = getFileStatus(file);
    return fileStatus == NOT_FOUND || fileStatus == MODIFIED;
  }

  /**
   * Forwards the current counters to the {@link EventManager}. A call with a
   * file but without a status is ignored.
   *
   * @param file
   *          file to report, or {@code null}
   * @param status
   *          database status of {@code file}, or {@code null}
   */
  public static synchronized void showProgress(File file, DBFileStatus status) {
    if (file != null && status == null) {
      return;
    }
    EventManager eventManager = EventManager.instance();
    eventManager.printProcess(file, fileCount, processedCount, false, status);
  }

  /**
   * Compares the MD5 hash of the file with the hash of the database entry
   * looked up by the file's path. As a side effect the computed hash and the
   * database entry are stored in this instance.
   *
   * @param file
   *          file to check
   * @return {@code NOT_FOUND} if there is no entry or the stored hash is
   *         blank, {@code MODIFIED} if the hashes differ, otherwise
   *         {@code UP_TO_DATE}
   */
  public DBFileStatus getFileStatus(File file) {
    this.currentFileMD5Hash = FileTools.calculateMD5FromFile(file);
    this.currentDBFileEntry = indexDB.findMD5Hash(file.getPath());
    if (this.currentDBFileEntry == null) {
      return NOT_FOUND;
    }
    final String md5FromDB = this.currentDBFileEntry.getHash();
    if (md5FromDB == null || md5FromDB.trim().isEmpty()) {
      return NOT_FOUND;
    }
    if (!md5FromDB.equals(currentFileMD5Hash)) {
      return MODIFIED;
    }
    return UP_TO_DATE;
  }

  /**
   * Checks the size of the current file against an upper bound.
   *
   * @param maxSize
   *          exclusive upper bound in kilobytes, or {@code null}
   * @return {@code null} if no bound was given, otherwise whether the file
   *         size in whole kilobytes is below the bound
   */
  private Boolean validSize(Integer maxSize) {
    if (maxSize == null) {
      return null;
    }
    final long fileSizeInKB = currentFile.length() / 1024;
    return fileSizeInKB < maxSize;
  }

  /**
   * Prepares the per-run state: configuration, database connection, helper
   * objects and the shared processed-file counter.
   */
  private void init() {
    this.cfg = Config.inst();
    this.connection = indexDB.getConnection();
    initProcessedCount();
    this.extractorTools = new TextExtractorTools();
    this.fileEntryDao = new FileEntryDao();
  }

  /**
   * Requests the text of the given file from the extraction server, unless
   * the file lies inside the database directory, is already up to date or has
   * no available parser.
   *
   * @param file
   *          file to extract text from
   * @return the extraction result; an entry flagged with
   *         {@code ResultError.NOT_SUPPORTED} if the file was skipped; or
   *         {@code null} if the file belongs to the database directory, the
   *         thread was interrupted, no extraction could be requested or an
   *         error occurred
   */
  public FileEntry extractTextTika(final File file) {
    try {
      incrementProcessedCount();
      this.fileStatus = NOT_SUPPORTED;
      if (this.fileType == null) {
        this.fileType = FileType.get(file);
      }

      // do not index the given database-folder
      if (isInsideDatabaseDir(file)) {
        return null;
      }

      // check if file was found in the current database
      final boolean unknownFile = hasUnknownFileHash(file);
      if (!unknownFile || !extractorTools.hasAvailableParsers(file)) {
        final FileEntry onlyUpdateProgress = new FileEntry(file);
        onlyUpdateProgress.setError(ResultError.NOT_SUPPORTED);
        if (!unknownFile) {
          showProgress(file, fileStatus); // known file hash
        }
        return onlyUpdateProgress;
      }

      final RMIServerImpl extractionServer = MainController.inst().getServer();
      final Event<FileEntry> first = extractionServer.requestTextExtraction(file);
      if (first == null) {
        return null;
      }

      final FileEntry exResult = first.get();
      if (Config.DEBUG && exResult != null) {
        COF.printLine("Received result for: " + exResult.getFile().getName());
        COF.printText(St.trimToLengthIndicatorRight(St.stripHtmlTags(exResult
            .getFullTextString()), 500));
      }
      return exResult;
    } catch (InterruptedException e) {
      // keep the interrupt visible to the caller
      Thread.currentThread().interrupt();
      return null;
    } catch (Exception e) {
      LOG.error("Text extraction failed for file '" + file + "'", e);
    }
    return null;
  }

  /**
   * Tells whether the file is the database directory itself or lies below it.
   * The comparison is done on whole path segments, so a sibling directory
   * whose name merely starts with the database directory name does not match.
   *
   * @param file
   *          file to test
   * @return {@code true} if the file must not be indexed because it belongs
   *         to the database directory; {@code false} otherwise or if no
   *         database manager is set
   */
  private boolean isInsideDatabaseDir(final File file) {
    if (indexDB == null) {
      return false;
    }
    final String databaseDirPath = new File(indexDB.getDatabaseDir()).getAbsolutePath();
    final String filePath = file.getAbsolutePath();
    if (filePath.equals(databaseDirPath)) {
      return true;
    }
    final String dirPrefix = databaseDirPath.endsWith(File.separator) ? databaseDirPath
        : databaseDirPath + File.separator;
    return filePath.startsWith(dirPrefix);
  }

  /**
   * Builds a cleaned copy of the given metadata. Entries with a blank name or
   * a blank value are dropped; names and values are passed through
   * {@code St.normalizeDocumentText} and {@code St.stripHtmlTags}, and every
   * whitespace character of a value is replaced by a single space.
   *
   * @param metadata
   *          metadata to clean
   * @return new metadata object containing the cleaned entries
   */
  private Metadata normalizeMetadata(Metadata metadata) {
    final Metadata filteredMetadata = new Metadata();
    for (final String name : metadata.names()) {
      if (name == null || name.trim().isEmpty()) {
        continue;
      }
      // look the value up by the original name; the cleaned key may differ
      final String rawValue = metadata.get(name);
      if (rawValue == null || rawValue.trim().isEmpty()) {
        continue;
      }
      final String key = St.stripHtmlTags(St.normalizeDocumentText(name));
      String value = St.stripHtmlTags(St.normalizeDocumentText(rawValue));
      value = value.replaceAll("\\s", " ");
      filteredMetadata.add(key, value);
    }
    return filteredMetadata;
  }

  /**
   * Writes the extraction result to the database: a new entry is inserted for
   * a file with status {@code NOT_FOUND}, the existing entry is updated for a
   * file with status {@code MODIFIED}. Nothing is written for empty results
   * or for files that are unsupported or up to date. Failures are logged and
   * not propagated.
   *
   * @param result
   *          extraction result, may be {@code null}
   */
  private void saveToDatabase(final FileEntry result) {
    if (result == null) {
      return;
    }
    try {
      hasUnknownFileHash(currentFile); // update filestatus property
      if (fileStatus == null) {
        LOG.error("No database status available for file '" + currentFile + "'");
        return;
      }

      final String text = result.getFullTextString();
      if (text == null || text.isEmpty() || fileStatus == NOT_SUPPORTED
          || fileStatus == UP_TO_DATE) {
        return;
      }

      // TODO: process result string before db insert???
      if (fileStatus == NOT_FOUND && currentDBFileEntry == null) {
        fileEntryDao.insert(result, connection);
      }
      if (fileStatus == MODIFIED && currentDBFileEntry != null) {
        currentDBFileEntry.setFile(result.getFile());
        currentDBFileEntry.setFullText(result.getFullText());
        fileEntryDao.update(currentDBFileEntry, connection);
      }
    } catch (Exception e) {
      LOG.error("Could not save result for file '" + currentFile + "'", e);
    }
  }
}