/*
* Copyright 2007 T-Rank AS
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package no.trank.openpipe.parse.step;
import java.io.Closeable;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import no.trank.openpipe.api.BasePipelineStep;
import no.trank.openpipe.api.PipelineException;
import no.trank.openpipe.api.PipelineStepStatus;
import no.trank.openpipe.api.PipelineStepStatusCode;
import no.trank.openpipe.api.document.Document;
import no.trank.openpipe.api.document.RawData;
import no.trank.openpipe.config.annotation.NotEmpty;
import no.trank.openpipe.config.annotation.NotNull;
import no.trank.openpipe.config.annotation.NullNotEmpty;
import no.trank.openpipe.parse.api.ParseData;
import no.trank.openpipe.parse.api.Parser;
import no.trank.openpipe.parse.api.ParserResult;
import no.trank.openpipe.parse.api.PipelineParseData;
/**
 * Pipeline step that parses the raw data of a document and stores the result in document fields.
 * <p>
 * The parser is chosen from {@link #getParsers() parsers} by the lower-cased file extension of the value found in
 * the {@link #getFileNameField() file name field}. If no parser is registered for the extension, or that parser
 * fails, each of the {@link #getFallbackParsers() fallback parsers} is tried in order until one succeeds.
 * <p>
 * The parsed text is written to {@link #getTextField() textField}; the title is written to
 * {@link #getTitleField() titleField} when that field is configured, and parser properties are added as document
 * fields when {@link #isIncludeProperties() includeProperties} is set.
 *
 * @version $Revision$
 */
public class DocumentParser extends BasePipelineStep {
  private static final Logger log = LoggerFactory.getLogger(DocumentParser.class);

  /** Extensions already warned about as having no parser; cleared in {@link #finish(boolean)}. */
  private final Set<String> loggedExt = new HashSet<String>();

  /** Name of the document field holding the file name the extension is taken from. */
  @NotEmpty
  private String fileNameField = "fileName";

  /** Parsers keyed by file extension, looked up with the lower-cased extension without the leading dot. */
  @NotNull
  private Map<String, Parser> parsers = Collections.emptyMap();

  /** Parsers tried in order when no extension-specific parser exists or it failed. */
  @NotNull
  private List<Parser> fallbackParsers = Collections.emptyList();

  /** Extensions for which no parsing is attempted at all. */
  @NotNull
  private Set<String> ignoredFileExtensions = Collections.emptySet();

  /** Name of the document field that receives the parsed text. */
  @NotEmpty
  private String textField;

  /** Name of the document field that receives the parsed title; <code>null</code> disables title extraction. */
  @NullNotEmpty
  private String titleField;

  /** Whether parser properties are requested and copied into document fields. */
  private boolean includeProperties;

  /** Whether a {@link PipelineException} is thrown when every parser fails. */
  private boolean failOnParseFailure;

  /** Whether a status with code <code>FINISH</code> is returned when every parser fails. */
  private boolean stopOnParseFailure;

  /**
   * Parses the raw data of <code>doc</code>, if there is any that has not already been released.
   * <p>
   * The raw data is released once a parser has succeeded. When all parsers fail the data is left untouched and
   * the outcome depends on {@link #isFailOnParseFailure()} (checked first) and {@link #isStopOnParseFailure()}.
   *
   * @param doc the document to parse.
   *
   * @return a status with code <code>FINISH</code> if the file extension is ignored, or if parsing failed and
   *         <code>stopOnParseFailure</code> is set; {@link PipelineStepStatus#DEFAULT} otherwise.
   *
   * @throws PipelineException if all parsers failed and <code>failOnParseFailure</code> is set.
   */
  @Override
  public PipelineStepStatus execute(Document doc) throws PipelineException {
    final RawData data = doc.getRawData();
    if (data == null) {
      log.debug("No data to parse for doc {}", doc);
    } else if (data.isReleased()) {
      log.debug("Data released for doc {}", doc);
    } else {
      final String fileName = doc.getFieldValue(fileNameField);
      // ext is null when the document has no file name and "" when the name has no dot; both values are
      // used as-is for the lookups below.
      final String ext = findExtension(fileName);
      if (ignoredFileExtensions.contains(ext)) {
        log.debug("Ignored file extension '{}'", ext);
        return new PipelineStepStatus(PipelineStepStatusCode.FINISH);
      }
      final Parser parser = parsers.get(ext);
      final ParseData parseData = new PipelineParseData(data, includeProperties, fileName);
      if (parser == null || !parse(doc, parser, parseData)) {
        // Warn only the first time an unknown extension is seen, to avoid flooding the log.
        if (parser == null && loggedExt.add(ext)) {
          log.warn("No parser found for extension '{}'", ext);
        }
        if (parseWithFallbacks(doc, parseData)) {
          data.release();
        } else if (failOnParseFailure) {
          throw new PipelineException("Parse failed for all parsers", getName());
        } else if (stopOnParseFailure) {
          return new PipelineStepStatus(PipelineStepStatusCode.FINISH);
        }
      } else {
        data.release();
      }
    }
    return PipelineStepStatus.DEFAULT;
  }

  /**
   * Tries the fallback parsers in their configured order, stopping at the first one that succeeds.
   *
   * @param doc the document receiving the parse result.
   * @param data the data to parse.
   *
   * @return <code>true</code> if one of the fallback parsers succeeded, <code>false</code> if all failed or none
   *         are configured.
   */
  private boolean parseWithFallbacks(Document doc, ParseData data) {
    for (Parser parser : fallbackParsers) {
      if (parse(doc, parser, data)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Runs a single parser and copies its result into the document.
   * <p>
   * Any exception thrown while parsing or while copying the result is logged and reported as a failure. Field
   * values written before the exception occurred are not reverted.
   *
   * @param doc the document receiving the parse result.
   * @param parser the parser to run.
   * @param data the data to parse.
   *
   * @return <code>true</code> if the parser completed and the result was stored, otherwise <code>false</code>.
   */
  private boolean parse(Document doc, Parser parser, ParseData data) {
    try {
      final ParserResult result = parser.parse(data);
      doc.setFieldValue(textField, result.getText());
      if (titleField != null) {
        doc.setFieldValue(titleField, result.getTitle());
      }
      if (includeProperties) {
        final Map<String, String> properties = result.getProperties();
        for (Map.Entry<String, String> entry : properties.entrySet()) {
          doc.addFieldValue(entry.getKey(), entry.getValue());
        }
      }
      log.debug("{} parsed {}", parser.getClass().getName(), doc);
      return true;
    } catch (Exception e) {
      // Deliberately broad: a failing parser must not abort the step, so that fallbacks can be tried.
      log.error("Problem parsing " + doc, e);
    }
    return false;
  }

  /**
   * Extracts the lower-cased extension (the text after the last <code>'.'</code>) of a file name.
   *
   * @param fileName the file name, may be <code>null</code>.
   *
   * @return the extension without the dot, <code>""</code> if the name contains no dot, or <code>null</code> if
   *         <code>fileName</code> is <code>null</code>.
   */
  private static String findExtension(String fileName) {
    if (fileName != null) {
      final int idx = fileName.lastIndexOf('.');
      if (idx >= 0) {
        // Locale.ROOT keeps the result independent of the platform locale; with e.g. a Turkish default locale
        // "GIF" would otherwise become "g\u0131f" and never match a configured extension.
        return fileName.substring(idx + 1).toLowerCase(Locale.ROOT);
      } else {
        return "";
      }
    }
    return null;
  }

  /**
   * Verifies that at least one parser or fallback parser is configured.
   *
   * @throws PipelineException if neither parsers nor fallback parsers are configured.
   */
  @Override
  public void prepare() throws PipelineException {
    super.prepare();
    if (parsers.isEmpty() && fallbackParsers.isEmpty()) {
      throw new PipelineException("No parser configured", getName());
    }
  }

  /**
   * Resets the set of already reported extensions and closes all parsers that implement {@link Closeable}.
   *
   * @param success not used by this step.
   */
  @Override
  public void finish(boolean success) throws PipelineException {
    loggedExt.clear();
    closeParsers(parsers.values());
    closeParsers(fallbackParsers);
  }

  /**
   * Closes every parser in the collection that implements {@link Closeable}.
   * <p>
   * An {@link IOException} from one parser is logged and does not prevent the remaining parsers from being closed.
   *
   * @param parsers the parsers to close.
   */
  private static void closeParsers(Collection<Parser> parsers) {
    for (Parser parser : parsers) {
      if (parser instanceof Closeable) {
        try {
          ((Closeable) parser).close();
        } catch (IOException e) {
          // Best effort: report the failure but keep closing the other parsers.
          log.warn("Could not close parser " + parser.getClass().getName(), e);
        }
      }
    }
  }

  @Override
  public String getRevision() {
    return "$Revision$";
  }

  /**
   * Gets the name of the field the file name is read from. Default is <code>"fileName"</code>.
   *
   * @return the name of the file name field.
   */
  public String getFileNameField() {
    return fileNameField;
  }

  /**
   * Sets the name of the field the file name is read from.
   *
   * @param fileNameField the name of the file name field.
   */
  public void setFileNameField(String fileNameField) {
    this.fileNameField = fileNameField;
  }

  /**
   * Gets the parsers keyed by file extension.
   *
   * @return the extension to parser map.
   */
  public Map<String, Parser> getParsers() {
    return parsers;
  }

  /**
   * Sets the parsers keyed by file extension. Extensions are looked up lower-cased and without the leading dot,
   * so keys must be given in that form to match.
   *
   * @param parsers the extension to parser map.
   */
  public void setParsers(Map<String, Parser> parsers) {
    this.parsers = parsers;
  }

  /**
   * Gets the parsers tried, in order, when no extension-specific parser is found or it fails.
   *
   * @return the fallback parsers.
   */
  public List<Parser> getFallbackParsers() {
    return fallbackParsers;
  }

  /**
   * Sets the parsers tried, in order, when no extension-specific parser is found or it fails.
   *
   * @param fallbackParsers the fallback parsers.
   */
  public void setFallbackParsers(List<Parser> fallbackParsers) {
    this.fallbackParsers = fallbackParsers;
  }

  /**
   * Gets the file extensions that are not parsed.
   *
   * @return the ignored file extensions.
   */
  public Set<String> getIgnoredFileExtensions() {
    return ignoredFileExtensions;
  }

  /**
   * Sets the file extensions that are not parsed. Extensions are matched lower-cased and without the leading dot.
   *
   * @param ignoredFileExtensions the ignored file extensions.
   */
  public void setIgnoredFileExtensions(Set<String> ignoredFileExtensions) {
    this.ignoredFileExtensions = ignoredFileExtensions;
  }

  /**
   * Gets the name of the field the parsed text is written to.
   *
   * @return the name of the text field.
   */
  public String getTextField() {
    return textField;
  }

  /**
   * Sets the name of the field the parsed text is written to.
   *
   * @param textField the name of the text field.
   */
  public void setTextField(String textField) {
    this.textField = textField;
  }

  /**
   * Gets the name of the field the parsed title is written to.
   *
   * @return the name of the title field, or <code>null</code> if the title is not stored.
   */
  public String getTitleField() {
    return titleField;
  }

  /**
   * Sets the name of the field the parsed title is written to.
   *
   * @param titleField the name of the title field, or <code>null</code> to not store the title.
   */
  public void setTitleField(String titleField) {
    this.titleField = titleField;
  }

  /**
   * Gets whether parser properties are copied into document fields.
   *
   * @return <code>true</code> if properties are included.
   */
  public boolean isIncludeProperties() {
    return includeProperties;
  }

  /**
   * Sets whether parser properties are copied into document fields. Each property is added as a value of the
   * field named by the property key.
   *
   * @param includeProperties <code>true</code> to include properties.
   */
  public void setIncludeProperties(boolean includeProperties) {
    this.includeProperties = includeProperties;
  }

  /**
   * Gets whether {@link #execute(Document)} throws an exception when all parsers fail.
   *
   * @return <code>true</code> if a parse failure causes an exception.
   */
  public boolean isFailOnParseFailure() {
    return failOnParseFailure;
  }

  /**
   * Sets whether {@link #execute(Document)} throws an exception when all parsers fail. Takes precedence over
   * {@link #setStopOnParseFailure(boolean) stopOnParseFailure}.
   *
   * @param failOnParseFailure <code>true</code> to throw on parse failure.
   */
  public void setFailOnParseFailure(boolean failOnParseFailure) {
    this.failOnParseFailure = failOnParseFailure;
  }

  /**
   * Gets whether {@link #execute(Document)} returns a status with code <code>FINISH</code> when all parsers fail.
   *
   * @return <code>true</code> if a parse failure yields a <code>FINISH</code> status.
   */
  public boolean isStopOnParseFailure() {
    return stopOnParseFailure;
  }

  /**
   * Sets whether {@link #execute(Document)} returns a status with code <code>FINISH</code> when all parsers fail.
   * Only consulted when {@link #isFailOnParseFailure() failOnParseFailure} is not set.
   *
   * @param stopOnParseFailure <code>true</code> to return a <code>FINISH</code> status on parse failure.
   */
  public void setStopOnParseFailure(boolean stopOnParseFailure) {
    this.stopOnParseFailure = stopOnParseFailure;
  }
}