package uk.ac.shef.dcs.jate.io;

import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.model.JATEDocument;

import java.io.File;

/**
 * A toy example to show how we can assign text from particular parts of an
 * input document to particular fields of a JATE document.
 *
 * <p>The whole extracted text is stored as the document content; in addition,
 * two fixed character ranges of that text are stored under field names derived
 * from the configured dynamic field name (its {@code *} wildcard is replaced by
 * {@code title} and {@code link} respectively).
 */
public class TikaMultiFieldDocumentCreator extends DocumentCreator {

    /** Configuration used to look up the dynamic field name; also passed on by {@link #copy()}. */
    private final JATEProperties properties;

    /** Extracts the textual content of an input file (presumably Tika-backed -- see ContentExtractor). */
    protected ContentExtractor contentExtractor;

    /**
     * Creates a document creator that uses the given configuration.
     *
     * @param properties configuration from which the dynamic field name is read
     */
    public TikaMultiFieldDocumentCreator(JATEProperties properties) {
        this.properties = properties;
        contentExtractor = new ContentExtractor();
    }

    /**
     * Builds a {@link JATEDocument} from the file at {@code filePath}, storing its
     * extracted text as content plus two illustrative sub-fields ("title" and "link").
     *
     * @param filePath path of the input file; also used as the document id and path
     * @return the populated document
     * @throws JATEException if the dynamic field name is not configured, or if
     *                       content extraction fails
     */
    @Override
    public JATEDocument create(String filePath) throws JATEException {
        File file = new File(filePath);
        JATEDocument doc = new JATEDocument(filePath);
        doc.setPath(filePath);

        String content = contentExtractor.extractContent(file);
        doc.setContent(content);

        // Add two specific fields: title and link. The values are not really
        // useful; they only demonstrate how to populate per-field content.
        String dynamicFieldName = properties.getSolrFieldNameJATECTermsF();
        // TODO: check the comments below
        // JATEProperties.PROPERTY_SOLR_FIELD_MAP_DOC_PARTS fields extracted by Tika
        // where terms will be extracted from
        if (dynamicFieldName == null) {
            throw new JATEException(String.format("'%s' required but is not defined in jate.properties",
                    JATEProperties.PROPERTY_SOLR_FIELD_MAP_DOC_PARTS));
        }

        // String.replace matches its first argument literally (it is not a regex),
        // so the wildcard must be given as "*" rather than the regex-escaped "\\*".
        String titleField = dynamicFieldName.replace("*", "title");
        String linkField = dynamicFieldName.replace("*", "link");

        // Clamp the ranges so that documents shorter than 120 characters do not
        // trigger a StringIndexOutOfBoundsException.
        String text = doc.getContent();
        doc.getMapField2Content().put(titleField, safeSubstring(text, 0, 100));
        doc.getMapField2Content().put(linkField, safeSubstring(text, 101, 120));

        return doc;
    }

    /**
     * Returns a new creator that shares this instance's configuration.
     *
     * @return a fresh {@code TikaMultiFieldDocumentCreator} built from the same properties
     */
    @Override
    public DocumentCreator copy() {
        return new TikaMultiFieldDocumentCreator(properties);
    }

    /**
     * Returns {@code text.substring(begin, end)} with both bounds clamped to the
     * length of {@code text}; yields an empty string when {@code text} is null
     * or the clamped range is empty.
     */
    private static String safeSubstring(String text, int begin, int end) {
        if (text == null) {
            return "";
        }
        int from = Math.min(Math.max(begin, 0), text.length());
        int to = Math.min(Math.max(end, from), text.length());
        return text.substring(from, to);
    }
}