package eu.europeana.cloud.service.dps.storm.transform.text; import eu.europeana.cloud.service.dps.storm.transform.text.edm.EdmExtractionMethods; import eu.europeana.cloud.service.dps.storm.transform.text.edm.JibxExtractor; import eu.europeana.cloud.service.dps.storm.transform.text.oai.DcExtractor; import eu.europeana.cloud.service.dps.storm.transform.text.oai.OaiExtractionMethods; import eu.europeana.cloud.service.dps.storm.transform.text.pdf.PdfBoxExtractor; import eu.europeana.cloud.service.dps.storm.transform.text.pdf.TikaExtractor; import eu.europeana.cloud.service.dps.storm.transform.text.pdf.PdfExtractionMethods; import eu.europeana.cloud.service.dps.storm.transform.text.txt.ReadFileExtractor; import eu.europeana.cloud.service.dps.storm.transform.text.txt.TxtExtractionMethods; /** * Factory for select extraction method. * @author Pavel Kefurt <Pavel.Kefurt@gmail.com> */ public class TextExtractorFactory { /** * Retrieve extractor for specific representation by extractor name. * @param representationName Name of representation * @param extractorName Extractor name * @return Instance of extractor or null if representation is not supported */ public static TextExtractor getExtractor(String representationName, String extractorName) { switch(SupportedRepresentations.getMethod(representationName)) { case PDF: return getPdfExtractor(extractorName); case OAI: return getOaiExtractor(extractorName); case TXT: return getTxtExtractor(extractorName); case EDM: return getEdmExtractor(extractorName); case UNSUPPORTED: default: return null; } } /** * Retrieve extractor for PDF format. * It uses extractors from enum {@link PdfExtractionMethods}. * If given extractor is not implemented, than it will be used TIKA_EXTRACTOR. * @param extractorName Extractor name * @return Instance of extractor */ private static TextExtractor getPdfExtractor(String extractorName) { PdfExtractionMethods method = PdfExtractionMethods.TIKA_EXTRACTOR.getMethod(extractorName); switch(method) { case PDFBOX_EXTRACTOR: return new PdfBoxExtractor(); case TIKA_EXTRACTOR: default: return new TikaExtractor(); } } /** * Retrieve extractor pro OAI format. * It uses extractors from enum {@link OaiExtractionMethods}. * If given extractor is not implemented, than it will be used DC. * @param extractorName Extractor name * @return Instance of extractor */ private static TextExtractor getOaiExtractor(String extractorName) { OaiExtractionMethods method = OaiExtractionMethods.DC_EXTRACTOR.getMethod(extractorName); switch(method) { case DC_EXTRACTOR: default: return new DcExtractor(); } } /** * Retrieve extractor for TXT files. * @param extractorName Extractor name * @return Instance of extractor */ private static TextExtractor getTxtExtractor(String extractorName) { TxtExtractionMethods method = TxtExtractionMethods.READ_FILE_EXTRACTOR.getMethod(extractorName); switch(method) { case READ_FILE_EXTRACTOR: default: return new ReadFileExtractor(); } } /** * Retrieve extractor for EDM files. * @param extractorName Extractor name * @return Instance of extractor */ private static TextExtractor getEdmExtractor(String extractorName) { EdmExtractionMethods method = EdmExtractionMethods.JIBX_EXTRACTOR.getMethod(extractorName); switch(method) { case JIBX_EXTRACTOR: default: return new JibxExtractor(); } } }