package org.rr.jeborker.metadata; import static org.rr.commons.utils.StringUtil.EMPTY; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.logging.Level; import org.apache.commons.io.Charsets; import org.apache.commons.io.IOUtils; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import org.rr.commons.log.LoggerFactory; import org.rr.commons.mufs.IResourceHandler; import org.rr.commons.mufs.ResourceHandlerInputStream; import org.rr.commons.utils.StringUtil; import org.rr.commons.utils.UtilConstants; import org.rr.jeborker.db.item.EbookPropertyItem; class HTMLMetadataReader implements IMetadataReader { private IResourceHandler ebookResourceHandler; HTMLMetadataReader(IResourceHandler resource) { this.ebookResourceHandler = resource; } @Override public List<IResourceHandler> getEbookResource() { return Collections.singletonList(this.ebookResourceHandler); } @Override public List<MetadataProperty> readMetadata() { try { final String htmlHead = getHTMLHead(); if(!htmlHead.isEmpty()) { List<MetadataProperty> extractMetadata = extractMetadata(htmlHead); return extractMetadata; } } catch (IOException e) { LoggerFactory.getLogger(this).log(Level.WARNING, "Failed to read metadata for " + ebookResourceHandler, e); } return Collections.emptyList(); } /** * Extracts the head of the html document. Supports only reading the header and not the whole html file. * @throws IOException */ private String getHTMLHead() throws IOException { final ResourceHandlerInputStream contentInputStream = this.ebookResourceHandler.getContentInputStream(); try { final byte[] buf = new byte[512]; final StringBuilder content = new StringBuilder(); final String body = "<body>"; int len; int bodyIndex = -1; String charset = Charsets.UTF_8.name(); while((len = contentInputStream.read(buf)) != -1) { String html = new String(buf, 0, len, charset); if(html.indexOf('\ufffd') != -1) { String charsetLocationString = "text/html; charset="; int charsetStart = html.indexOf(charsetLocationString); int charsetEnd = html.indexOf('\"', charsetStart); if(charsetStart != -1 && charsetEnd != -1) { charset = html.substring(charsetStart + charsetLocationString.length(), charsetEnd); html = new String(buf, 0, len, charset); } } content.append(html); if((bodyIndex = StringUtil.find(content, body, content.length() - len - body.length(), UtilConstants.COMPARE_TEXT)) != -1) { break; } } if(bodyIndex != -1) { String metadata = content.toString().substring(0, bodyIndex); metadata = StringUtil.replace(metadata, new String[] {"<html>"}, EMPTY, UtilConstants.COMPARE_TEXT); metadata = StringUtil.ltrim(metadata, '\r', '\n'); return metadata; } } finally { IOUtils.closeQuietly(contentInputStream); } return EMPTY; } /** * Extracts the meta data from the given <code>content</code>. * @param content The html content containing some meta data. * @param bodyIndex The index of the body tag. * @return The extracted meta data. Never returns <code>null</code>. * @throws IOException */ private List<MetadataProperty> extractMetadata(final String content) throws IOException { final List<MetadataProperty> result = new ArrayList<>(); final HtmlCleaner cleaner = new HtmlCleaner(); final TagNode rootNode = cleaner.clean(new StringReader(content)); //add meta tags final TagNode[] metaElements = rootNode.getElementsByName("meta", true); for (int i = 0; i < metaElements.length; i++) { String metaName = metaElements[i].getAttributeByName("name"); String metaContent = metaElements[i].getAttributeByName("content"); if(metaName == null) { Map<String, String> attributes = metaElements[i].getAttributes(); for(String att : attributes.values()) { if(att != null && !att.equals(metaContent)) { metaName = att; } } } result.add(new MetadataProperty(metaName, metaContent)); } //add title tag final TagNode[] titleElements = rootNode.getElementsByName("title", true); for (int i = 0; i < titleElements.length; i++) { StringBuffer text = titleElements[i].getText(); result.add(new MetadataProperty(COMMON_METADATA_TYPES.TITLE.getName(), text)); } return result; } @Override public List<MetadataProperty> getSupportedMetadata() { return Collections.emptyList(); } @Override public void fillEbookPropertyItem(List<MetadataProperty> metadataProperties, EbookPropertyItem item) { for(MetadataProperty metadataProperty : metadataProperties) for(COMMON_METADATA_TYPES type : COMMON_METADATA_TYPES.values()) { if(type.getName().equalsIgnoreCase(metadataProperty.getName())) { type.fillItem(metadataProperty, item); } } } @Override public String getPlainMetadata() { try { final String htmlHead = getHTMLHead(); return htmlHead; } catch (IOException e) { LoggerFactory.getLogger(this).log(Level.WARNING, "Failed to read metadata for " + ebookResourceHandler, e); } return EMPTY; } @Override public String getPlainMetadataMime() { return "text/html"; } @Override public List<MetadataProperty> getMetadataByType(boolean create, List<MetadataProperty> props, COMMON_METADATA_TYPES type) { return Collections.emptyList(); } }