package nl.siegmann.epublib.util;

import java.io.IOException;
import java.io.Reader;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import nl.siegmann.epublib.domain.Resource;
import nl.siegmann.epublib.service.MediatypeService;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringEscapeUtils;

/**
 * Various resource utility methods.
 *
 * @author paul
 */
public class ToolsResourceUtil {

	private static final Logger log = Logger.getLogger(ToolsResourceUtil.class.getName());

	/** Matches a heading tag name ("h" followed by a digit) at the start of the tag text; compiled once. */
	private static final Pattern HEADING_TAG = Pattern.compile("^h\\d\\s*", Pattern.CASE_INSENSITIVE);

	/**
	 * Returns a display title for the given resource.
	 *
	 * @param resource the resource to get a title for; may be null
	 * @return the empty string for a null resource, the resource's href when its
	 *         media type is not XHTML, and otherwise the title found by
	 *         {@link #findTitleFromXhtml(Resource)} (the empty string when none is found)
	 */
	public static String getTitle(Resource resource) {
		if (resource == null) {
			return "";
		}
		if (resource.getMediaType() != MediatypeService.XHTML) {
			return resource.getHref();
		}
		String title = findTitleFromXhtml(resource);
		if (title == null) {
			title = "";
		}
		return title;
	}

	/**
	 * Retrieves the text that directly follows the first {@code <title>} tag or
	 * heading tag ({@code <h1>}, {@code <h2>}, ...) in the resource's content.
	 * The first match is returned, even if it is a blank string.
	 * <p>
	 * If the resource already has a title, that title is returned without reading
	 * the content. Otherwise the result (possibly null) is stored on the resource
	 * via {@code setTitle} before it is returned.
	 *
	 * @param resource the resource to scan; may be null
	 * @return the empty string for a null resource; otherwise the title found, or
	 *         null if no title or heading tag was found or the content could not be read
	 */
	public static String findTitleFromXhtml(Resource resource) {
		if (resource == null) {
			return "";
		}
		if (resource.getTitle() != null) {
			return resource.getTitle();
		}
		String title = null;
		Scanner scanner = null;
		try {
			Reader content = resource.getReader();
			scanner = new Scanner(content);
			// Splitting on '<' makes every token look like "tagname attrs>text after the tag".
			scanner.useDelimiter("<");
			while (scanner.hasNext()) {
				String text = scanner.next();
				int closePos = text.indexOf('>');
				if (closePos < 0) {
					// Not a tag: text before the first '<' (whitespace, BOM, ...) or a
					// truncated tag at the end of the input. Without this check
					// substring(0, -1) would throw StringIndexOutOfBoundsException.
					continue;
				}
				String tag = text.substring(0, closePos);
				if (tag.equalsIgnoreCase("title") || HEADING_TAG.matcher(tag).find()) {
					title = text.substring(closePos + 1).trim();
					title = StringEscapeUtils.unescapeHtml(title);
					break;
				}
			}
		} catch (IOException e) {
			// Best effort: a resource whose content cannot be read simply has no title.
			log.log(Level.WARNING, "Unable to read title from resource " + resource.getHref(), e);
		} finally {
			IOUtils.closeQuietly(scanner);
		}
		resource.setTitle(title);
		return title;
	}
}