package ilarkesto.imdb; import ilarkesto.base.Str; import ilarkesto.core.logging.Log; import ilarkesto.io.IO; import ilarkesto.net.HttpUnit; import java.io.File; import java.io.IOException; import org.xml.sax.SAXException; import com.meterware.httpunit.HTMLElement; import com.meterware.httpunit.WebLink; import com.meterware.httpunit.WebResponse; public class Imdb { private static final String TITLE_URL_PREFIX = "http://www.imdb.com/title/"; private static Log log = Log.get(Imdb.class); public static String determineIdByTitle(String title, boolean guess) { log.info("Determining IMDB-ID by title:", title); WebResponse response = HttpUnit.loadPage(getTitleSearchUrl(title)); String url = response.getHeaderField("LOCATION"); if (!Str.isBlank(url) && url.startsWith(TITLE_URL_PREFIX)) { url = Str.removePrefix(url, TITLE_URL_PREFIX); if (url.contains("/")) url = Str.cutTo(url, "/"); return url; } if (guess) { WebLink[] links; try { links = response.getLinks(); } catch (SAXException ex) { throw new RuntimeException(ex); } for (WebLink link : links) { String linkUrl = link.getURLString(); if (Str.isBlank(linkUrl)) continue; linkUrl = Str.removePrefix(linkUrl, "http://www.imdb.com"); if (linkUrl.startsWith("/title/")) { String id = Str.removePrefix(linkUrl, "/title/"); return Str.removeSuffix(id, "/"); } } } return null; } public static ImdbRecord loadRecord(String imdbId) { if (imdbId == null) return null; String url = getPageUrl(imdbId); log.info("Loading IMDB record:", imdbId); WebResponse response = HttpUnit.loadPage(url); String title = parseTitle(response); Integer year = parseYear(response); String coverId = parseCoverId(response); String tagline = parseInfoContent(response, "Tagline"); String plot = parseInfoContent(response, "Plot"); String awards = parseInfoContent(response, "Awards"); return new ImdbRecord(imdbId, title, year, coverId); } private static String parseInfoContent(WebResponse response, String label) { String text; try { text = response.getText(); } catch (IOException ex) { throw new RuntimeException(ex); } text = Str.cutFromTo(text, "<h5>" + label + ":</h5>", "</div>"); if (text == null) return null; text = Str.cutFrom(text, "<div class=\"info-content\">"); if (text == null) return null; if (text.contains("<a ")) text = Str.cutTo(text, "<a "); return Str.html2text(text.trim()); } private static Integer parseYear(WebResponse response) { String title; try { title = response.getTitle(); } catch (SAXException ex) { throw new RuntimeException(ex); } if (title == null) return null; int idx = title.lastIndexOf(" ("); if (idx < 1) return null; String year = title.substring(idx + 2, idx + 6); return Integer.parseInt(year); } private static String parseTitle(WebResponse response) { String title; try { title = response.getTitle(); } catch (SAXException ex) { throw new RuntimeException(ex); } if (title == null) return null; int idx = title.indexOf(" ("); if (idx < 1) return title; return title.substring(0, idx); } private static String parseCoverId(WebResponse response) { HTMLElement img; try { img = response.getElementWithID("primary-poster"); } catch (SAXException ex) { throw new RuntimeException(ex); } if (img == null) return null; String url = img.getAttribute("src"); if (url == null) return null; if (!url.startsWith("http://ia.media-imdb.com/images/M/")) return null; if (!url.contains("._")) return null; String id = Str.removePrefix(url, "http://ia.media-imdb.com/images/M/"); id = id.substring(0, id.indexOf("._")); return id; } public static String getTitleSearchUrl(String title) { return "http://www.imdb.com/find?s=tt&q=" + Str.encodeUrlParameter(title); } public static String getPageUrl(String imdbId) { return TITLE_URL_PREFIX + imdbId + "/"; } public static void downloadCover(String coverId, File destinationFile) { String url = getCoverUrl(coverId); log.info("Downloading IMDB cover:", url); IO.downloadUrlToFile(url, destinationFile.getPath()); } public static String getCoverUrl(String coverId) { if (coverId == null) return null; return "http://ia.media-imdb.com/images/M/" + coverId + "._V1._SX510_SY755_.jpg"; } public static String extractId(String url) { if (Str.isBlank(url)) return null; String id = url; id = Str.removePrefix(id, TITLE_URL_PREFIX); id = Str.removePrefix(id, "http://www.imdb.de/title/"); id = Str.removeSuffix(id, "/"); return id; } }