/* * This file is part of Flicklib. * * Copyright (C) Francis De Brabandere * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.flicklib.service.movie.xpress; import static com.flicklib.tools.StringUtils.isElementAttributeValue; import static com.flicklib.tools.StringUtils.isElementAttributeValueContains; import static com.flicklib.tools.StringUtils.unbracket; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import net.htmlparser.jericho.Element; import net.htmlparser.jericho.StartTag; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.flicklib.api.AbstractMovieInfoFetcher; import com.flicklib.domain.MoviePage; import com.flicklib.domain.MovieSearchResult; import com.flicklib.domain.MovieService; import com.flicklib.service.Source; import com.flicklib.service.SourceLoader; import com.google.inject.Inject; public class XpressHuFetcher extends AbstractMovieInfoFetcher { private final static MovieService XPRESSHU = new MovieService("XPRESSHU" ,"Xpress.hu", "http://www.xpress.hu"); private static final Pattern ORIG_TITLE_WITH_YEAR_PATTERN = Pattern.compile("(.*) - ([0-9]+)"); private static final Pattern URL_PATTERN = Pattern.compile(".*FILMAZ=([0-9]+)"); private static final URL SEARCH_URL; private static final URL MOVIE_URL; static { try { SEARCH_URL = new URL("http://www.xpress.hu/dvd/keres.asp"); MOVIE_URL = new URL("http://www.xpress.hu/dvd/film.asp"); } catch (MalformedURLException e) { throw new RuntimeException("Malformed URL exception : " + e.getMessage(), e); } } private static final Logger LOGGER = LoggerFactory.getLogger(XpressHuFetcher.class); private final SourceLoader sourceLoader; @Inject public XpressHuFetcher(final SourceLoader sourceLoader) { this.sourceLoader = sourceLoader; } @Override public MoviePage getMovieInfo(final String idForSite) throws IOException { final String url = "http://www.xpress.hu/dvd/film.asp?FILMAZ=" + idForSite; final Source source = this.sourceLoader.loadSource(url); final net.htmlparser.jericho.Source jerichoSource = source.getJerichoSource(); final MoviePage result = new MoviePage(); result.setIdForSite(idForSite); result.setUrl(url); result.setService(XPRESSHU); final List<Element> trLines = jerichoSource.getAllElements("tr"); // search for <tr valign='top' align='center'> with 2 child for (Element tr : trLines) { if (isElementAttributeValue(tr, "valign", "top") && isElementAttributeValue(tr, "align", "center")) { List<Element> tdList = tr.getChildElements(); if (tdList.size() == 2) { if (parseCenterPanel(result, tdList)) { // navigate to the right most column, up one table Element outerTr = getParentElement(tr, "tr"); if (outerTr != null && outerTr.getChildElements().size()==2) { parseLeftMostColumn(result, (Element) outerTr.getChildElements().get(1)); } else { LOGGER.error("unable locate rating for "+url); } break; } } } } if (result.getImgUrl() == null) { LOGGER.error("Image URL not found for " + url); } return result; } private void parseLeftMostColumn(MoviePage result, Element element) { List<Element> bolds = element.getAllElements("b"); for (Element boldElement : bolds) { List<Element> boldChild = boldElement.getChildElements(); LOGGER.debug("CHILD:" + boldChild); if (boldChild.size() == 1 && "img".equals(boldChild.get(0).getName())) { // <img width="110" height="11" src="kepek/m2film.gif"/> Element img = boldChild.get(0); if (isElementAttributeValue(img, "src", "kepek/m2film.gif")) { result.setScore(locateRating(img)); return; } } } } private boolean parseCenterPanel(final MoviePage result, List<Element> tdList) { Element firstTd = tdList.get(0); Element secondTd = tdList.get(1); if (isElementAttributeValue(firstTd, "rowspan", "2") && isElementAttributeValue(secondTd, "rowspan", "2") && isElementAttributeValue(firstTd, "valign", "top") && isElementAttributeValue(firstTd, "width", "194") && isElementAttributeValue(secondTd, "width", "1056")) { List<Element> imgList = firstTd.getAllElements("img"); for (Element image : imgList) { if (isElementAttributeValueContains(image, "src", "cover")) { result.setImgUrl(resolveRelativeUrl(MOVIE_URL, image.getAttributeValue("src"))); } } List<Element> bolds = secondTd.getAllElements("b"); if (bolds.size() > 0) { extractTitle(result, bolds); int found = 0; for (int i = 1; i < bolds.size() && found < 4; i++) { Element boldElement = bolds.get(i); String text = boldElement.getTextExtractor().toString(); LOGGER.debug("text found:" + text); if ("Műfaj:".equalsIgnoreCase(text) || "Mûfaj:".equalsIgnoreCase(text)) { String value = getLabelValue(boldElement); if (value != null) { result.setGenres(Collections.singleton(value)); found++; } } else if ("Rendezte:".equalsIgnoreCase(text)) { String value = getLabelValue(boldElement); if (value != null) { result.getDirectors().add(value); found++; } } else if ("Szereplők:".equalsIgnoreCase(text) || "Szereplõk:".equalsIgnoreCase(text)) { String value = getLabelValue(boldElement); if (value != null) { LOGGER.info("Actors:" + value); found++; } } else if ("Tartalom:".equalsIgnoreCase(text)) { Element trx = getParentElement(boldElement, "tr"); Element plot = getSibling(trx, 1); String plotValue = plot.getTextExtractor().toString(); LOGGER.info("Plot:" + plotValue); result.setPlot(plotValue); found++; } } } return true; } return false; } /** * Locate the rating from a >img width="110" height="11" src="kepek/m2film.gif"/< element. * * @param img * @return */ private Integer locateRating(Element img) { Element tr = getParentElement(img, "tr"); Element nextRow = getSibling(tr, 1); String nextRowValue = nextRow.getTextExtractor().toString(); LOGGER.debug("nextRow : "+nextRowValue); if (nextRowValue.endsWith("%")) { String subSequence = nextRowValue.substring(0, nextRowValue.length()-1); return Integer.valueOf(subSequence); } else { return null; } } private void extractTitle(final MoviePage result, List<Element> bolds) { String title = bolds.get(0).getTextExtractor().toString(); LOGGER.info("title is :" + title); result.setTitle(title); Element parentTd = getParentElement(bolds.get(0), "td"); List<Element> childs = parentTd.getChildElements(); LOGGER.debug("child size:" + childs.size()); if (childs.size() >= 3) { String text = childs.get(2).getTextExtractor().toString(); Pattern p = Pattern.compile("(.+) \\((.+)- ([0-9]+)\\)"); Matcher matcher = p.matcher(text); if (matcher.matches()) { LOGGER.debug("group : " + matcher.group()); LOGGER.debug("group 0 : " + matcher.group(0)); LOGGER.debug("group 1 : " + matcher.group(1)); LOGGER.debug("group 2 : " + matcher.group(2)); LOGGER.debug("group 3 : " + matcher.group(3)); result.setOriginalTitle(matcher.group(1)); result.setYear(new Integer(matcher.group(3))); } // result.setOriginalTitle(text); } } private static String getLabelValue(Element boldElement) { Element parentTr = getParentElement(boldElement, "tr"); if (parentTr != null) { List<Element> tds = parentTr.getChildElements(); if (tds.size() == 2) { String value = tds.get(1).getTextExtractor().toString(); if (value.endsWith("</")) { value = value.substring(0, value.length() - 2); } return value; } } return null; } private static Element getSibling(Element element, int position) { List<Element> elements = element.getParentElement().getChildElements(); int current = elements.indexOf(element); return elements.get(current + position); } @Override public List<? extends MovieSearchResult> search(String title) throws IOException { Source searchResponse = executeSearch(title); LOGGER.info("search for '" + title + "' returned " + searchResponse); //System.err.println(searchResponse.getContent()); net.htmlparser.jericho.Source jerichoSource = searchResponse.getJerichoSource(); List<StartTag> forms = jerichoSource.getAllStartTags("form"); List<MovieSearchResult> result = new ArrayList<MovieSearchResult>(); for (StartTag form : forms) { if (isElementAttributeValueContains(form, "action", "rendel.asp")) { List<Element> trLines = form.getElement().getAllElements("tr"); for (Element tr : trLines) { if (isElementAttributeValue(tr, "valign", "top")) { List<Element> childs = tr.getChildElements(); LOGGER.debug("child count :" + childs.size()); if (childs.size() == 4) { MovieSearchResult msr = parseRow(childs); result.add(msr); } } } } } return result; } /** * parse the search result row * * @param childs * @return */ private MovieSearchResult parseRow(List<Element> childs) { MovieSearchResult msr = new MovieSearchResult(); msr.setService(XPRESSHU); { Element imageCell = childs.get(1); List<Element> imageTags = imageCell.getAllElements("img"); if (imageTags.size() > 0) { String imageUrl = imageTags.get(0).getAttributeValue("src"); LOGGER.info("image url : " + imageUrl); } } { Element descCell = childs.get(2); List<Element> aTags = descCell.getAllElements("a"); if (aTags.size() > 0) { Element alink = aTags.get(0); String link = alink.getAttributeValue("href"); Matcher matcher = URL_PATTERN.matcher(link); if (matcher.find()) { String id = matcher.group(1); String relUrl = matcher.group(); LOGGER.info("ID:" + id + " relative url:" + relUrl); msr.setIdForSite(id); String url = resolveRelativeUrl(SEARCH_URL, relUrl); msr.setUrl(url); } List<Element> fontElements = alink.getAllElements("font"); if (fontElements.size() == 1) { msr.setTitle(fontElements.get(0).getTextExtractor().toString()); LOGGER.info("set title :" + msr.getTitle()); } List<Element> siblings = alink.getParentElement().getChildElements(); int pos = siblings.indexOf(alink); LOGGER.debug("pos :" + pos); if (pos >= 0) { Element originalTitle = siblings.get(pos + 2); String originalTitleText = unbracket(originalTitle.getTextExtractor().toString()); Matcher titleMatcher = ORIG_TITLE_WITH_YEAR_PATTERN.matcher(originalTitleText); if (titleMatcher.matches()) { String origTitle = titleMatcher.group(1); String year = titleMatcher.group(2); LOGGER.info("set original title :" + origTitle + ", year:" + year); msr.setOriginalTitle(origTitle); msr.setYear(Integer.parseInt(year)); } } } } return msr; } private static Element getParentElement(Element element, String parentName) { while (true) { element = element.getParentElement(); if (element == null || element.getName().equalsIgnoreCase(parentName)) { return element; } } } private static String resolveRelativeUrl(URL baseUrl, String relUrl) { try { return new URL(baseUrl, relUrl).toString(); } catch (MalformedURLException e) { LOGGER.error("Problematic url:" + relUrl, e); return relUrl; } } /** * post a request to do the search. * * @param title * @return * @throws IOException */ private Source executeSearch(String title) throws IOException { Source loadSource = sourceLoader.loadSource("http://www.xpress.hu/", false); net.htmlparser.jericho.Source jerichoSource = loadSource.getJerichoSource(); List<StartTag> forms = jerichoSource.getAllStartTags("form"); Map<String, String> params = new HashMap<String, String>(); params.put("GOMB", "1"); params.put("Go2", "Go"); params.put("Go2.x", "0"); params.put("Go2.y", "0"); params.put("KERES", title); for (StartTag form : forms) { String action = form.getAttributeValue("action"); if (action != null && action.indexOf("keres.asp") != -1) { parseHiddenFields(params, form); break; } } Map<String, String> headers = new HashMap<String, String>(); headers.put("Referer", "http://www.xpress.hu/"); Source searchResponse = sourceLoader.post("http://www.xpress.hu/dvd/keres.asp", params, headers); return searchResponse; } private static void parseHiddenFields(Map<String, String> params, StartTag form) { List<Element> hiddenElements = form.getElement().getAllElements("type", "hidden", true); for (Element hidden : hiddenElements) { String name = hidden.getAttributeValue("name"); String value = hidden.getAttributeValue("value"); if (name != null && value != null && name.length() > 0) { params.put(name, value); } } } @Override public MovieService getService() { return XPRESSHU; } }