/* * This file is part of Flicklib. * * Copyright (C) Francis De Brabandere * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.flicklib.service.movie.cinebel; import java.util.Collection; import java.util.List; import net.htmlparser.jericho.Element; import net.htmlparser.jericho.HTMLElementName; import net.htmlparser.jericho.Source; import com.flicklib.api.Parser; import com.flicklib.domain.MoviePage; import com.flicklib.tools.AdvancedTextExtractor; import com.flicklib.tools.SimpleXPath; /** * @author francisdb * */ public class CinebelParser implements Parser { /* (non-Javadoc) * @see com.flicklib.api.Parser#parse(com.flicklib.service.Source, com.flicklib.domain.MoviePage) */ @Override public void parse(com.flicklib.service.Source source, MoviePage page) { Source document = source.getJerichoSource(); parseTitle(page, document); parseImageUrl(page, document); parseRating(page, document); parseSynopsis(page, document); parseYear(page, document); parseMisc(page, document); } private void parseMisc(MoviePage page, Source document) { for (Element e : new SimpleXPath(document.getAllElements("class", "movieInfosGroup", true)).children().filterTagName(HTMLElementName.DIV)) { Element strong = e.getFirstElement(HTMLElementName.STRONG); if (strong != null) { String strongValue = strong.getTextExtractor().toString(); if ("Regie".equals(strongValue)) { fillSet(page.getDirectors(), new SimpleXPath(e).getTags(HTMLElementName.LI)); } if ("Cast".equals(strongValue)) { fillSet(page.getActors(), new SimpleXPath(e).getTags(HTMLElementName.LI)); } if ("Genre".equals(strongValue)) { fillSet(page.getGenres(), new SimpleXPath(e).getTags(HTMLElementName.LI)); } } } } private void fillSet(Collection<String> values, SimpleXPath tags) { for (Element e : tags) { String value = e.getTextExtractor().toString().trim(); values.add(value); } } private void parseYear(MoviePage page, Source document) { List<Element> elements = document.getAllElements("class", "productionDate", true); if (elements.size()> 0) { String value = excludeStrong(elements); page.setYear(Integer.parseInt(value.replace(':', ' ').trim())); } } private void parseTitle(MoviePage page, Source document) { Element movieDetails = document.getElementById("movieDetails"); Element title = new SimpleXPath(movieDetails).children().filterTagName(HTMLElementName.H1).children().unique(); page.setTitle(title.getTextExtractor().toString()); } private void parseImageUrl(MoviePage page, Source document) { Element fullPosterLink = document.getElementById("fullPosterLink"); if (fullPosterLink != null) { page.setImgUrl(fullPosterLink.getAttributeValue("href")); } } private void parseRating(MoviePage page, Source document) { Element rating = document.getElementById("userRating"); if (rating != null) { Element average = new SimpleXPath(rating).getAllTagByAttributes("class", "average").unique(); String averageText = average.getTextExtractor().toString(); double score = Double.parseDouble(averageText); page.setScore((int) (score * 10)); } } private void parseSynopsis(MoviePage page, Source document) { List<Element> synopsis = document.getAllElements("class", "synopsis", true); if (synopsis.size() > 0) { String syn = excludeStrong(synopsis); page.setDescription(syn); page.setPlot(syn); } } private String excludeStrong(List<Element> synopsis) { return new AdvancedTextExtractor(synopsis.get(0), false).addExcludedTagName(HTMLElementName.STRONG).toString(); } }