/*
 * This file is part of Flicklib.
 *
 * Copyright (C) Francis De Brabandere
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.flicklib.service.sub;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.flicklib.api.SubtitlesLoader;
import com.flicklib.domain.Subtitle;
import com.flicklib.service.SourceLoader;
import com.flicklib.tools.ElementOnlyTextExtractor;
import com.flicklib.tools.Param;
import com.google.inject.Inject;

/**
 * Subtitle loader that scrapes the HTML pages of http://www.opensubtitles.org.
 *
 * @author francisdb
 */
public class OpenSubtitlesLoader implements SubtitlesLoader {

    private static final Logger LOGGER = LoggerFactory.getLogger(OpenSubtitlesLoader.class);

    private static final String SITE = "http://www.opensubtitles.org";

    /**
     * Number of cells a result row must have before it is parsed. The row
     * parser reads cells 0, 1, 2 and 4, so at least five cells are required.
     */
    private static final int MIN_CELLS_PER_ROW = 5;

    /** Prefix of the CSS class that carries the subtitle language. */
    private static final String FLAG_CLASS_PREFIX = "flag";

    /**
     * Index in the flag CSS class at which the language value starts: the
     * four-character prefix plus the single character that follows it.
     */
    private static final int FLAG_LANGUAGE_OFFSET = 5;

    private final SourceLoader sourceLoader;

    /**
     * Constructs a new OpenSubtitlesLoader.
     *
     * @param sourceLoader the loader used to fetch every page
     */
    @Inject
    public OpenSubtitlesLoader(final SourceLoader sourceLoader) {
        this.sourceLoader = sourceLoader;
    }

    /**
     * Searches the site for subtitles matching the given file name.
     * <p>
     * The search page is either a listing (its title contains
     * {@code "(results)"}) or a direct hit. For a listing, the page behind
     * the first {@code class="bnone"} link is parsed; for a direct hit the
     * search page itself is parsed. In both cases every {@code /offset-}
     * link found on the search page is then loaded and parsed as well.
     *
     * @param localFileName the file name to search for
     * @param imdbId not used by this implementation
     * @return the subtitles found, possibly empty, never {@code null}
     * @throws IOException if a page could not be loaded
     */
    @Override
    public Set<Subtitle> search(String localFileName, String imdbId) throws IOException {
        // Go through load() so a missing source surfaces as IOException, like every other page.
        Source searchPage = load(searchUrl(localFileName)).getJerichoSource();

        if (!isResultsListing(searchPage)) {
            // direct hit
            Set<Subtitle> results = loadSubtitlesPage(searchPage);
            addFollowUpPages(searchPage, results);
            return results;
        }

        Set<Subtitle> results = new HashSet<Subtitle>();
        if (reportsNoResults(searchPage)) {
            return results;
        }

        String subsUrl = findFirstResultUrl(searchPage);
        if (subsUrl == null) {
            // Without this guard a null URL would be handed to the source loader.
            LOGGER.warn("No result link found on results page for '{}'", localFileName);
        } else {
            results.addAll(loadSubtitlesPage(load(subsUrl).getJerichoSource()));
        }
        addFollowUpPages(searchPage, results);
        return results;
    }

    /**
     * Tells whether the page title marks the page as a listing of results.
     * A page without a title element is treated as not being a listing.
     */
    private static boolean isResultsListing(Source page) {
        List<Element> titles = page.getAllElements(HTMLElementName.TITLE);
        if (titles.isEmpty()) {
            return false;
        }
        String title = titles.get(0).getContent().getTextExtractor().toString();
        return title.contains("(results)");
    }

    /** Tells whether any div on the page contains the text "No results found". */
    private static boolean reportsNoResults(Source page) {
        for (Element div : page.getAllElements(HTMLElementName.DIV)) {
            if (div.getTextExtractor().toString().contains("No results found")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the absolute URL of the first {@code class="bnone"} anchor that
     * has an href, or {@code null} when the page has none.
     */
    private static String findFirstResultUrl(Source page) {
        for (Element anchor : page.getAllElements(HTMLElementName.A)) {
            if ("bnone".equals(anchor.getAttributeValue("class"))) {
                String href = anchor.getAttributeValue("href");
                if (href != null) {
                    return SITE + href;
                }
            }
        }
        return null;
    }

    /** Loads every page linked from {@code firstPage} and adds its subtitles to {@code results}. */
    private void addFollowUpPages(Source firstPage, Set<Subtitle> results) throws IOException {
        for (String link : getPageLinks(firstPage)) {
            results.addAll(loadSubtitlesPage(load(SITE + link).getJerichoSource()));
        }
    }

    /**
     * Loads a page through the source loader.
     *
     * @param link the absolute URL to load
     * @return the loaded source, never {@code null}
     * @throws IOException if the loader fails or returns no source
     */
    private com.flicklib.service.Source load(String link) throws IOException {
        com.flicklib.service.Source source = sourceLoader.loadSource(link);
        if (source == null) {
            // link is already absolute; prefixing SITE again would report a wrong URL
            throw new IOException("loading " + link + " failed!");
        }
        return source;
    }

    /**
     * Retrieves the links to all pages other than the first page.
     *
     * @param source the parsed page to scan for anchors
     * @return the href of every anchor whose href contains {@code "/offset-"}
     */
    private Set<String> getPageLinks(Source source) {
        Set<String> links = new HashSet<String>();
        for (Element anchor : source.getAllElements(HTMLElementName.A)) {
            String href = anchor.getAttributeValue("href");
            // anchors without an href attribute yield null here
            if (href != null && href.contains("/offset-")) {
                if (LOGGER.isTraceEnabled()) {
                    LOGGER.trace(anchor.getTextExtractor().toString());
                }
                links.add(href);
            }
        }
        return links;
    }

    /**
     * Parses the rows of the first element with id {@code search_results}.
     *
     * @param jerichoSource the parsed page
     * @return the subtitles found; empty when the page has no such element
     */
    private Set<Subtitle> loadSubtitlesPage(Source jerichoSource) {
        Set<Subtitle> results = new HashSet<Subtitle>();
        List<Element> tables = jerichoSource.getAllElements("id", "search_results", false);
        if (tables.isEmpty()) {
            LOGGER.debug("No search_results element found on page");
            return results;
        }
        for (Element row : tables.get(0).getAllElements(HTMLElementName.TR)) {
            if ("display:none".equals(row.getAttributeValue("style"))) {
                continue;
            }
            Subtitle sub = parseRow(row);
            if (sub != null) {
                results.add(sub);
            }
        }
        return results;
    }

    /**
     * Converts one result row into a subtitle.
     *
     * @param row the table row element
     * @return the subtitle, or {@code null} when the row lacks the cells,
     *         title link, type span or download link needed to build one
     */
    private Subtitle parseRow(Element row) {
        List<Element> cells = row.getAllElements(HTMLElementName.TD);
        if (cells.size() < MIN_CELLS_PER_ROW) {
            return null;
        }

        // TITLE
        Element titleCell = cells.get(0);
        Element titleLink = titleCell.getFirstElement(HTMLElementName.A);
        if (titleLink == null) {
            // TODO : investigate
            return null;
        }

        // TYPE & URL: checked up front so incomplete rows are skipped as a whole
        Element typeCell = cells.get(4);
        List<Element> typeSpans = typeCell.getAllElements("class", "p", false);
        List<Element> downloadLinks = typeCell.getAllElements("a");
        if (typeSpans.isEmpty() || downloadLinks.isEmpty()) {
            return null;
        }
        String downloadHref = downloadLinks.get(0).getAttributeValue("href");
        if (downloadHref == null) {
            return null;
        }

        Subtitle sub = new Subtitle();
        sub.setSubSource(SITE);

        String fileName = titleLink.getContent().getTextExtractor().toString();
        String extra = new ElementOnlyTextExtractor(titleCell.getContent()).toString();
        if (extra.trim().length() > 0) {
            fileName = extra + " " + fileName;
        }
        sub.setFileName(fileName);

        // LANG: when several divs match, the last one wins
        for (Element div : cells.get(1).getAllElements(HTMLElementName.DIV)) {
            String cls = div.getAttributeValue("class");
            if (cls != null && cls.startsWith(FLAG_CLASS_PREFIX)
                    && cls.length() >= FLAG_LANGUAGE_OFFSET) {
                sub.setLanguage(cls.substring(FLAG_LANGUAGE_OFFSET));
            }
        }

        // CD
        sub.setNoCd(cells.get(2).getContent().getTextExtractor().toString());

        sub.setType(typeSpans.get(0).getContent().getTextExtractor().toString());
        sub.setFileUrl(SITE + downloadHref);
        return sub;
    }

    /**
     * Builds the search URL for a title; the title is wrapped in double
     * quotes before being encoded.
     */
    private String searchUrl(String title) {
        String encoded = Param.encode("\"" + title + "\"");
        return SITE + "/en/search2/sublanguageid-all/moviename-" + encoded;
    }
}