package moviescraper.doctord.controller.siteparsingprofile;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import javax.swing.ImageIcon;
import org.apache.commons.io.FilenameUtils;
import org.imgscalr.Scalr;
import org.imgscalr.Scalr.Method;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import moviescraper.doctord.controller.AbstractMovieScraper;
import moviescraper.doctord.controller.GenericMovieScraper;
import moviescraper.doctord.controller.languagetranslation.Language;
import moviescraper.doctord.model.SearchResult;
import moviescraper.doctord.model.dataitem.Actor;
import moviescraper.doctord.model.dataitem.DataItemSource;
import moviescraper.doctord.model.dataitem.Director;
import moviescraper.doctord.model.dataitem.Genre;
import moviescraper.doctord.model.dataitem.ID;
import moviescraper.doctord.model.dataitem.MPAARating;
import moviescraper.doctord.model.dataitem.OriginalTitle;
import moviescraper.doctord.model.dataitem.Outline;
import moviescraper.doctord.model.dataitem.Plot;
import moviescraper.doctord.model.dataitem.Rating;
import moviescraper.doctord.model.dataitem.ReleaseDate;
import moviescraper.doctord.model.dataitem.Set;
import moviescraper.doctord.model.dataitem.SortTitle;
import moviescraper.doctord.model.dataitem.Studio;
import moviescraper.doctord.model.dataitem.Tag;
import moviescraper.doctord.model.dataitem.Tagline;
import moviescraper.doctord.model.dataitem.Thumb;
import moviescraper.doctord.model.dataitem.Title;
import moviescraper.doctord.model.dataitem.Top250;
import moviescraper.doctord.model.dataitem.Trailer;
import moviescraper.doctord.model.dataitem.Votes;
import moviescraper.doctord.model.dataitem.Year;
import moviescraper.doctord.model.preferences.MoviescraperPreferences;
import moviescraper.doctord.view.GUIMain;
public abstract class SiteParsingProfile implements DataItemSource{
/**
 * Names a family of scrapers whose results can be amalgamated.
 * Any group of SiteParsingProfiles which return the same type of information for a given file and which
 * will be compatible for amalgamation should report the same group from {@code getScraperGroupNames()}.
 */
public enum ScraperGroupName {
    JAV_CENSORED_SCRAPER_GROUP("JAV Censored Group"),
    AMERICAN_ADULT_DVD_SCRAPER_GROUP("American Adult DVD"),
    DEFAULT_SCRAPER_GROUP("Default Group");

    /** Human readable label returned by {@link #toString()}. */
    private final String displayName;

    ScraperGroupName(String displayName) {
        this.displayName = displayName;
    }

    @Override
    public String toString() {
        return displayName;
    }
}
/**
 * Returns the scraper groups this profile belongs to.
 * When no groups have been assigned, a single-element list holding
 * {@link ScraperGroupName#DEFAULT_SCRAPER_GROUP} is created, stored and returned.
 * @return the (possibly just defaulted) list of group names
 */
public List<ScraperGroupName> getScraperGroupNames()
{
    if (groupNames != null) {
        return groupNames;
    }
    groupNames = Arrays.asList(ScraperGroupName.DEFAULT_SCRAPER_GROUP);
    return groupNames;
}
// Scraper groups reported by getScraperGroupNames(); while null, that getter falls back to DEFAULT_SCRAPER_GROUP.
protected List<ScraperGroupName> groupNames;
// Language to scrape in; the constructors set ENGLISH first and then apply the user preferences.
protected Language scrapingLanguage;
public Document document; // the base page to start parsing from
@Deprecated
public String overrideURLDMM; //TODO: no longer used variable - will be removed later
// Read and written through isExtraFanartScrapingEnabled() / setExtraFanartScrapingEnabled(boolean); off by default.
private boolean extraFanartScrapingEnabled = false;
// Preferences obtained from MoviescraperPreferences.getInstance() in the constructors (package-private field).
MoviescraperPreferences scrapingPreferences;
// Backs isDisabled() / setDisabled(boolean).
private boolean isDisabled = false;
// Initialised by the constructors from scrapingPreferences.getIsFirstWordOfFileID().
private boolean firstWordOfFileIsID = false;
// Timeout handed to Jsoup's timeout(...) and to HttpURLConnection's connect/read timeouts (milliseconds, i.e. 13 seconds).
public static final int CONNECTION_TIMEOUT_VALUE = 13000;
// File whose base name getFileNameOfScrapedMovie() reports; not assigned anywhere in this class - presumably set by subclasses (verify).
protected File scrapedMovieFile;
// Icon returned by getProfileIcon() when non-null.
private ImageIcon profileIcon;
/**
* If this has a value when scraping, will use overridenSearchResult
* from a user provided URL without looking at file name
*/
private SearchResult overridenSearchResult;
/**
* Whether the results from this scraper should be ignored. Typically set when the user has hit cancel
* in a dialog box because none of the results shown were valid.
*/
private boolean discardResults;
/**
 * @return the current value of the extra-fanart scraping flag
 */
public boolean isExtraFanartScrapingEnabled()
{
    return this.extraFanartScrapingEnabled;
}
/**
 * Stores the extra-fanart scraping flag.
 * @param extraFanartScrapingEnabled the new flag value
 */
public void setExtraFanartScrapingEnabled(boolean extraFanartScrapingEnabled)
{
    this.extraFanartScrapingEnabled = extraFanartScrapingEnabled;
}
/**
 * Accessor for the deprecated {@code overrideURLDMM} field.
 * @return the stored override URL, or null when none was set
 */
public String getOverrideURLDMM()
{
    return this.overrideURLDMM;
}
/**
 * Mutator for the deprecated {@code overrideURLDMM} field.
 * @param overrideURL the value to store
 */
public void setOverrideURLDMM(String overrideURL)
{
    overrideURLDMM = overrideURL;
}
/**
 * Creates a profile that starts parsing from the given page.
 * All preference-driven state (scraping language, first-word-is-ID flag, enabled state) is set up by the
 * no-argument constructor so that both constructors cannot drift apart; only the document is added here.
 * {@code overrideURLDMM} keeps its default value of null.
 * @param document the base page to start parsing from
 */
public SiteParsingProfile(Document document) {
    this();
    this.document = document;
}
/**
 * Creates a profile without a base document.
 * The scraping language starts as ENGLISH and is then set from the user preferences; the
 * first-word-is-ID flag is copied from the same preferences and the profile starts enabled.
 */
public SiteParsingProfile()
{
    this.scrapingLanguage = Language.ENGLISH;
    this.scrapingPreferences = MoviescraperPreferences.getInstance();
    // Applies the language choice stored in the preferences on top of the ENGLISH default.
    setScrapingLanguage(this.scrapingPreferences);
    this.firstWordOfFileIsID = this.scrapingPreferences.getIsFirstWordOfFileID();
    this.isDisabled = false;
}
/**
 * @return the base page this profile parses from, or null when none has been set
 */
public Document getDocument()
{
    return this.document;
}
/**
 * Replaces the base page this profile parses from.
 * @param document the new base page
 */
public void setDocument(Document document)
{
    this.document = document;
}
/**
 * Sets the {@link SiteParsingProfile#overridenSearchResult} to a new search result built from the given URL.
 * This will cause the scraper to ignore the file name of the file when scraping.
 * When this profile is a {@link SiteParsingProfileJSON}, the result is additionally flagged as a JSON search result.
 * @param urlPath the user provided URL to scrape from
 */
public void setOverridenSearchResult(String urlPath)
{
    SearchResult userProvidedResult = new SearchResult(urlPath);
    overridenSearchResult = userProvidedResult;
    boolean isJsonProfile = SiteParsingProfileJSON.class.isInstance(this);
    if (isJsonProfile) {
        userProvidedResult.setJSONSearchResult(true);
    }
}
/**
 * @return {@link SiteParsingProfile#overridenSearchResult}, which is null unless
 * {@link #setOverridenSearchResult(String)} has been called
 */
public SearchResult getOverridenSearchResult() {
    return this.overridenSearchResult;
}
/**
 * Gets the ID number from the file name, ignoring multipart identifiers such as CD1, CD2, etc.
 * The ID number needs to be the last word in the file name, or the next to last word if the file name
 * ends with something like CD1 or Disc 1.
 * So the file name "My Movie ABC-123 CD1" yields the id ABC-123,
 * and the file name "My Movie ABC-123" also yields the id ABC-123.
 * Parentheses and square brackets around the ID, as in (ABC-123) or [ABC-123], are removed.
 * @param file - file to find the ID tag from; for a directory the full name is used, otherwise the extension is dropped first
 * @param firstWordOfFileIsID - if true, just uses the first word in the file name (separated by space) as the ID number,
 * otherwise uses the method described above
 * @return the word chosen as the ID, without any ( ) [ ] characters
 */
public static String findIDTagFromFile(File file, boolean firstWordOfFileIsID)
{
    String baseName = file.isDirectory()
            ? file.getName()
            : FilenameUtils.removeExtension(file.getName());
    String[] words = stripDiscNumber(baseName).split(" ");
    int idIndex = (firstWordOfFileIsID && words.length > 0) ? 0 : words.length - 1;
    // Some people like to enclose the ID number in parenthesis or brackets, so drop all four characters at once.
    return words[idIndex].replaceAll("[()\\[\\]]", "");
}
/** Trailing {@code <cd/dvd/part/pt/disk/disc/d> <0-N>} marker (case insensitive), e.g. " CD1", "_disc 2", ".pt03". */
private static final Pattern NUMBERED_DISC_SUFFIX = Pattern.compile("(?i)[ _.]+(?:cd|dvd|p(?:ar)?t|dis[ck]|d)[ _.]*[0-9]+$");
/** Trailing {@code <cd/dvd/part/pt/disk/disc/d> <a-d>} marker (case insensitive), e.g. " Part B". */
private static final Pattern LETTERED_DISC_SUFFIX = Pattern.compile("(?i)[ _.]+(?:cd|dvd|p(?:ar)?t|dis[ck]|d)[ _.]*[a-d]$");
/**
 * Removes a trailing multipart identifier from a file name that has already lost its extension.
 * A numbered marker is stripped first, then a lettered one from what remains; the result is trimmed.
 * The patterns are compiled once instead of on every call.
 * @param fileNameNoExtension the file name without extension; must not be null
 * @return the name without its disc marker and without leading/trailing whitespace
 */
public static String stripDiscNumber(String fileNameNoExtension) {
    String discNumberStripped = NUMBERED_DISC_SUFFIX.matcher(fileNameNoExtension).replaceAll("");
    // Applied to the result of the first pass on purpose: the two markers are removed sequentially.
    discNumberStripped = LETTERED_DISC_SUFFIX.matcher(discNumberStripped).replaceAll("");
    return discNumberStripped.trim();
}
/*
 * Scraping contract. Every concrete profile supplies one method per data item; the return type of each
 * method names the data item it produces. How a value is extracted is entirely up to the subclass and
 * is not visible from this class (presumably it reads from the document field - confirm per subclass).
 */
public abstract Title scrapeTitle();
public abstract OriginalTitle scrapeOriginalTitle();
public abstract SortTitle scrapeSortTitle();
public abstract Set scrapeSet();
public abstract Rating scrapeRating();
public abstract ReleaseDate scrapeReleaseDate();
public abstract Year scrapeYear();
public abstract Top250 scrapeTop250();
public abstract Votes scrapeVotes();
public abstract Outline scrapeOutline();
public abstract Plot scrapePlot();
public abstract Tagline scrapeTagline();
// Fully qualified because an unqualified Runtime would resolve to java.lang.Runtime.
public abstract moviescraper.doctord.model.dataitem.Runtime scrapeRuntime();
public abstract Thumb[] scrapePosters();
public abstract Thumb[] scrapeFanart();
public abstract Thumb[] scrapeExtraFanart();
public abstract MPAARating scrapeMPAA();
public abstract ID scrapeID();
public abstract ArrayList<Genre> scrapeGenres();
public abstract ArrayList<Actor> scrapeActors();
public abstract ArrayList<Director> scrapeDirectors();
public abstract Studio scrapeStudio();
// Builds the search string for the given file. Presumably the result is what gets passed to
// getSearchResults(String) - confirm against the callers.
public abstract String createSearchString(File file);
/**
 * Default trailer implementation for profiles that do not override it.
 * @return {@link Trailer#BLANK_TRAILER}
 */
public Trailer scrapeTrailer()
{
    return Trailer.BLANK_TRAILER;
}
/**
 * Default tag implementation for profiles that do not override it.
 * @return {@link Tag#BLANK_TAGS}
 */
public ArrayList<Tag> scrapeTags() {
    return Tag.BLANK_TAGS;
}
/**
 * Produces the search results for the given search string; implemented by each concrete profile.
 * Presumably the search string comes from {@link #createSearchString(File)} - confirm against the callers.
 * @param searchString the string to search with
 * @return the search results found by the implementation
 * @throws IOException declared for implementations to signal an I/O failure
 */
public abstract SearchResult[] getSearchResults(String searchString) throws IOException;
/**
 * Searches Google for {@code searchQuery} restricted to {@code site} and returns the result links.
 * Falls back to Bing when Google answers with a captcha form or when no links are found.
 * An I/O failure is printed and whatever was collected so far (possibly nothing) is returned.
 * @param searchQuery the plain, not URL-encoded search terms
 * @param site the site the search is restricted to via a "site:" prefix
 * @return the links found, never null
 */
public SearchResult [] getLinksFromGoogle(String searchQuery, String site)
{
    ArrayList<SearchResult> linksToReturn = new ArrayList<>();
    try {
        String encodingScheme = "UTF-8";
        String queryToEncode = "site:" + site + " " + searchQuery;
        String encodedSearchQuery = URLEncoder.encode(queryToEncode, encodingScheme);
        Document doc = Jsoup.connect("https://www.google.com/search?q="+encodedSearchQuery).userAgent(getRandomUserAgent()).referrer("http://www.google.com").ignoreHttpErrors(true).timeout(SiteParsingProfile.CONNECTION_TIMEOUT_VALUE).get();

        // A captcha form in the answer means Google refused to serve results.
        Elements captchaInputs = doc.select("form[action=CaptchaRedirect] input");
        Map<String, String> captchaData = new HashMap<>();
        for (Element input : captchaInputs) {
            captchaData.put(input.attr("name"), input.attr("value"));
        }
        if (!captchaData.isEmpty()) {
            System.out.println("Found Captchadata : " + captchaData);
            System.out.println("Google has temporarily blocked us. Trying on bing instead.");
            return getLinksFromBing(searchQuery, site);
        }

        for (Element result : doc.select("div.g")) {
            Elements anchors = result.select("h3.r a");
            String href = anchors.attr("href");
            if (href.isEmpty()) {
                // Result block without a title link: there is no URL to follow, so do not report it.
                continue;
            }
            href = URLDecoder.decode(href, encodingScheme);
            href = href.replaceFirst(Pattern.quote("/url?q="), "");
            href = href.replaceFirst(Pattern.quote("http://www.google.com/url?url="), "");
            // remove some junk referrer stuff
            int startIndexToRemove = href.indexOf("&rct=");
            if (startIndexToRemove > -1) {
                href = href.substring(0, startIndexToRemove);
            }
            linksToReturn.add(new SearchResult(href, anchors.text()));
        }

        if (linksToReturn.isEmpty()) {
            // Maybe we will have better luck with bing since we found nothing on google.
            // Pass the raw query: getLinksFromBing adds the "site:" prefix and URL-encodes by itself.
            return getLinksFromBing(searchQuery, site);
        }
    }
    catch (IOException e) {
        e.printStackTrace();
    }
    return linksToReturn.toArray(new SearchResult[linksToReturn.size()]);
}
/**
 * A backup search provider in case google search fails. This method is marked private and is called from getLinksFromGoogle. It should not be called in any other class.
 * Every anchor on the Bing result page whose href contains {@code site} becomes one search result.
 * An I/O failure is printed and an empty array is returned.
 * @param searchQuery the plain, not URL-encoded search terms; the "site:" prefix and the encoding are added here
 * @param site the site the search is restricted to
 * @return the links found, never null
 */
private SearchResult [] getLinksFromBing(String searchQuery, String site)
{
    ArrayList<SearchResult> linksToReturn = new ArrayList<>();
    String encodingScheme = "UTF-8";
    String queryToEncode = "site:" + site + " " + searchQuery;
    try {
        String encodedSearchQuery = URLEncoder.encode(queryToEncode, encodingScheme);
        Document bingResultDocument = Jsoup.connect("https://www.bing.com/search?q="+encodedSearchQuery).userAgent(getRandomUserAgent()).referrer("http://www.bing.com").ignoreHttpErrors(true).timeout(SiteParsingProfile.CONNECTION_TIMEOUT_VALUE).get();
        // The attribute selector must be closed with ']' to be a well-formed selector.
        Elements links = bingResultDocument.select("a[href*=" + site + "]");
        for (Element link : links) {
            linksToReturn.add(new SearchResult(link.attr("href")));
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return linksToReturn.toArray(new SearchResult[linksToReturn.size()]);
}
/**
 * Checks with a HEAD request whether the given URL answers with HTTP 200.
 * Redirects are not followed, so a redirecting URL counts as "does not exist".
 * The redirect setting is applied to this connection only, so other connections in the JVM are unaffected,
 * and the connection is released once the response code has been read.
 * @param URLName the URL to probe
 * @return true only when the response code is 200; false on any other code, on a timeout or on any other failure
 */
protected static boolean fileExistsAtURL(String URLName){
    HttpURLConnection con = null;
    try {
        con = (HttpURLConnection) new URL(URLName).openConnection();
        con.setInstanceFollowRedirects(false);
        con.setRequestMethod("HEAD");
        con.setConnectTimeout(CONNECTION_TIMEOUT_VALUE);
        con.setReadTimeout(CONNECTION_TIMEOUT_VALUE);
        return (con.getResponseCode() == HttpURLConnection.HTTP_OK);
    }
    catch(SocketTimeoutException e) {
        // Non-existing DMM trailers usually time out
        System.err.println("Connection timed out: " + URLName);
        return false;
    }
    catch (Exception e) {
        e.printStackTrace();
        return false;
    }
    finally {
        if (con != null) {
            con.disconnect();
        }
    }
}
/**
 * @return a new {@link GenericMovieScraper} wrapping this profile
 */
public AbstractMovieScraper getMovieScraper()
{
    AbstractMovieScraper scraper = new GenericMovieScraper(this);
    return scraper;
}
/**
 * Creates a fresh parser of the same concrete type.
 * Used to instantiate a parser when the type of the object is not known;
 * within this class it is called by {@link #createInstanceOfSameType()}.
 * @return a new copy of the parser by calling the parser's constructor.
 */
public abstract SiteParsingProfile newInstance();
/**
 * @return the language this profile currently scrapes in
 */
public Language getScrapingLanguage()
{
    return this.scrapingLanguage;
}
/**
 * Sets the scraping language directly.
 * @param scrapingLanguage the language to scrape in
 */
public void setScrapingLanguage(Language scrapingLanguage)
{
    this.scrapingLanguage = scrapingLanguage;
}
/**
 * Derives the scraping language from the preferences: JAPANESE when
 * {@code getScrapeInJapanese()} is true, ENGLISH otherwise.
 * @param preferences the preferences to read the language choice from
 */
public void setScrapingLanguage(MoviescraperPreferences preferences)
{
    scrapingLanguage = preferences.getScrapeInJapanese() ? Language.JAPANESE : Language.ENGLISH;
}
/**
 * Strips a trailing four digit year in parentheses from the file name.
 * If your file is called "Movie Name Here (2001)" this method returns "Movie Name Here".
 * @param file the file to process
 * @return the movie name without extension, without the trailing "(year)" and trimmed
 */
public static String getMovieNameFromFileWithYear(File file)
{
    String baseName = FilenameUtils.removeExtension(FilenameUtils.getName(file.getName()));
    String withoutYear = baseName.replaceFirst("\\(\\d{4}\\)$", "");
    return withoutYear.trim();
}
/**
 * Reports the extension-less name of the scraped movie file.
 * @return null if no file has been scraped yet, otherwise the file name (without extension)
 * of the scraped movie used in the {@link #createSearchString(File)} method
 */
public String getFileNameOfScrapedMovie()
{
    return (scrapedMovieFile == null)
            ? null
            : FilenameUtils.removeExtension(FilenameUtils.getName(scrapedMovieFile.getName()));
}
/**
 * Extracts a trailing four digit year in parentheses from the file name.
 * If your file is called "Movie Name Here (2001)" this method returns "2001".
 * @param file the file to process
 * @return a length 4 string representing the year, if the extension-less name ends with "(dddd)"; otherwise an empty String
 */
public static String getYearFromFileWithYear(File file)
{
    String baseName = FilenameUtils.removeExtension(FilenameUtils.getName(file.getName()));
    Matcher yearAtEnd = Pattern.compile("\\(\\d{4}\\)$").matcher(baseName);
    if (!yearAtEnd.find()) {
        return "";
    }
    // The match is exactly "(dddd)", so dropping the first and last character leaves the digits.
    String parenthesizedYear = yearAtEnd.group();
    return parenthesizedYear.substring(1, parenthesizedYear.length() - 1).trim();
}
/**
 * Supplies the display name of the parser. Within this class the value is also what
 * {@link #toString()} and {@link #getDataItemSourceName()} return.
 * @return The name of the parser used when displaying the parser in drop down menus or console output.
 * For example if the parser parses a site called, "MySite.com"
 * this function may return "My Site".
 */
public abstract String getParserName();
/**
 * @return the parser name, exactly as reported by {@link #getParserName()}
 */
@Override
public String toString()
{
    String parserName = getParserName();
    return parserName;
}
/**
 * Picks one user agent at random from a fixed list of browser user agent strings.
 * Maybe we are less likely to get blocked on google if we don't always use the same user agent when searching.
 * @return a random user agent string that can be passed to .userAgent() when calling Jsoup.connect
 */
public String getRandomUserAgent()
{
    final String[] knownAgents = new String[] {
        "Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6",
        "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
        "Mozilla/5.0 (Windows; U; Windows NT 5.0; en-en) AppleWebKit/533.16 (KHTML, like Gecko) Version/4.1 Safari/533.16"
    };
    Random picker = new Random();
    int chosenIndex = picker.nextInt(knownAgents.length);
    return knownAgents[chosenIndex];
}
/**
 * @return the stored first-word-of-file-is-ID flag
 */
public boolean isFirstWordOfFileIsID()
{
    return this.firstWordOfFileIsID;
}
/**
 * Stores the first-word-of-file-is-ID flag.
 * @param firstWordOfFileIsID the new flag value
 */
public void setFirstWordOfFileIsID(boolean firstWordOfFileIsID)
{
    this.firstWordOfFileIsID = firstWordOfFileIsID;
}
/**
 * @return the parser name from {@link #getParserName()}, used as the data item source name
 */
@Override
public String getDataItemSourceName()
{
    String sourceName = getParserName();
    return sourceName;
}
/**
 * Creates a new instance through {@link #newInstance()} and copies this profile's disabled state onto it.
 * @return the new instance
 */
@Override
public DataItemSource createInstanceOfSameType()
{
    DataItemSource copy = newInstance();
    boolean disabledState = isDisabled();
    copy.setDisabled(disabledState);
    return copy;
}
/**
 * @return the stored disabled flag
 */
@Override
public boolean isDisabled()
{
    return this.isDisabled;
}
/**
 * Stores the disabled flag.
 * @param value the new flag value
 */
@Override
public void setDisabled(boolean value)
{
    this.isDisabled = value;
}
/**
 * Downloads the page at the given URL with Jsoup, using the "Mozilla" user agent, ignoring HTTP errors
 * and applying {@link #CONNECTION_TIMEOUT_VALUE} as timeout.
 * @param url the address to download
 * @return the downloaded document, or null when an IOException occurred (its stack trace is printed)
 */
public static Document downloadDocumentFromURLString(String url)
{
    Document downloaded = null;
    try {
        downloaded = Jsoup.connect(url)
                .userAgent("Mozilla")
                .ignoreHttpErrors(true)
                .timeout(CONNECTION_TIMEOUT_VALUE)
                .get();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return downloaded;
}
/**
 * Downloads the document a search result points to.
 * JSON search results are fetched through {@link SiteParsingProfileJSON#getDocument}; all others are
 * downloaded with Jsoup using the "Mozilla" user agent, ignoring HTTP errors and applying
 * {@link #CONNECTION_TIMEOUT_VALUE} as timeout.
 * @param searchResult the result whose URL is downloaded
 * @return the downloaded document, or null when an IOException occurred (its stack trace is printed)
 */
public static Document downloadDocument(SearchResult searchResult)
{
    Document downloaded = null;
    try {
        if (!searchResult.isJSONSearchResult()) {
            downloaded = Jsoup.connect(searchResult.getUrlPath())
                    .userAgent("Mozilla")
                    .ignoreHttpErrors(true)
                    .timeout(CONNECTION_TIMEOUT_VALUE)
                    .get();
        } else {
            downloaded = SiteParsingProfileJSON.getDocument(searchResult.getUrlPath());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return downloaded;
}
/**
 * Returns the 16x16 icon of this profile, loading it on first use and remembering it afterwards
 * so the resource is not read and rescaled again on every call.
 * The resource path is "/res/sites/" plus the simple class name with "ParsingProfile" removed plus ".png".
 * @return the remembered icon, or whatever {@code initializeResourceIcon} produced for this call
 */
@Override
public ImageIcon getProfileIcon()
{
    if (profileIcon == null) {
        String siteName = this.getClass().getSimpleName().replace("ParsingProfile", "");
        // Remember the result; a null result is simply retried on the next call.
        profileIcon = initializeResourceIcon("/res/sites/" + siteName + ".png", 16, 16);
    }
    return profileIcon;
}
/**
 * Loads an image resource (looked up relative to {@link GUIMain}) and scales it to the requested size.
 * Every failure path now yields an empty {@link ImageIcon}: missing resource, unreadable image format and
 * I/O errors alike (the latter used to return null, unlike the other two). I/O errors are still printed.
 * @param resourceName the resource path handed to {@code GUIMain.class.getResource}
 * @param iconSizeX the target width passed to the scaler
 * @param iconSizeY the target height passed to the scaler
 * @return the scaled icon, or an empty icon when the resource could not be loaded; never null
 */
private ImageIcon initializeResourceIcon(String resourceName, int iconSizeX, int iconSizeY) {
    URL url = GUIMain.class.getResource(resourceName);
    if (url == null) {
        return new ImageIcon();
    }
    try {
        BufferedImage iconBufferedImage = ImageIO.read(url);
        if (iconBufferedImage == null) {
            // ImageIO found no reader for this resource.
            return new ImageIcon();
        }
        iconBufferedImage = Scalr.resize(iconBufferedImage, Method.QUALITY, iconSizeX, iconSizeY, Scalr.OP_ANTIALIAS);
        return new ImageIcon(iconBufferedImage);
    } catch (IOException e1) {
        e1.printStackTrace();
        return new ImageIcon();
    }
}
/**
 * @return the stored discard-results flag
 */
public boolean getDiscardResults()
{
    return this.discardResults;
}
/**
 * Stores the discard-results flag.
 * @param value the new flag value
 */
public void setDiscardResults(boolean value) {
    this.discardResults = value;
}
}