/**
* RedirectUnshortener
* Copyright 08.03.2015 by Michael Peter Christen, @0rb1t3r
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package org.loklak.harvester;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.Socket;
import java.net.URL;
import java.util.Locale;

import org.loklak.data.DAO;
import org.loklak.http.ClientConnection;
/**
 * Resolves shortened URLs (for example links issued by bit.ly or t.co) to the
 * address they redirect to.
 * <p>
 * {@link #unShorten(String)} is the public entry point. It only contacts a
 * server for URLs that look like short links, i.e. whose host is one of the
 * listed shortener hosts or which are very short overall.
 */
public class RedirectUnshortener {

    /** Upper bound on chained lookups performed by {@link #unShorten(String)}. */
    private static final int MAX_UNSHORTEN_STEPS = 10;

    /** Connect and read timeout, in milliseconds, of the raw socket lookup. */
    private static final int SOCKET_TIMEOUT_MS = 2000;

    /** Line terminator used for the lines of the HTTP request. */
    private static final String CRLF = "\r\n";

    /**
     * Shortener hosts that are listed as working.
     * Entries must be lower case: they are compared against a lowercased URL.
     */
    private static final String[] workingHosts = new String[] {
        "bbc.in",
        "fb.me",
        "wp.me",
        "j.mp",
        "t.co",
        "bit.ly",
        "ift.tt",
        "goo.gl",
        "tinyurl.com",
        "ow.ly",
        "tiny.cc",
        "bit.do",
        "amzn.to",
        "tmblr.co",
        "tumblr.com",
        "www.tumblr.com",
        "abo.io",
        "gdta.st",
        "wpo.st",
        "buff.ly",
        "reut.rs",
        "dlvr.it",
        "flip.it",
        "lnkd.in"
    };

    /**
     * Shortener hosts that are only suspected to work.
     * Entries must be lower case: they are compared against a lowercased URL
     * (mixed-case entries such as "sURL.co.uk" could never match).
     */
    private static final String[] untestedHosts = new String[] {
        "is.gd",
        "ta.gd",
        "cli.gs",
        "surl.co.uk",
        "y.ahoo.it",
        "yi.tl",
        "su.pr",
        "fwd4.me",
        "budurl.com",
        "snipurl.com",
        "igg.me",
        "twiza.ru"
    };

    /**
     * Resolves a shortened URL to its redirect target.
     * <p>
     * The lookup is repeated on the result, at most {@value #MAX_UNSHORTEN_STEPS}
     * times, because some links are shortened several times. URLs that do not
     * look like short links are returned unchanged without any network access.
     *
     * @param urlstring the URL to resolve; {@code null} is returned as {@code null}
     * @return the resolved URL; if a lookup fails with an {@link IOException},
     *         the failure is logged and the URL resolved so far (possibly the
     *         input itself) is returned
     */
    public static String unShorten(String urlstring) {
        try {
            for (int step = 0; step < MAX_UNSHORTEN_STEPS && isApplicable(urlstring); step++) {
                String unshortened = ClientConnection.getRedirect(urlstring);
                // Defensive null check: the contract of ClientConnection.getRedirect
                // is not visible from this class.
                if (unshortened == null || unshortened.equals(urlstring)) {
                    return urlstring;
                }
                // apply the unshortener again because some links are shortened several times
                urlstring = unshortened;
            }
            return urlstring;
        } catch (IOException e) {
            DAO.log("UNSHORTEN failed for " + urlstring);
            return urlstring;
        }
    }

    /**
     * Decides whether a URL looks like a shortened link that is worth resolving.
     * <p>
     * A URL qualifies if it uses http or https and either its host (followed by
     * a path) is one of the listed shortener hosts, or it is very short: host
     * shorter than 8 characters, fewer than 23 characters after the scheme, and
     * not ending in "/" or "html".
     *
     * @param urlstring the URL to inspect; may be {@code null}
     * @return {@code true} if the URL should be passed to the unshortener
     */
    private static boolean isApplicable(String urlstring) {
        if (urlstring == null) {
            return false;
        }
        // Locale.ROOT keeps the result independent of the default locale
        // (e.g. the Turkish dotless i would break "BIT.LY").
        String s = urlstring.toLowerCase(Locale.ROOT);
        if (s.startsWith("https://")) {
            s = s.substring(8);
        } else if (s.startsWith("http://")) {
            s = s.substring(7);
        } else {
            return false;
        }

        int slp = s.indexOf('/');
        if (slp >= 0) {
            // a known host only counts when it is followed by a path
            String host = s.substring(0, slp);
            if (isListed(workingHosts, host) || isListed(untestedHosts, host)) {
                return true;
            }
        }

        // very short, because of SEO mostly urls are very long. lets try that
        int domlength = slp < 0 ? s.length() : slp;
        return domlength < 8 && s.length() < 23 && !s.endsWith("/") && !s.endsWith("html");
    }

    /** Returns whether {@code host} equals one of the entries of {@code hosts}. */
    private static boolean isListed(String[] hosts, String host) {
        for (String candidate : hosts) {
            if (candidate.equals(host)) {
                return true;
            }
        }
        return false;
    }

    /**
     * This is the raw implementation of ClientConnection.getRedirect.
     * Surprisingly it's much slower, but some redirects cannot be discovered with the other
     * method, but with this one.
     * <p>
     * Sends a plain (unencrypted) HTTP/1.1 GET over a socket and reads the
     * response headers only. The connection goes to the port given in the URL,
     * or to port 80 if the URL names none.
     *
     * @param urlstring the URL to look up
     * @return the redirect target if the server answers with a redirect status
     *         (301, 302, 303, 307 or 308) and a non-empty Location header;
     *         otherwise {@code urlstring} unchanged
     * @throws IOException if the URL is malformed or the connection fails or times out
     */
    private static String getRedirect(String urlstring) throws IOException {
        URL url = new URL(urlstring);
        String host = url.getHost();
        int port = url.getPort() < 0 ? 80 : url.getPort();
        // getFile() is path plus query; an empty path must be requested as "/"
        String file = url.getFile();
        if (file.isEmpty()) {
            file = "/";
        }

        // Closing the socket also closes its streams, so the socket is the only
        // resource that has to be managed; it is released on every exit path.
        try (Socket socket = new Socket()) {
            socket.connect(new InetSocketAddress(host, port), SOCKET_TIMEOUT_MS);
            socket.setSoTimeout(SOCKET_TIMEOUT_MS);
            PrintWriter out = new PrintWriter(socket.getOutputStream(), false);
            BufferedReader in = new BufferedReader(new InputStreamReader(socket.getInputStream()));

            // assemble the whole request first so that it is sent with a single flush
            StringBuilder request = new StringBuilder();
            request.append("GET ").append(file).append(" HTTP/1.1").append(CRLF);
            request.append("Host: ").append(port == 80 ? host : host + ":" + port).append(CRLF);
            // fake a bit that we are real
            request.append("User-Agent: ").append(ClientConnection.USER_AGENT).append(CRLF);
            request.append("Accept-Language: en-us,en;q=0.5").append(CRLF);
            request.append("Accept-Encoding: gzip,deflate").append(CRLF);
            request.append("Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7").append(CRLF);
            request.append("Keep-Alive: 300").append(CRLF);
            request.append("Connection: keep-alive").append(CRLF);
            request.append("Pragma: no-cache").append(CRLF);
            request.append("Cache-Control: no-cache").append(CRLF);
            request.append(CRLF); // don't forget the empty line at the end
            out.print(request.toString());
            // checkError() flushes; PrintWriter does not throw on write failures itself
            if (out.checkError()) {
                throw new IOException("cannot send request to " + host);
            }

            // first line should be e.g. "HTTP/1.1 301 Moved Permanently"
            String line = in.readLine();
            if (line == null || !isRedirectStatus(line)) {
                return urlstring;
            }
            // skip most of the next lines, but one should start with "Location:"
            while ((line = in.readLine()) != null) {
                if (line.isEmpty()) {
                    break; // end of the header section
                }
                if (line.regionMatches(true, 0, "location:", 0, 9)) {
                    String location = line.substring(9).trim();
                    return location.isEmpty() ? urlstring : resolveLocation(url, location);
                }
            }
            return urlstring;
        }
    }

    /**
     * Returns whether an HTTP status line carries one of the redirect status
     * codes 301, 302, 303, 307 or 308.
     */
    private static boolean isRedirectStatus(String statusLine) {
        String[] parts = statusLine.trim().split("\\s+", 3);
        if (parts.length < 2 || !parts[0].startsWith("HTTP/")) {
            return false;
        }
        String code = parts[1];
        return code.equals("301") || code.equals("302") || code.equals("303")
            || code.equals("307") || code.equals("308");
    }

    /**
     * Turns the value of a Location header into an absolute URL string.
     * Values that already start with http:// or https:// are returned as they
     * are; anything else is resolved relative to {@code base}. If that fails,
     * the raw value is returned.
     */
    private static String resolveLocation(URL base, String location) {
        if (location.regionMatches(true, 0, "http://", 0, 7)
            || location.regionMatches(true, 0, "https://", 0, 8)) {
            return location;
        }
        try {
            return new URL(base, location).toString();
        } catch (MalformedURLException e) {
            return location;
        }
    }

    /**
     * Manual test: resolves a list of sample short links with both the raw
     * socket implementation and ClientConnection.getRedirect and prints the
     * result and the elapsed time in milliseconds of each lookup.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        String[] test = new String[] {
            "http://tmblr.co/Z6YPNx1jL1hHK",
            "http://dlvr.it/8kTDbJ",
            "http://fb.me/4lcXZsyyO",
            "http://wp.me/p4yQu6-za0",
            "http://j.mp/1vfXKr0",
            "http://t.co/E3w7s2qdBT",
            "http://bit.ly/1h9gTTT",
            "http://ift.tt/1I2O4pF",
            "http://goo.gl/R9CVuz",
            "http://tinyurl.com/pcp7fu4",
            "http://ow.ly/JtOPA",
            "http://tiny.cc/60ohux",
            "http://bit.do/ZwrT",
            "http://amzn.to/MO51If"
        };
        for (String t : test) {
            // raw socket implementation
            try {
                long start = System.currentTimeMillis();
                System.out.println("Test \"" + t + "\" -> " + getRedirect(t));
                System.out.println("time: " + (System.currentTimeMillis() - start));
            } catch (IOException e) {
                e.printStackTrace();
            }
            // http client implementation
            try {
                long start = System.currentTimeMillis();
                System.out.println("Test \"" + t + "\" -> " + ClientConnection.getRedirect(t));
                System.out.println("time: " + (System.currentTimeMillis() - start));
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /*
     * does not work:
     * https://tr.im/v31Rf
     * http://dlvr.it/8htd6W // works on terminal but not here
     */
}