import org.jsoup.Jsoup; import org.jsoup.select.Elements; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import java.sql.*; import org.h2.tools.*; import java.net.*; import java.util.Date; import java.text.DateFormat; import java.text.SimpleDateFormat; /** * * Dumper * * @version 1.0 vom 27.05.2014 * @author Daniel Ruf */ public class dumper { public static void main(String[] args) throws Exception { //java.security.Security.setProperty("networkaddress.cache.ttl" , "0"); //java.security.Security.setProperty("networkaddress.cache.negative.ttl" , "0"); try{ String version = "1.0.0"; String program = "Dumper"; System.out.println(program + " " + version ); String domain ="homepage-baukasten.de"; String ending =".de.tl"; Class.forName("org.h2.Driver"); Runtime.getRuntime().addShutdownHook(new Thread() { @Override public void run() { try { Connection conn = DriverManager.getConnection("jdbc:h2:./test;MODE=MySQL;MV_STORE=FALSE;MVCC=FALSE;MAX_COMPACT_TIME=2000", "sa", ""); conn.createStatement().execute("SHUTDOWN COMPACT"); conn.close(); } catch(Exception e) { } } }); Connection conn = DriverManager.getConnection("jdbc:h2:./test;MODE=MySQL;MV_STORE=FALSE;MVCC=FALSE;MAX_COMPACT_TIME=2000", "sa", ""); conn.createStatement().execute("CREATE TABLE IF NOT EXISTS WEBSITES(ID BIGINT auto_increment, NAME VARCHAR)"); conn.createStatement().execute("ALTER TABLE WEBSITES ADD CONSTRAINT IF NOT EXISTS NAME_UNIQUE UNIQUE(NAME)"); conn.createStatement().execute("ALTER TABLE WEBSITES DROP COLUMN IF EXISTS ID"); conn.createStatement().execute("ALTER TABLE WEBSITES ADD COLUMN IF NOT EXISTS ID BIGINT auto_increment BEFORE NAME"); conn.createStatement().execute("SHUTDOWN COMPACT"); conn.close(); int dataset=0; String padding=""; while (true) { try { //getPages("http://www.homepage-baukasten.de/forum/viewonline.php", "div.forum_main a.gen", conn); DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss"); Date date = new Date(); int websites = getPages("http://www."+domain+"/forum/viewonline.php", "div.forum_main span.gen a", domain, ending); if (websites>0) { dataset++; if (dataset<10) { padding="0000"; } else if(dataset<100){ padding="000"; } else if(dataset<1000){ padding="00"; } else if(dataset<10000){ padding="0"; } else { padding=""; } // end of if-else // end of if System.out.println(padding+""+dataset+"\t"+dateFormat.format(date)+"("+System.currentTimeMillis() / 1000+")\tAdded "+websites+" website(s)"); } // end of if Thread.sleep(1000*5); } catch(InterruptedException ex) { Thread.currentThread().interrupt(); } } // end of while } catch(Exception e) { System.out.println(e); } //conn.close(); } // end of main //private static void getPages(String url, String selector, Connection conn) { private static int getPages(String url, String selector, String domain, String ending) { try { Document doc = Jsoup.connect(url).get(); Elements elements = doc.select(selector); int websites = 0; for (Element element : elements) { if (element.attr("abs:href").toString().contains("http://www."+domain+"/profile.php?of=")) { Connection conn = DriverManager.getConnection("jdbc:h2:./test;MODE=MySQL;MV_STORE=FALSE;MVCC=FALSE;MAX_COMPACT_TIME=2000", "sa", ""); String test = element.attr("abs:href").toString().replace("http://www."+domain+"/profile.php?of=",""); //String[] parts = test.split("&sid="); String website = test+""+ending; website = website.toLowerCase(); ResultSet records = conn.createStatement().executeQuery("SELECT COUNT(ID) FROM WEBSITES WHERE NAME = '"+website+"'"); records.next(); boolean recordExists = records.getInt(1)!=0; if(!recordExists){ conn.createStatement().executeUpdate("INSERT INTO WEBSITES(NAME) VALUES('"+website+"')"); websites++; } // end of if conn.close(); } // end of if } return websites; } catch(Exception e) { System.out.println(e.getMessage()); return 0; } } } // end of class dumper