package com.joe.utilities.common.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ReadURL {
public static String getOneHtml(final String htmlurl) throws IOException
{
URL url;
String temp;
final StringBuffer sb = new StringBuffer();
try
{
url = new URL(htmlurl);
final BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream(), "gb2312"));
while ((temp = in.readLine()) != null)
{
sb.append(temp);
}
in.close();
}
catch (final MalformedURLException me)
{
me.getMessage();
throw me;
}
catch (final IOException e)
{
e.printStackTrace();
throw e;
}
return sb.toString();
}
public static String getTitle(final String s)
{
String regex;
String title = "";
final List<String> list = new ArrayList<String>();
regex = "<title>.*?</title>";
final Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
final Matcher ma = pa.matcher(s);
while (ma.find())
{
list.add(ma.group());
}
for (int i = 0; i < list.size(); i++)
{
title = title + list.get(i);
}
return outTag(title);
}
public static List<String> getTagContent(final String html, String startTag, String endTag, Boolean returnOne)
{
String regex;
final List<String> list = new ArrayList<String>();
regex = startTag+".*?"+endTag;
final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
final Matcher ma = pa.matcher(html);
while (ma.find())
{
list.add(ma.group());
if(returnOne != null && returnOne)
break;
}
if(list.size() <= 0)
list.add("");
return list;
}
public static List<String> getLink(final String s)
{
String regex;
final List<String> list = new ArrayList<String>();
regex = "<a[^>]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)</a>";
final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
final Matcher ma = pa.matcher(s);
while (ma.find())
{
list.add(ma.group());
}
return list;
}
public static List<String> getScript(final String s)
{
String regex;
final List<String> list = new ArrayList<String>();
regex = "<script.*?</script>";
final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
final Matcher ma = pa.matcher(s);
while (ma.find())
{
list.add(ma.group());
}
return list;
}
public static List<String> getCSS(final String s)
{
String regex;
final List<String> list = new ArrayList<String>();
regex = "<style.*?</style>";
final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
final Matcher ma = pa.matcher(s);
while (ma.find())
{
list.add(ma.group());
}
return list;
}
public static String outTag(final String s)
{
return s.replaceAll("<.*?>", "");
}
}