Java Examples for org.htmlcleaner.TagNode
The following Java examples illustrate how org.htmlcleaner.TagNode is used. These source code samples are taken from different open source projects.
Example 1
| Project: pair-java-master File: HtmlParserUtil.java View source code |
/**
 * Parses raw HTML into a W3C DOM document by way of HtmlCleaner.
 *
 * @param htmlContent the HTML markup to parse
 * @return the DOM built from the cleaned markup, or {@code null} when cleaning or
 *         DOM creation fails with a ParserConfigurationException or a RuntimeException
 */
public static Document getHtmlDocumentModel(String htmlContent) {
    try {
        TagNode cleanedRoot = new HtmlCleaner().clean(htmlContent);
        return new DomSerializer(new CleanerProperties()).createDOM(cleanedRoot);
    } catch (ParserConfigurationException | RuntimeException failure) {
        // Every failure is reported to the caller as "no document".
        return null;
    }
}Example 2
| Project: dungproxy-master File: XmlModeFetcher.java View source code |
/**
 * Cleans the given HTML and lets {@code innerFetch} fill a list from it.
 *
 * @param html the markup to process
 * @return the list populated by {@code innerFetch}, or {@code null} when the
 *         cleaner produces no root node
 */
public List<String> fetch(String html) {
    TagNode cleanedRoot = new HtmlCleaner().clean(html);
    if (cleanedRoot == null) {
        return null;
    }
    List<String> collected = new ArrayList<String>();
    // The same NodeData instance is passed for both NodeData arguments.
    NodeData pageData = new NodeData(null, "page");
    innerFetch(collected, doc.getRootElement(), pageData, pageData, cleanedRoot, null);
    return collected;
}Example 3
| Project: FastHub-master File: TableHandler.java View source code |
/**
 * Walks a node tree and feeds it into the given table: td/th elements become
 * cells (and are not descended into), tr elements start a new row, and every
 * other element is only traversed.
 *
 * @param node  the node to inspect; anything that is not a TagNode is ignored
 * @param table the table being filled
 */
private void readNode(Object node, Table table) {
    if (!(node instanceof TagNode)) {
        return;
    }
    TagNode element = (TagNode) node;
    String elementName = element.getName();

    boolean isCell = elementName.equals("td") || elementName.equals("th");
    if (isCell) {
        // A cell is rendered as a whole; its children are not visited separately.
        table.addCell(this.getSpanner().fromTagNode(element));
        return;
    }

    if (elementName.equals("tr")) {
        table.addRow();
    }
    for (Object childTag : element.getChildTags()) {
        readNode(childTag, table);
    }
}Example 4
| Project: HtmlSpanner-master File: CSSCompiler.java View source code |
/**
 * Tests whether a tag satisfies this matcher's tag-name and class constraints.
 *
 * @param tagNode the tag to test; {@code null} never matches
 * @return {@code true} when the tag name (if one is configured) equals the tag's
 *         name and the tag's class attribute equals the configured class name
 */
@Override
public boolean matches(TagNode tagNode) {
    if (tagNode == null) {
        return false;
    }

    // A configured, non-empty tag name must agree with the node's name.
    boolean tagNameGiven = tagName != null && tagName.length() > 0;
    if (tagNameGiven && !tagName.equals(tagNode.getName())) {
        return false;
    }

    String classAttribute = tagNode.getAttributeByName("class");
    if (classAttribute == null) {
        return false;
    }
    return classAttribute.equals(className);
}Example 5
| Project: PageTurner-master File: BookView.java View source code |
/**
 * Attaches an image span for a tag that references an image.
 * <p>
 * The image reference is read from the {@code src}, {@code href} or
 * {@code xlink:href} attribute, in that order. Inline {@code data:image} URIs are
 * Base64-decoded into a bitmap; other references are resolved through the spine
 * and either taken from the image cache or registered for a later callback.
 *
 * @param node    the tag being handled
 * @param builder the text being built; the span is applied from {@code start} to its current length
 * @param start   start offset of the tag's content in {@code builder}
 * @param end     end offset of the tag's content (not used by this handler)
 * @param span    the span stack (not used by this handler)
 */
@TargetApi(Build.VERSION_CODES.FROYO)
@Override
public void handleTagNode(TagNode node, SpannableStringBuilder builder, int start, int end, SpanStack span) {
    String src = node.getAttributeByName("src");
    if (src == null) {
        src = node.getAttributeByName("href");
    }
    if (src == null) {
        src = node.getAttributeByName("xlink:href");
    }
    if (src == null) {
        return;
    }
    // NOTE(review): as shown this appends an empty string; the upstream source may
    // append a placeholder character for the image span to cover - verify.
    builder.append("");
    if (src.startsWith("data:image")) {
        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.FROYO) {
            try {
                // Everything after the first comma is the Base64 payload.
                String dataString = src.substring(src.indexOf(',') + 1);
                byte[] binData = Base64.decode(dataString, Base64.DEFAULT);
                setImageSpan(builder, new BitmapDrawable(getContext().getResources(), BitmapFactory.decodeByteArray(binData, 0, binData.length)), start, builder.length());
            } catch (OutOfMemoryError | IllegalArgumentException ignored) {
                // Best effort: an image that is too large to decode or carries
                // invalid Base64 data is simply not shown.
            }
        }
    } else if (spine != null) {
        String resolvedHref = spine.resolveHref(src);
        if (textLoader.hasCachedImage(resolvedHref) && !fakeImages) {
            Drawable drawable = textLoader.getCachedImage(resolvedHref);
            setImageSpan(builder, drawable, start, builder.length());
            LOG.debug("Got cached href: " + resolvedHref);
        } else {
            LOG.debug("Loading href: " + resolvedHref);
            this.registerCallback(resolvedHref, new ImageCallback(resolvedHref, builder, start, builder.length(), fakeImages));
        }
    }
}Example 6
| Project: confluence2wordpress-master File: DefaultConverter.java View source code |
/**
 * Converts a page to HTML: pre-processes the storage body, renders it, cleans the
 * rendered markup with HtmlCleaner, runs the configured visitors over the body
 * element, serializes that element and finally applies the post-processors.
 * <p>
 * The page title is temporarily replaced by the title from the options; the
 * original title is restored in the finally block whether or not conversion succeeds.
 *
 * @param page    the page to convert
 * @param options the options driving pre-processing, cleaning, visiting and post-processing
 * @return the post-processed HTML
 * @throws ConversionException not thrown directly in this body; presumably raised by
 *         the helpers it calls (e.g. handleConversionErrors) - confirm
 */
public String convert(ContentEntityObject page, ConverterOptions options) throws ConversionException {
String originalTitle = page.getTitle();
try {
//temporarily replace page title to get correct anchors
//(I know it's ugly)
page.setTitle(options.getPageTitle());
String storage = page.getBodyAsString();
PageContext pageContext = page.toPageContext();
DefaultConversionContext conversionContext = new DefaultConversionContext(pageContext);
//storage pre-processing
// Each pre-processor receives the output of the previous one.
List<PreProcessor> preProcessors = getPreProcessors(options, conversionContext);
for (PreProcessor preProcessor : preProcessors) {
storage = preProcessor.preProcess(storage, options, pageContext);
}
//wiki -> html conversion
String view = renderer.render(storage, conversionContext);
handleConversionErrors(view);
//HTML cleanup
HtmlCleaner cleaner = getHtmlCleaner(options);
TagNode root = cleaner.clean(view);
// Non-recursive lookup: only direct children of the root are searched.
// NOTE(review): body is dereferenced below without a null check - confirm the
// cleaner always produces a body element.
TagNode body = root.findElementByName("body", false);
//DOM traversal
List<TagNodeVisitor> visitors = getTagNodeVisitors(options, page);
for (TagNodeVisitor visitor : visitors) {
body.traverse(visitor);
}
//serialization
String html = serialize(body, cleaner.getProperties(), options);
//HTML post-processing
// Each post-processor receives the output of the previous one.
List<PostProcessor> postProcessors = getPostProcessors(options);
for (PostProcessor postProcessor : postProcessors) {
html = postProcessor.postProcess(html, body, options);
}
return html;
} finally {
page.setTitle(originalTitle);
}
}Example 7
| Project: en-webmagic-master File: Xpath2Selector.java View source code |
/**
 * Evaluates this selector's XPath expression against the given HTML text.
 * <p>
 * The expression is first evaluated as a node set; if that raises an
 * XPathExpressionException it is evaluated again as a string. For a node set,
 * only the first node is used: attribute and text nodes yield their text
 * content, other nodes are serialized without an XML declaration.
 *
 * @param text the HTML to query
 * @return the selected value, or {@code null} when the node set is empty or any
 *         exception occurs (the exception is logged)
 */
@Override
public String select(String text) {
    try {
        TagNode cleanedRoot = new HtmlCleaner().clean(text);
        Document dom = new DomSerializer(new CleanerProperties()).createDOM(cleanedRoot);

        Object evaluated;
        try {
            evaluated = xPathExpression.evaluate(dom, XPathConstants.NODESET);
        } catch (XPathExpressionException notANodeSet) {
            evaluated = xPathExpression.evaluate(dom, XPathConstants.STRING);
        }

        if (!(evaluated instanceof NodeList)) {
            return evaluated.toString();
        }

        NodeList matches = (NodeList) evaluated;
        if (matches.getLength() == 0) {
            return null;
        }

        Node first = matches.item(0);
        short nodeType = first.getNodeType();
        if (nodeType == Node.ATTRIBUTE_NODE || nodeType == Node.TEXT_NODE) {
            return first.getTextContent();
        }

        // Any other node type is serialized as markup.
        StringWriter markup = new StringWriter();
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        transformer.transform(new DOMSource(first), new StreamResult(markup));
        return markup.toString();
    } catch (Exception e) {
        logger.error("select text error! " + xpathStr, e);
    }
    return null;
}Example 8
| Project: epublib-master File: HHCParser.java View source code |
/**
 * Parses an HHC stream into table-of-contents references.
 * <p>
 * The stream is cleaned with HtmlCleaner, converted to a DOM, and the node
 * selected by the XPath {@code body/ul} (relative to the document element) is
 * handed to {@code processUlNode}.
 *
 * @param hhcFile   the HHC content
 * @param resources the resources passed through to {@code processUlNode}
 * @return the references produced by {@code processUlNode}
 * @throws IOException                  if reading the stream fails
 * @throws ParserConfigurationException if the DOM cannot be created
 * @throws XPathExpressionException     if the XPath evaluation fails
 */
public static List<TOCReference> parseHhc(InputStream hhcFile, Resources resources) throws IOException, ParserConfigurationException, XPathExpressionException {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties cleanerProps = cleaner.getProperties();
    TagNode cleanedRoot = cleaner.clean(hhcFile);
    Document dom = new DomSerializer(cleanerProps).createDOM(cleanedRoot);

    Node topLevelList = (Node) XPathFactory.newInstance()
            .newXPath()
            .evaluate("body/ul", dom.getDocumentElement(), XPathConstants.NODE);
    return processUlNode(topLevelList, resources);
}Example 9
| Project: jeboorker-master File: HTMLMetadataReader.java View source code |
/**
 * Extracts the meta data from the given <code>content</code>.
 * <p>
 * One property is produced per <code>meta</code> element and one per
 * <code>title</code> element found anywhere in the document.
 *
 * @param content The html content containing some meta data.
 * @return The extracted meta data. Never returns <code>null</code>.
 * @throws IOException if the content cannot be read by the cleaner.
 */
private List<MetadataProperty> extractMetadata(final String content) throws IOException {
    final TagNode root = new HtmlCleaner().clean(new StringReader(content));
    final List<MetadataProperty> properties = new ArrayList<>();

    //add meta tags
    for (final TagNode meta : root.getElementsByName("meta", true)) {
        String metaName = meta.getAttributeByName("name");
        final String metaContent = meta.getAttributeByName("content");
        if (metaName == null) {
            // No name attribute: fall back to the last attribute value that is
            // non-null and differs from the content.
            final Map<String, String> attributes = meta.getAttributes();
            for (final String candidate : attributes.values()) {
                if (candidate != null && !candidate.equals(metaContent)) {
                    metaName = candidate;
                }
            }
        }
        properties.add(new MetadataProperty(metaName, metaContent));
    }

    //add title tag
    for (final TagNode title : root.getElementsByName("title", true)) {
        final StringBuffer titleText = title.getText();
        properties.add(new MetadataProperty(COMMON_METADATA_TYPES.TITLE.getName(), titleText));
    }
    return properties;
}Example 10
| Project: Reforger-master File: Item.java View source code |
/**
 * Parses this item once: reads the name, item ID, data-item attributes and gem
 * IDs from {@code _data}, builds a Wowhead URL from them, fetches that URL,
 * extracts the payload matched by TOOLTIP_FORMAT, cleans it with HtmlCleaner and
 * traverses the result with this object as the visitor.
 * <p>
 * The method is synchronized and guarded by {@code _parsed}, so the work is
 * skipped on every call after the first one that runs to completion.
 * <p>
 * NOTE(review): the null checks below are {@code assert} statements, which only
 * run when assertions are enabled; without them a missing element leads to a
 * NullPointerException on the following dereference.
 */
public synchronized void parse() {
if (!_parsed) {
String[] pair;
String[] elements;
String attribute;
URL url = null;
StringBuilder wowhead = new StringBuilder("http://www.wowhead.com/");
TagNode ref = null;
//<editor-fold defaultstate="collapsed" desc="Parse name.">
ref = _data.findElementByAttValue("class", "name-shadow", true, true);
assert ref != null && ref.getText() != null : "Error: unable to determine item name.";
_name = (ref != null) ? StringEscapeUtils.unescapeHtml4(ref.getText().toString()) : "";
//</editor-fold>
//<editor-fold defaultstate="collapsed" desc="Parse item ID.">
ref = _data.findElementByName("a", false);
assert ref != null : "Error: unable to determine item attributes.";
attribute = ref.getAttributeByName("href");
// The item ID is whatever follows "/wow/en/item/" in the link.
elements = attribute.split("/wow/en/item/");
assert elements.length == 2 : "Error: unexpected Armory data format.";
wowhead.append("item=").append(elements[1]);
//</editor-fold>
//<editor-fold defaultstate="collapsed" desc="Extract data-item string.">
ref = _data.findElementByName("a", false);
assert ref != null : "Error: unable to determine item attributes.";
attribute = ref.getAttributeByName("data-item");
// A missing data-item attribute is treated as an empty string.
elements = StringEscapeUtils.unescapeHtml4((attribute != null) ? attribute : "").split("&");
//<editor-fold defaultstate="collapsed" desc="Parse Armory data-item attributes.">
// Each element is a key=value pair; known keys are translated to Wowhead parameters.
// NOTE(review): pair[1] is read without a length check, so a matching key
// without "=value" throws ArrayIndexOutOfBoundsException.
for (String e : elements) {
pair = e.split("=");
if ("e".equals(pair[0])) {
// Permanent Enchantment
wowhead.append("&ench=").append(pair[1]);
}
if ("re".equals(pair[0])) {
// Reforge ID (not currently supported by Wowhead)
wowhead.append("&rf=").append(pair[1]);
}
if ("es".equals(pair[0])) {
// Additional Socket
wowhead.append("&sock");
}
if ("r".equals(pair[0])) {
// Random Itemization
wowhead.append("&rand=").append(pair[1]);
}
if ("set".equals(pair[0])) {
// Set Pieces Equipped
wowhead.append("&pcs=").append(pair[1].replace(',', ':'));
}
}
//</editor-fold>
//<editor-fold defaultstate="collapsed" desc="Parse Armory gem ID's.">
TagNode[] gems = _data.getElementsByAttValue("class", "gem", true, true);
final int GEM_COUNT = gems.length;
if (GEM_COUNT != 0) {
String suffix;
wowhead.append("&gems=");
// Gem IDs are joined with ':' (no separator after the last one).
for (int i = 0; i < GEM_COUNT; ++i) {
suffix = gems[i].getAttributeByName("href").replace("/wow/en/item/", "");
wowhead.append(suffix);
if (i + 1 < GEM_COUNT) {
wowhead.append(":");
}
}
}
//</editor-fold>
wowhead.append("&power");
System.out.println(" " + _name);
//<editor-fold defaultstate="collapsed" desc="Download and parse Wowhead JSON data.">
try {
url = new URL(wowhead.toString());
} catch (Exception e) {
Logger.getLogger(Item.class.getSimpleName()).log(Level.SEVERE, null, e);
}
// NOTE(review): if URL construction failed above, url is still null when it is
// passed to fetchContents - confirm that helper tolerates null.
Pattern p = Pattern.compile(TOOLTIP_FORMAT);
Matcher m = p.matcher(URLRetriever.fetchContents(url));
// Without a match an empty payload is cleaned instead.
String itemPayload = (m.find()) ? m.group(1) : "";
HtmlCleaner parser = new HtmlCleaner();
CleanerTransformations transform = new CleanerTransformations();
// Single-argument transformation for "small"; the local name suggests the tag
// is stripped - confirm against the HtmlCleaner documentation.
TagTransformation strip = new TagTransformation("small");
transform.addTransformation(strip);
parser.setTransformations(transform);
TagNode root = parser.clean(itemPayload);
//</editor-fold>
root.traverse(this);
System.out.println();
_parsed = true;
}
}Example 11
| Project: WebGatherer---Scraper-and-Analyzer-master File: HtmlParserImpl.java View source code |
/**
 * Collects the links of a page, keyed by their lower-cased, trimmed link text.
 * <p>
 * Only anchors that carry an href attribute are considered. When several
 * anchors share the same text, the last one wins.
 *
 * @param baseUrl  the base URL handed to {@code getRelativeLink}
 * @param htmlPage the HTML to scan
 * @return a map from link text to the URL returned by {@code getRelativeLink}
 */
public Map<String, String> extractLinks(String baseUrl, String htmlPage) {
    TagNode root = htmlCleaner.clean(htmlPage);
    TagNode[] anchors = root.getElementsByName("a", true);
    Map<String, String> linksByText = new HashMap<String, String>();

    for (TagNode anchor : anchors) {
        Map<String, String> anchorAttributes = anchor.getAttributes();
        if (!anchorAttributes.containsKey("href")) {
            continue;
        }
        String target = getRelativeLink(anchor.getAttributeByName("href").trim(), baseUrl);
        String label = anchor.getText().toString().toLowerCase().trim();
        linksByText.put(label, target);
    }
    return linksByText;
}Example 12
| Project: agile-stock-master File: EtnetQuoteFetcher.java View source code |
/**
 * Downloads the quote page for the given stock and extracts its details.
 * <p>
 * The page is cut down to the part between two marker strings before it is
 * cleaned; update time, price, price change (absolute and percent), day high,
 * day low and volume are then read through XPath lookups.
 *
 * @param quote the stock quote to fetch
 * @return the populated stock detail
 * @throws DownloadException if the HTTP request or reading the response fails
 * @throws ParseException    if an XPath lookup fails or the update date cannot be parsed
 */
@Override
public StockDetail fetch(String quote) {
StockDetail detail = new StockDetail();
String url = getUrl(quote);
HttpGet req = new HttpGet(url);
try {
detail.setQuote(quote);
detail.setSourceUrl(url);
// download html
HttpResponse resp = getClient().execute(req);
String html = EntityUtils.toString(resp.getEntity());
// optimization to reduce html size
// NOTE(review): indexOf returns -1 when a marker is missing; the outcome then
// depends on how StringUtils.substring treats negative indexes - verify.
int start = html.indexOf("<!-- Content -->");
int end = html.indexOf("top:-1000px;\">");
html = StringUtils.substring(html, start, end);
TagNode document = getCleaner().clean(html);
// Only drops the local reference to the response.
resp = null;
// set updatedAt
SimpleDateFormat formatter = new SimpleDateFormat(DATE_FORMAT);
String updatedAtStr = getFirstMatchedElementContent(document, XPATH_UPDATE);
Date updatedDate = formatter.parse(updatedAtStr);
Calendar updatedAt = Calendar.getInstance();
updatedAt.setTime(updatedDate);
detail.setUpdatedAt(updatedAt);
// All remaining lookups are relative to this element.
TagNode table = getFirstMatchedElement(document, XPATH_BASE);
// set price
String pricesStr = getFirstMatchedElementContent(table, XPATH_PRICE);
BigDecimal price = new BigDecimal(pricesStr);
detail.setPrice(price);
// set price change and change %
// Group 1 is the absolute change, group 2 the percentage; both stay unset
// when the pattern does not match.
String priceChangesStr = getFirstMatchedElementContent(table, XPATH_PRICE_CHANGE);
Matcher priceChangeMatcher = PATTERN_PRICE_CHANGE.matcher(priceChangesStr);
if (priceChangeMatcher.find()) {
String priceChangeNumStr = priceChangeMatcher.group(1);
BigDecimal priceChangeNum = new BigDecimal(priceChangeNumStr);
detail.setChangePrice(priceChangeNum);
String priceChangePercentStr = priceChangeMatcher.group(2);
BigDecimal priceChangePercent = new BigDecimal(priceChangePercentStr);
detail.setChangePricePercent(priceChangePercent);
}
String dayHighStr = getFirstMatchedElementContent(table, XPATH_DAY_HIGH);
BigDecimal dayHigh = new BigDecimal(dayHighStr);
detail.setDayHigh(dayHigh);
String dayLowStr = getFirstMatchedElementContent(table, XPATH_DAY_LOW);
BigDecimal dayLow = new BigDecimal(dayLowStr);
detail.setDayLow(dayLow);
// Volume is stored as the raw string, not parsed as a number.
String volume = getFirstMatchedElementContent(table, XPATH_DAY_VOLUME);
detail.setVolume(volume);
} catch (ClientProtocolException e) {
throw new DownloadException("error fetching stock", e);
} catch (IOException e) {
throw new DownloadException("error fetching stock", e);
} catch (XPatherException e) {
throw new ParseException("unexpected result while fetch stock", e);
} catch (java.text.ParseException e) {
// Fully qualified: the unqualified ParseException here is the one thrown below.
throw new ParseException("date format unparsable", e);
}
return detail;
}Example 13
| Project: brezskrbnik-master File: Service2.java View source code |
/**
 * Downloads the document at the given URL and cleans it with HtmlCleaner.
 * <p>
 * The cleaner translates special entities, converts reserved characters to
 * numeric character references and omits comments.
 *
 * @param url the address of the document to clean
 * @return the cleaned root node, or {@code null} when the URL is malformed or
 *         reading fails (the stack trace is printed)
 */
public TagNode xmlCleaner(String url) {
    CleanerProperties cleanerProps = new CleanerProperties();
    cleanerProps.setTranslateSpecialEntities(true);
    cleanerProps.setTransResCharsToNCR(true);
    cleanerProps.setOmitComments(true);

    try {
        return new HtmlCleaner(cleanerProps).clean(new URL(url));
    } catch (MalformedURLException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}
Example 14
| Project: MadStore-master File: PreprocessingStage.java View source code |
/**
 * Cleans a page's markup and returns a new page holding the result as compact XML.
 *
 * @param page the page whose data is cleaned
 * @return a new page with the same link and the cleaned content, or {@code null}
 *         when any exception occurs (the exception is logged as a warning)
 */
public Page execute(Page page) {
    try {
        LOG.info("Cleaning up page: {}", page.getLink());

        HtmlCleaner cleaner = new HtmlCleaner();
        CleanerProperties settings = cleaner.getProperties();
        settings.setOmitComments(true);
        settings.setTranslateSpecialEntities(false);
        settings.setRecognizeUnicodeChars(false);
        settings.setOmitUnknownTags(true);
        settings.setOmitDoctypeDeclaration(false);
        settings.setOmitXmlDeclaration(false);
        settings.setUseCdataForScriptAndStyle(true);

        TagNode cleanedRoot = cleaner.clean(page.getData());
        cleanedRoot.removeAttribute("xmlns:xml");

        XmlSerializer compactSerializer = new CompactXmlSerializer(settings);
        String cleanedPage = compactSerializer.getXmlAsString(cleanedRoot, "UTF-8");
        LOG.debug("Cleaned page: {}", cleanedPage);
        return new Page(page.getLink(), cleanedPage);
    } catch (Exception e) {
        LOG.warn(e.getMessage(), e);
        return null;
    }
}Example 15
| Project: RestFixtureLiveDoc-master File: HtmlSimplifier.java View source code |
/**
 * Serializes the given node and collapses runs of line breaks in the output.
 *
 * @param tagNode the node to serialize
 * @return the serialized and tidied markup
 * @throws RuntimeException wrapping any IOException raised by the serializer
 */
private String serializeAndSanitiseResult(TagNode tagNode) {
try {
String result = serializer.getAsString(tagNode, true);
// NOTE(review): as rendered here the pattern and the replacement look identical;
// the pattern may originally have been a non-breaking space or an entity - verify
// against the upstream source.
result = result.replaceAll(" ", " ");
// The same replacement is applied twice, so longer runs of doubled breaks are
// reduced further on the second pass.
result = result.replaceAll("<br />\n\n<br />\n\n", "<br />\n\n");
result = result.replaceAll("<br />\n\n<br />\n\n", "<br />\n\n");
result = result.replaceAll("<br />\n\n", "<br />\n");
result = result.replaceAll("<br />\n<br />\n", "<br />\n");
return result;
} catch (IOException e) {
throw new RuntimeException(e);
}
}Example 16
| Project: EasySOA-Incubation-master File: ScrapingStrategy.java View source code |
/**
 * Scans the page held by the browsing context for links whose target ends with
 * "?wsdl" (case-insensitive) and reports each of them as a found service.
 *
 * @param context the browsing context providing the page data and its URL
 * @return the services found; empty when the context has no data or the cleaner
 *         overflows the stack
 * @throws Exception declared by the signature; exceptions raised while handling
 *         an individual link are swallowed inside the loop
 */
@Override
public List<FoundService> findFromContext(BrowsingContext context) throws Exception {
List<FoundService> foundServices = new LinkedList<FoundService>();
if (context.getData() != null) {
URL url = context.getURL();
// Web page parsing
HtmlCleaner cleaner = new HtmlCleaner();
TagNode cleanHtml = null;
try {
cleanHtml = cleaner.clean(context.getData());
} catch (StackOverflowError e) {
log.warn("HtmlCleaner stack overflow while parsing " + url + ", aborting strategy");
return foundServices;
}
// Find app name
String applicationName = guessApplicationName(context);
// Find links
List<String> foundServicesNames = new LinkedList<String>();
Object[] links = cleanHtml.evaluateXPath("//a");
changeToAbsolutePath(links, "href", url);
for (Object o : links) {
TagNode link = (TagNode) o;
try {
String linkHref = link.getAttributeByName("href");
if (linkHref == null) {
// NB. happens in some bad html
continue;
}
String ref = new URL(url, linkHref).toString();
String name = (link.getText() != null) ? link.getText().toString() : // TODO else title attr
ref;
// Truncate if name is an URL (serviceName cannot contain slashes)
// NOTE(review): the split pattern is the two-character sequence "/}", not "/" -
// verify that this is intended.
if (name.contains("/")) {
String[] nameParts = name.split("/}");
name = nameParts[nameParts.length - 1].replaceAll("(\\?|\\.|\\?wsdl)", // AND NOT 'wsdl' only (see below)
"");
}
// Append digits to the link name if it already exists
int i = 1;
if (ref != null && ref.toLowerCase().endsWith("?wsdl")) {
// AND NOT "wsdl" only (see below)
// NOTE(review): i is incremented twice on the first pass and the suffix handling
// drops exactly one trailing character - check the generated names.
while (foundServicesNames.contains(name)) {
name = (i == 1 ? name + i++ : name.substring(0, name.length() - 1)) + i++;
}
name = name.replaceAll("[\n\r]", "").trim();
String nameWithoutWsdl = name.replaceAll("([ ]*\\?WSDL|[ ]*\\?wsdl)", "").trim();
///name = name.replaceAll("([\n\r]|[ ]*WSDL|[ ]*wsdl)", "").trim();
// The stripped name is only used when something is left of it.
if (!nameWithoutWsdl.isEmpty()) {
// NOT REQUIRED ANYMORE
name = nameWithoutWsdl;
}
foundServices.add(new FoundService(name, ref, applicationName));
foundServicesNames.add(name);
}
} catch (Exception e) {
// Deliberately empty: a link that cannot be processed is skipped silently.
}
}
}
return foundServices;
}Example 17
| Project: EasySOA-master File: ScrapingStrategy.java View source code |
/**
 * Scans the page held by the browsing context for links whose target ends with
 * "wsdl" (case-insensitive) and reports each of them as a found service.
 *
 * @param context the browsing context providing the page data and its URL
 * @return the services found; empty when the context has no data or the cleaner
 *         overflows the stack
 * @throws Exception declared by the signature; malformed link URLs are swallowed
 *         inside the loop
 */
@Override
public List<FoundService> findFromContext(BrowsingContext context) throws Exception {
List<FoundService> foundServices = new LinkedList<FoundService>();
if (context.getData() != null) {
URL url = context.getURL();
// Web page parsing
HtmlCleaner cleaner = new HtmlCleaner();
TagNode cleanHtml = null;
try {
cleanHtml = cleaner.clean(context.getData());
} catch (StackOverflowError e) {
log.warn("HtmlCleaner stack overflow while parsing " + url + ", aborting strategy");
return foundServices;
}
// Find app name
String applicationName = guessApplicationName(url);
// Find links
List<String> foundServicesNames = new LinkedList<String>();
Object[] links = cleanHtml.evaluateXPath("//a");
changeToAbsolutePath(links, "href", url);
for (Object o : links) {
TagNode link = (TagNode) o;
try {
// The href is resolved against the page URL; it is not checked for null first.
String ref = new URL(url, link.getAttributeByName("href")).toString();
String name = (link.getText() != null) ? link.getText().toString() : ref;
// Truncate if name is an URL (serviceName cannot contain slashes)
// NOTE(review): the split pattern is the two-character sequence "/}", not "/" -
// verify that this is intended.
if (name.contains("/")) {
String[] nameParts = name.split("/}");
name = nameParts[nameParts.length - 1].replaceAll("(\\?|\\.|wsdl)", "");
}
// Append digits to the link name if it already exists
int i = 1;
if (ref != null && ref.toLowerCase().endsWith("wsdl")) {
// NOTE(review): i is incremented twice on the first pass and the suffix handling
// drops exactly one trailing character - check the generated names.
while (foundServicesNames.contains(name)) {
name = (i == 1 ? name + i++ : name.substring(0, name.length() - 1)) + i++;
}
name = name.replaceAll("([\n\r]|[ ]*WSDL|[ ]*wsdl)", "").trim();
foundServices.add(new FoundService(name, ref, applicationName));
foundServicesNames.add(name);
}
} catch (MalformedURLException e) {
// Deliberately empty: a link with an unusable URL is skipped silently.
}
}
}
return foundServices;
}Example 18
| Project: email-master File: HtmlSignatureRemover.java View source code |
/**
 * Removes a dash-style signature from HTML content and re-serializes the result
 * through HtmlCleaner.
 * <p>
 * Signatures inside blockquotes are left alone: the signature pattern is only
 * searched before the first blockquote, between a closing and the next opening
 * blockquote, and after the last closing blockquote. Content is cut at the first
 * signature found in one of those regions. If opening and closing blockquote
 * tags do not balance, nothing is stripped.
 *
 * @param content the HTML to process
 * @return the content, cut at the signature where one was found, after being
 *         cleaned and serialized
 */
public static String stripSignature(String content) {
Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content);
if (dashSignatureHtml.find()) {
Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content);
Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content);
// Offsets of every opening and closing blockquote match, in document order.
List<Integer> start = new ArrayList<>();
List<Integer> end = new ArrayList<>();
while (blockquoteStart.find()) {
start.add(blockquoteStart.start());
}
while (blockquoteEnd.find()) {
end.add(blockquoteEnd.start());
}
if (start.size() != end.size()) {
Timber.d("There are %d <blockquote> tags, but %d </blockquote> tags. Refusing to strip.", start.size(), end.size());
} else if (start.size() > 0) {
// Ignore quoted signatures in blockquotes.
dashSignatureHtml.region(0, start.get(0));
if (dashSignatureHtml.find()) {
// before first <blockquote>.
content = content.substring(0, dashSignatureHtml.start());
} else {
for (int i = 0; i < start.size() - 1; i++) {
// within blockquotes.
// Only gaps between a closing tag and the next opening tag are searched.
if (end.get(i) < start.get(i + 1)) {
dashSignatureHtml.region(end.get(i), start.get(i + 1));
if (dashSignatureHtml.find()) {
content = content.substring(0, dashSignatureHtml.start());
break;
}
}
}
// This check uses the current content, which may already have been cut above.
if (end.get(end.size() - 1) < content.length()) {
// after last </blockquote>.
dashSignatureHtml.region(end.get(end.size() - 1), content.length());
if (dashSignatureHtml.find()) {
content = content.substring(0, dashSignatureHtml.start());
}
}
}
} else {
// No blockquotes found.
// start() still refers to the match found by the first find() above.
content = content.substring(0, dashSignatureHtml.start());
}
}
// Fix the stripping off of closing tags if a signature was stripped,
// as well as clean up the HTML of the quoted message.
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties properties = cleaner.getProperties();
// see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
properties.setNamespacesAware(false);
properties.setAdvancedXmlEscape(false);
properties.setOmitXmlDeclaration(true);
properties.setOmitDoctypeDeclaration(false);
properties.setTranslateSpecialEntities(false);
properties.setRecognizeUnicodeChars(false);
TagNode node = cleaner.clean(content);
SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties);
content = htmlSerialized.getAsString(node, "UTF8");
return content;
}Example 19
| Project: FivewaysBusTimesAndroid-master File: BusTimeScraper.java View source code |
/**
 * Scrapes the bus board at the given URL into a list of buses.
 * <p>
 * The nodes selected by BUS_EXPR are read in groups of three (name, destination,
 * time). Groups with an empty name are skipped, and a trailing incomplete group
 * is ignored. The time is either a minutes offset from now ("mm mins") or a
 * clock time ("hh:mm"); a clock time that is already past is taken to be on
 * the next day.
 *
 * @param url the address of the bus board page
 * @return the buses found, in page order
 * @throws XPatherException             if the XPath evaluation fails
 * @throws ParserConfigurationException declared by the signature
 * @throws SAXException                 declared by the signature
 * @throws IOException                  if the page cannot be downloaded
 */
public static List<Bus> getBusesFromURL(String url) throws XPatherException, ParserConfigurationException, SAXException, IOException, XPatherException {
    HtmlCleaner cleaner = new HtmlCleaner();
    URL buses_url = new URL(url);
    URLConnection conn = buses_url.openConnection();
    TagNode node;
    InputStreamReader reader = new InputStreamReader(conn.getInputStream());
    try {
        node = cleaner.clean(reader);
    } finally {
        // Release the connection's stream even when cleaning fails.
        reader.close();
    }
    Object[] data_nodes = node.evaluateXPath(BUS_EXPR);
    // take the data in groups of three - if the first of the three is
    // bus number is blank then skip
    List<Bus> busList = new ArrayList<Bus>();
    // The bound keeps a trailing partial group from overrunning the array.
    for (int i = 0; i + 2 < data_nodes.length; i += 3) {
        String bus_name = ((TagNode) data_nodes[i]).getText().toString();
        String bus_dest = ((TagNode) data_nodes[i + 1]).getText().toString();
        String bus_time = ((TagNode) data_nodes[i + 2]).getText().toString();
        // Compare by content: a reference comparison against "" is practically always true.
        if (!"".equals(bus_name)) {
            // NOTE(review): as rendered here both arguments look identical; the first
            // may originally have been a non-breaking space or an entity - verify.
            bus_dest = bus_dest.replace(" ", " ");
            bus_time = bus_time.replace(" ", " ");
            // deal with the time object - this is either a time
            // or a minutes offset, convert the minutes offset to a real
            // bus time can have an appended * for timetabled time
            Calendar arrivetime = Calendar.getInstance();
            if (!bus_time.contains(":")) {
                // in the format 'mm mins' or 'm mins'
                int minutes_offset = Integer.parseInt(bus_time.substring(0, 2).trim());
                arrivetime.add(Calendar.MINUTE, minutes_offset);
            } else {
                int cpoint = bus_time.indexOf(':');
                int hour = Integer.parseInt(bus_time.substring(0, cpoint));
                int minutes = Integer.parseInt(bus_time.substring(cpoint + 1, cpoint + 3));
                Log.v(LOG_TAG, hour + " == " + minutes);
                // Snapshot "now" before arrivetime is modified; without the copy both
                // names refer to one object and the comparison below can never be true.
                Calendar timenow = (Calendar) arrivetime.clone();
                arrivetime.set(Calendar.HOUR_OF_DAY, hour);
                arrivetime.set(Calendar.MINUTE, minutes);
                // deal with midnight crossing
                if (arrivetime.before(timenow)) {
                    arrivetime.add(Calendar.HOUR_OF_DAY, 24);
                }
            }
            Bus b = new Bus(bus_name, bus_dest, arrivetime);
            busList.add(b);
            Log.v(LOG_TAG, b.toString());
        }
    }
    return busList;
}Example 20
| Project: k-9-master File: HtmlSignatureRemover.java View source code |
/**
 * Removes a dash-style signature from HTML content and re-serializes the result
 * through HtmlCleaner.
 * <p>
 * Signatures inside blockquotes are left alone: the signature pattern is only
 * searched before the first blockquote, between a closing and the next opening
 * blockquote, and after the last closing blockquote. Content is cut at the first
 * signature found in one of those regions. If opening and closing blockquote
 * tags do not balance, nothing is stripped.
 *
 * @param content the HTML to process
 * @return the content, cut at the signature where one was found, after being
 *         cleaned and serialized
 */
public static String stripSignature(String content) {
Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content);
if (dashSignatureHtml.find()) {
Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content);
Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content);
// Offsets of every opening and closing blockquote match, in document order.
List<Integer> start = new ArrayList<>();
List<Integer> end = new ArrayList<>();
while (blockquoteStart.find()) {
start.add(blockquoteStart.start());
}
while (blockquoteEnd.find()) {
end.add(blockquoteEnd.start());
}
if (start.size() != end.size()) {
Timber.d("There are %d <blockquote> tags, but %d </blockquote> tags. Refusing to strip.", start.size(), end.size());
} else if (start.size() > 0) {
// Ignore quoted signatures in blockquotes.
dashSignatureHtml.region(0, start.get(0));
if (dashSignatureHtml.find()) {
// before first <blockquote>.
content = content.substring(0, dashSignatureHtml.start());
} else {
for (int i = 0; i < start.size() - 1; i++) {
// within blockquotes.
// Only gaps between a closing tag and the next opening tag are searched.
if (end.get(i) < start.get(i + 1)) {
dashSignatureHtml.region(end.get(i), start.get(i + 1));
if (dashSignatureHtml.find()) {
content = content.substring(0, dashSignatureHtml.start());
break;
}
}
}
// This check uses the current content, which may already have been cut above.
if (end.get(end.size() - 1) < content.length()) {
// after last </blockquote>.
dashSignatureHtml.region(end.get(end.size() - 1), content.length());
if (dashSignatureHtml.find()) {
content = content.substring(0, dashSignatureHtml.start());
}
}
}
} else {
// No blockquotes found.
// start() still refers to the match found by the first find() above.
content = content.substring(0, dashSignatureHtml.start());
}
}
// Fix the stripping off of closing tags if a signature was stripped,
// as well as clean up the HTML of the quoted message.
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties properties = cleaner.getProperties();
// see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
properties.setNamespacesAware(false);
properties.setAdvancedXmlEscape(false);
properties.setOmitXmlDeclaration(true);
properties.setOmitDoctypeDeclaration(false);
properties.setTranslateSpecialEntities(false);
properties.setRecognizeUnicodeChars(false);
TagNode node = cleaner.clean(content);
SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties);
content = htmlSerialized.getAsString(node, "UTF8");
return content;
}Example 21
| Project: meaningfulweb-master File: ExtractUtils.java View source code |
/**
 * Recursively removes every attribute that HtmlExtractUtils does not consider
 * valid from all tag descendants of the given node. The attributes of
 * {@code parent} itself are not touched.
 *
 * @param parent the node whose descendants are cleaned
 */
public static void cleanInvalidAttributes(TagNode parent) {
    List children = parent.getChildren();
    if (children == null) {
        return;
    }

    for (Object child : children) {
        if (!(child instanceof TagNode)) {
            continue;
        }
        TagNode childTag = (TagNode) child;

        // Collect first, remove afterwards, so the attribute map is not
        // modified while it is being iterated.
        Map attributes = childTag.getAttributes();
        Set<String> invalidNames = new HashSet<String>();
        for (Object key : attributes.keySet()) {
            String attributeName = (String) key;
            if (!HtmlExtractUtils.isValidAttribute(attributeName)) {
                invalidNames.add(attributeName);
            }
        }
        for (String invalidName : invalidNames) {
            childTag.removeAttribute(invalidName);
        }

        cleanInvalidAttributes(childTag);
    }
}
Example 22
| Project: opensearchserver-master File: HtmlArchiver.java View source code |
/**
 * Rewrites the content of a style element when {@code checkCSSContent} (or XML
 * unescaping alone) changes it.
 * <p>
 * Nodes are left untouched unless they are style elements whose type is empty or
 * "text/css" and whose media is empty, "screen" or "all" (all comparisons ignore
 * case).
 *
 * @param node the node to inspect
 */
private final void checkStyleCSS(TagNode node) throws ClientProtocolException, IllegalStateException, IOException, SearchLibException, URISyntaxException {
    if (!"style".equalsIgnoreCase(node.getName())) {
        return;
    }

    String type = node.getAttributeByName("type");
    boolean typeAccepted = StringUtils.isEmpty(type) || "text/css".equalsIgnoreCase(type);
    if (!typeAccepted) {
        return;
    }

    String media = node.getAttributeByName("media");
    boolean mediaAccepted = StringUtils.isEmpty(media)
            || "screen".equalsIgnoreCase(media)
            || "all".equalsIgnoreCase(media);
    if (!mediaAccepted) {
        return;
    }

    StringBuilder rawText = (StringBuilder) node.getText();
    if (rawText == null) {
        return;
    }

    String originalCss = rawText.toString();
    String rewrittenCss = StringEscapeUtils.unescapeXml(originalCss);
    StringBuffer checked = checkCSSContent(baseUrl, rewrittenCss);
    if (checked != null) {
        rewrittenCss = checked.toString();
    }

    // Only replace the node's children when the text actually changed.
    if (rewrittenCss.equals(originalCss)) {
        return;
    }
    node.removeAllChildren();
    node.addChild(new ContentNode(rewrittenCss));
}Example 23
| Project: TL-android-app-master File: TLLib.java View source code |
/**
 * Logs in with the given credentials.
 * <p>
 * Any existing session is logged out first. A token is then read from the
 * {@code value} attribute of the first input element of the login page and
 * posted together with the credentials. The login counts as successful when the
 * HTTP client's cookie store holds at least two cookies afterwards.
 *
 * @param login   the user name
 * @param pw      the password
 * @param handler receives a PROGRESS_LOGIN message and is passed on to the page fetch
 * @param context passed on to the page fetch
 * @return {@code false} when no token could be read; otherwise the resulting login status
 * @throws IOException if a request fails
 */
public static boolean login(String login, String pw, Handler handler, Context context) throws IOException {
    handler.sendEmptyMessage(TLHandler.PROGRESS_LOGIN);
    logout();
    // Fetch the token
    HtmlCleaner cleaner = TLLib.buildDefaultHtmlCleaner();
    URL url = new URL(LOGIN_URL);
    TagNode node = TLLib.TagNodeFromURLLoginToken(cleaner, url, handler, context);
    String token = null;
    try {
        Object[] inputs = node.evaluateXPath("//input");
        // A page without any input element yields no token instead of an
        // ArrayIndexOutOfBoundsException.
        if (inputs.length > 0) {
            token = ((TagNode) inputs[0]).getAttributeByName("value");
        }
    } catch (XPatherException e1) {
        e1.printStackTrace();
    }
    if (token == null) {
        return false;
    }

    DefaultHttpClient httpclient = new DefaultHttpClient();
    HttpPost httpost = new HttpPost(LOGIN_URL);
    List<NameValuePair> nvps = new ArrayList<NameValuePair>();
    nvps.add(new BasicNameValuePair(USER_FIELD, login));
    nvps.add(new BasicNameValuePair(PASS_FIELD, pw));
    nvps.add(new BasicNameValuePair(REMEMBERME, "1"));
    nvps.add(new BasicNameValuePair("stage", "1"));
    nvps.add(new BasicNameValuePair("back_url", "/"));
    nvps.add(new BasicNameValuePair("token", token));
    // NOTE(review): this writes the login token to the debug log.
    Log.d("token:", token);
    tokenField = token;
    if (cookieStore != null) {
        httpclient.setCookieStore(cookieStore);
    }
    try {
        httpost.setEntity(new UrlEncodedFormEntity(nvps));
        HttpResponse response = httpclient.execute(httpost);
        HttpEntity entity = response.getEntity();
        // Ask the client for its store: the static cookieStore may still be null
        // here, and when it is not, it is the very store the client was given.
        if (httpclient.getCookieStore().getCookies().size() < 2) {
            loginName = null;
            loginStatus = false;
        } else {
            loginName = login;
            loginStatus = true;
            cookieStore = httpclient.getCookieStore();
        }
        if (entity != null) {
            entity.consumeContent();
        }
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    }
    return loginStatus;
}Example 24
| Project: zseinfo-master File: VulcanReplacementsHandler.java View source code |
/**
 * Downloads the replacements page and returns the node that holds the replacements.
 *
 * <p>When the page contains a {@code <table>}, its direct {@code <tbody>} child is returned
 * (which may be {@code null} if there is none). Otherwise, when the page contains a
 * {@code <frame>}, the document referenced by the frame's {@code src} attribute (appended to
 * the configured URL) is downloaded and its cleaned root node is returned.
 *
 * @return the {@code tbody} node of the first table, or the root node of the framed document
 * @throws IOException          if a server answers with a status other than 200 or reading fails
 * @throws NullPointerException if the page contains neither a table nor a frame
 */
private static TagNode downloadReplacements() throws IOException {
    HtmlCleaner cleaner = new HtmlCleaner();
    String url = Configuration.getInstance().getReplacementsConfig().getReplacementsUrl();
    TagNode node = fetchAndClean(cleaner, url);
    TagNode table = node.findElementByName("table", true);
    if (table != null) {
        return table.findElementByName("tbody", false);
    }
    TagNode frame = node.findElementByName("frame", true);
    if (frame == null) {
        // Same exception type the previous implementation let escape in this situation.
        throw new NullPointerException("No table or frame element found at " + url);
    }
    return fetchAndClean(cleaner, url + frame.getAttributeByName("src"));
}

/** Fetches {@code address}, fails on any non-200 response and returns the cleaned document root. */
private static TagNode fetchAndClean(HtmlCleaner cleaner, String address) throws IOException {
    HttpURLConnection connection = (HttpURLConnection) new URL(address).openConnection();
    if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
        throw new IOException("Błąd połączenia z serwerem: " + connection.getResponseCode() + " " + connection.getResponseMessage());
    }
    InputStream input = connection.getInputStream();
    try {
        return cleaner.clean(input, Configuration.getInstance().getReplacementsConfig().getEncoding());
    } finally {
        input.close();
    }
}
Example 25
| Project: AbianReader-master File: AbianReaderItemView.java View source code |
/**
 * Shows the RSS item at the given position in the web view.
 *
 * <p>The item's title, author line, optional featured image and content are wrapped in an HTML
 * page whose style sheet constrains {@code img}, {@code iframe} and {@code div} elements to 90%
 * of the scaled screen size (75% of the height when the screen is wider than tall). The page is
 * run through the HTML cleaner, explicit {@code width}/{@code height} attributes are stripped
 * from images and iframes, and the result is loaded into the web view. If no item exists at the
 * position an error is logged instead.
 *
 * @param itemPosition index of the item; also stored in {@code m_targetRssItemNumber}
 */
public void setTargetRssItem(int itemPosition) {
    m_targetRssItemNumber = itemPosition;
    AbianReaderItem item = AbianReaderApplication.getData().getItemNumber(itemPosition);
    if (item == null) {
        Log.e(TAG, "TheItem is null");
        return;
    }

    // Maximum element size, expressed in the web view's own (scaled) pixels.
    int screenWidth = AbianReaderApplication.s_width;
    int screenHeight = AbianReaderApplication.s_height;
    float scale = m_webView.getScale();
    float scaledHeight = screenHeight / scale;
    float maxWidth = (screenWidth / scale) * 0.9f;
    float maxHeight = (screenWidth > screenHeight) ? (scaledHeight * 0.75f) : (scaledHeight * 0.9f);

    String constraints = "{ "
            + "max-width: " + Integer.toString((int) maxWidth) + "; "
            + "max-height: " + Integer.toString((int) maxHeight) + "; "
            + "width: auto; "
            + "height: auto; "
            + "display: block; "
            + "margin-left: auto; "
            + "margin-right: auto; "
            + "}";

    // A viewport meta tag (target-densitydpi=device-dpi) could be added here to stop the
    // web view from scaling the page; it is intentionally left out.
    String head = "<head>"
            + "<style>"
            + "img " + constraints
            + "\niframe " + constraints
            + "\ndiv " + constraints
            + "</style>"
            + "</head>";

    StringBuilder page = new StringBuilder();
    page.append("<html>" + head + "<body><h2>" + item.getTitle() + "</h2>");
    page.append("<small>By " + item.getCreator() + " posted " + item.getPubDate() + "</small>");
    if (item.getFeaturedImageLink().length() != 0) {
        page.append("<br /><br />");
        page.append("<a href=\"" + item.getFeaturedImageLink() + "\">");
        page.append("<img src=\"" + item.getFeaturedImageLink() + "\" /> </a>");
    }
    page.append(item.getContent());
    page.append("<br /><br /></body></html>");
    String html = page.toString();

    // Explicit dimensions would override the max-width/max-height rules above.
    TagNode cleanRoot = m_htmlCleaner.clean(html);
    for (TagNode image : cleanRoot.getElementsByName("img", true)) {
        image.removeAttribute("width");
        image.removeAttribute("height");
    }
    for (TagNode frame : cleanRoot.getElementsByName("iframe", true)) {
        frame.removeAttribute("width");
        frame.removeAttribute("height");
    }

    try {
        html = m_htmlSerializer.getAsString(cleanRoot);
    } catch (IOException e) {
        // Fall back to the uncleaned markup built above.
        e.printStackTrace();
    }
    m_webView.loadDataWithBaseURL(null, html, "text/html", "UTF-8", null);
}Example 26
| Project: PressGangCCMSREST-master File: TopicSourceURLTitleThread.java View source code |
/**
 * Downloads the page behind the source URL and stores the text of its first {@code <title>}
 * element, trimmed and with common HTML entities decoded, as the title of the entity.
 * Nothing is set when the page has no title element; I/O failures are logged.
 *
 * @param topicSourceUrl entity providing the URL to fetch and receiving the title
 */
protected void setTitle(final TopicSourceUrl topicSourceUrl) {
    try {
        // HTML entities that commonly show up in title text, mapped to their literal characters
        final Map<String, String> replaceList = new HashMap<String, String>();
        replaceList.put("&nbsp;", " ");
        replaceList.put("&amp;", "&");
        // create an instance of HtmlCleaner
        final HtmlCleaner cleaner = new HtmlCleaner();
        // clean the source url
        final TagNode node = cleaner.clean(new URL(topicSourceUrl.getSourceUrl()));
        // find the first title node
        final TagNode title = node.findElementByName("title", true);
        if (title != null) {
            // clean up the title
            String titleText = title.getText().toString();
            for (final Map.Entry<String, String> replacement : replaceList.entrySet()) {
                // literal replacement: neither side is meant to be interpreted as a regex
                titleText = titleText.replace(replacement.getKey(), replacement.getValue());
            }
            titleText = titleText.trim();
            // assign it to the entity
            topicSourceUrl.setTitle(titleText);
        }
    } catch (final IOException ex) {
        LOGGER.error("Probably a problem with HTMLCleaner", ex);
    }
}Example 27
| Project: webmagic-master File: XpathSelectorTest.java View source code |
/**
 * Rough timing comparison of HtmlCleaner, Jsoup and Xsoup on the shared {@code html} fixture.
 * Each measured section runs 2000 iterations and prints the elapsed milliseconds; sections are
 * separated by a line of {@code =}. The HtmlCleaner section is measured twice.
 */
@Ignore("take long time")
@Test
public void parserPerformanceTest() throws XPatherException {
    System.out.println(html.length());
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode cleanedRoot = cleaner.clean(html);
    Document jsoupDocument = Jsoup.parse(html);

    // HtmlCleaner: parse, then XPath evaluation
    long start = System.currentTimeMillis();
    for (int run = 0; run < 2000; run++) {
        cleaner.clean(html);
    }
    System.out.println(System.currentTimeMillis() - start);
    start = System.currentTimeMillis();
    for (int run = 0; run < 2000; run++) {
        cleanedRoot.evaluateXPath("//a");
    }
    System.out.println(System.currentTimeMillis() - start);
    System.out.println("=============");

    // Jsoup: parse, then CSS selection
    start = System.currentTimeMillis();
    for (int run = 0; run < 2000; run++) {
        Jsoup.parse(html);
    }
    System.out.println(System.currentTimeMillis() - start);
    start = System.currentTimeMillis();
    for (int run = 0; run < 2000; run++) {
        jsoupDocument.select("a");
    }
    System.out.println(System.currentTimeMillis() - start);
    System.out.println("=============");

    // HtmlCleaner again, after the other parser has run
    start = System.currentTimeMillis();
    for (int run = 0; run < 2000; run++) {
        cleaner.clean(html);
    }
    System.out.println(System.currentTimeMillis() - start);
    start = System.currentTimeMillis();
    for (int run = 0; run < 2000; run++) {
        cleanedRoot.evaluateXPath("//a");
    }
    System.out.println(System.currentTimeMillis() - start);
    System.out.println("=============");

    // Xsoup: pre-compiled XPath evaluated against the Jsoup document
    XPathEvaluator anchorXPath = Xsoup.compile("//a");
    start = System.currentTimeMillis();
    for (int run = 0; run < 2000; run++) {
        anchorXPath.evaluate(jsoupDocument);
    }
    System.out.println(System.currentTimeMillis() - start);
}Example 28
| Project: zkBrowser-master File: Search.java View source code |
/**
 * Requests the result page for the current term and location and reads the result count.
 *
 * <p>When the result-count XPath matches, the count is the fourth space-separated word of the
 * first match. Otherwise the count is 0 if the no-results XPath matches and 1 if it does not.
 *
 * @return the number of results as described above
 */
public int getResultsCount() {
    String requestUrl = mRequestUrl;
    try {
        String encodedTerm = URLEncoder.encode(mTerm, "UTF-8");
        String encodedLocation = URLEncoder.encode(mLocation, "UTF-8");
        requestUrl = requestUrl + (encodedTerm + "/" + encodedLocation);
    } catch (UnsupportedEncodingException e) {
        // The request is still sent, just without the term/location suffix.
        e.printStackTrace();
    }
    TagNode rootNode = getRootNode(requestUrl);
    Object[] countMatches = evaluateXPath(Utils.XPATH_RESULT_COUNT, rootNode);
    Object[] noResultMatches = evaluateXPath(Utils.XPATH_NO_RESULTS, rootNode);
    int fallbackCount = (noResultMatches.length > 0) ? 0 : 1;
    if (countMatches.length == 0) {
        return fallbackCount;
    }
    String[] words = ((StringBuffer) countMatches[0]).toString().split(" ");
    return Integer.parseInt(words[3]);
}Example 29
| Project: Ebselen-master File: IDEToEbselen.java View source code |
/**
 * Cleans the relevant file and generates a valid XML file ready for processing to Sel 2 java File.
 *
 * <p>The XML file is written to the directory named by the {@code java.io.tmpdir} system
 * property, using the source file name with {@code .xml} appended.
 *
 * @param absoluteFilename - name of the file to convert.
 * @return String - location of the converted file, or {@code null} if the name denotes a directory.
 * @throws Exception if the file cannot be read, cleaned or written.
 */
public String convertToXML(String absoluteFilename) throws Exception {
    FileHandler fromSelIDE = new FileHandler(absoluteFilename);
    // Reject directories before the output handler is created, so nothing is left behind.
    if (fromSelIDE.getFile().isDirectory()) {
        LOGGER.error("Cannot convert directory {} into a Selenium Test!", fromSelIDE.getFileName());
        return null;
    }
    //Clean up html so that we can read it as XML properly
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties XMLPrefs = cleaner.getProperties();
    XMLPrefs.setUseEmptyElementTags(true);
    XMLPrefs.setTranslateSpecialEntities(true);
    XMLPrefs.setTransResCharsToNCR(true);
    XMLPrefs.setOmitComments(true);
    XMLPrefs.setOmitDoctypeDeclaration(true);
    XMLPrefs.setNamespacesAware(false);
    TagNode tagNode = new HtmlCleaner(XMLPrefs).clean(fromSelIDE.getFile());
    FileHandler toXML = new FileHandler(System.getProperty("java.io.tmpdir") + File.separator + fromSelIDE.getFileName() + ".xml", true);
    try {
        new PrettyXmlSerializer(XMLPrefs).writeToStream(tagNode, toXML.getWritableFileOutputStream(), "utf-8");
    } finally {
        // Release the output file even when serialization fails.
        toXML.close();
    }
    return toXML.getAbsoluteFile();
}Example 30
| Project: fenixedu-ist-teacher-service-master File: AnnualTeachingCreditsDocumentFilter.java View source code |
/**
 * Runs the given markup through HtmlCleaner and serializes the cleaned tree back to HTML.
 *
 * @param dirtyHtml markup to clean
 * @return the serialized cleaned markup, or an empty string if serialization fails with an
 *         {@code IOException} (which is logged)
 */
private String clean(String dirtyHtml) {
    String cleanedHtml = StringUtils.EMPTY;
    try {
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        TagNode rootNode = htmlCleaner.clean(dirtyHtml);
        SimpleHtmlSerializer serializer = new SimpleHtmlSerializer(htmlCleaner.getProperties());
        cleanedHtml = serializer.getAsString(rootNode);
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
    }
    return cleanedHtml;
}Example 31
| Project: Joomla-Eclipse-master File: JoomlaDeployerImpl.java View source code |
/**
 * Installs the extension on the server by submitting the installer's "install from folder"
 * form with the directory that contains the extension manifest.
 *
 * <p>The hidden inputs of the admin form on the installer page are reused as request parameters,
 * except {@code installtype}, which is replaced by {@code folder}. The deployment is recorded
 * whatever the server answers; the returned status is OK only for an INFO system message.
 *
 * @return OK with the server message, WARNING when the message is missing or not INFO, or ERROR
 *         when a {@code RuntimeException} occurred
 */
@Override
public IStatus install(BasicExtensionModel extension, DeploymentRuntime transientRuntime, IProgressMonitor progressMonitor) {
    try {
        progressMonitor.beginTask("Install " + extension.getName(), 2000);
        final DeploymentRuntime runtime = ServerUtils.getPersistentDeploymentRuntime(transientRuntime, getDeploymentDescriptor());
        final IJoomlaHttpSession httpSession = runtime.getHttpSession();
        final String extensionDir = ResourcesPlugin.getWorkspace().getRoot().getFile(extension.getManifestPath()).getLocation().toFile().getParent();

        // Step 1: load the installer page and collect the hidden form fields.
        final TagNode installPage = httpSession.executeAndParseResponseBody(new PrepareInstallationRequest(runtime), true, new SubProgressMonitor(progressMonitor, 1000));
        final String adminFormAttribute;
        if (runtime.getServer().getMajorVersion() == MajorJoomlaVersion.ONE_SIX) {
            adminFormAttribute = "id";
        } else {
            adminFormAttribute = "name";
        }
        final List<NameValuePair> installParams = ServerUtils.extractInputNameValuePairs("//form[@" + adminFormAttribute + "='adminForm']//input[@type='hidden']", installPage);
        for (final Iterator<NameValuePair> params = installParams.iterator(); params.hasNext();) {
            if ("installtype".equals(params.next().getName())) {
                params.remove();
            }
        }
        installParams.add(new NameValuePair("installtype", "folder"));
        installParams.add(new NameValuePair("install_directory", extensionDir));

        // Step 2: submit the form and interpret the first system message of the response.
        final TagNode response = httpSession.executeAndParseResponseBody(new GenericPostRequest("administrator/index.php?option=com_installer&view=install", runtime, installParams), true, new SubProgressMonitor(progressMonitor, 1000));
        final JoomlaSystemMessage systemMessage = ServerUtils.extractFirstSystemMessage(response);
        if (systemMessage == null) {
            newDeployment(extension, runtime);
            return JoomlaCorePlugin.newStatus(IStatus.WARNING, "No confirmation message. Installation likely failed.");
        }
        if (systemMessage.getSeverity() == MessageSeverity.INFO) {
            newDeployment(extension, runtime);
            return JoomlaCorePlugin.newStatus(IStatus.OK, systemMessage.getMessage());
        }
        // really a warning, because it might be "extension already installed" type of message
        // TODO: possibly try to distinguish between failure & "already installed" - fetch list of extensions
        newDeployment(extension, runtime);
        return JoomlaCorePlugin.newStatus(IStatus.WARNING, systemMessage.getMessage());
    } catch (final RuntimeException e) {
        final String message = "Unexpected exception while installing extension " + extension.getName();
        JoomlaCorePlugin.logError(message, e);
        return JoomlaCorePlugin.newStatus(IStatus.ERROR, message, e);
    } finally {
        progressMonitor.done();
    }
}Example 32
| Project: PoliSons-master File: News.java View source code |
private String extractText(TagNode node, boolean keepHtmlTags) { StringBuilder value = new StringBuilder(); if (node.getChildren().size() > 0) { for (int i = 0; i < node.getChildren().size(); i++) { if (node.getChildren().get(i).toString().equals("strong")) { value.append("<b>" + ((TagNode) node.getChildren().get(i)).getText().toString() + "</b>"); } else if (node.getChildren().get(i).toString().equals("img")) { value.append("<br>"); } else if (node.getChildren().get(i).toString().equals("br")) { value.append("<br>"); } else if (node.getChildren().get(i).toString().equals("a")) { value.append(((TagNode) node.getChildren().get(i)).getText().toString()); } else if (node.getChildren().get(i).toString().equals("small")) { value.append("<small>" + ((TagNode) node.getChildren().get(i)).getText().toString() + "</small>"); } else if (node.getChildren().get(i).toString() != null) { value.append(node.getChildren().get(i).toString()); } } } // Do you want to keep Html tags if (keepHtmlTags) { return value.toString(); } else { // To this to reformat encoded character and remove html tags like <br> return Html.fromHtml(value.toString()).toString(); } }
Example 33
| Project: sisob-academic-data-extractor-master File: ResearchersPagePostProcessor.java View source code |
/**
 * Cleans an HTML file with HtmlCleaner and writes the compact HTML result next to it.
 * Any failure is logged as a warning and otherwise ignored.
 *
 * @param props       cleaner properties used for both cleaning and serialization
 * @param path        directory containing the source file and receiving the result
 * @param nameFile    name of the file to clean (read as utf-8)
 * @param newNameFile name of the file to write (written as UTF-8)
 */
public static void cleanFile(CleanerProperties props, String path, String nameFile, String newNameFile) {
    final String sourcePath = path + File.separator + nameFile;
    final File sourceFile = new File(sourcePath);
    try {
        // parse
        final TagNode cleanedRoot = new HtmlCleaner(props).clean(sourceFile, "utf-8");
        // serialize
        final String targetPath = path + File.separator + newNameFile;
        new CompactHtmlSerializer(props).writeToFile(cleanedRoot, targetPath, "UTF-8");
        ProjectLogger.LOGGER.info(sourcePath + " cleaned!");
    } catch (Exception ex) {
        ProjectLogger.LOGGER.warn(ex.getMessage() + " " + sourcePath + " NOT FOUND!");
    }
}Example 34
| Project: StackX-master File: MarkdownFormatter.java View source code |
/**
 * Cleans the given markup with HtmlCleaner, applying a tag transformation registered for
 * {@code Tags.BR}, and returns the cleaned document serialized as HTML.
 *
 * @param markdownText markup to clean
 * @return the serialized cleaned markup
 * @throws IOException if serialization fails
 */
private static String clean(String markdownText) throws IOException {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerTransformations transformations = new CleanerTransformations();
    transformations.addTransformation(new TagTransformation(Tags.BR, Tags.BR + "/", true));
    cleaner.setTransformations(transformations);
    TagNode node = cleaner.clean(markdownText);
    // Serialize once, straight to a string; no intermediate writer is needed.
    return new SimpleHtmlSerializer(cleaner.getProperties()).getAsString(node);
}Example 35
| Project: xwiki-commons-master File: DefaultHTMLCleaner.java View source code |
/**
 * Cleans the HTML read from the given reader, converts it to a W3C DOM carrying an XHTML 1.0
 * Strict doctype, and applies the filters of the configuration to that document.
 *
 * @param originalHtmlContent source of the HTML to clean
 * @param configuration       supplies the cleaner properties, the filters and their parameters
 * @return the cleaned and filtered document
 * @throws RuntimeException if cleaning or DOM serialization fails
 */
@Override
public Document clean(Reader originalHtmlContent, HTMLCleanerConfiguration configuration) {
    // A fresh HtmlCleaner per call: instantiation is cheap, so no instance is cached, which also
    // keeps this extra safe for multithreading (even though HTML Cleaner is already supposed to
    // be thread safe).
    final CleanerProperties properties = getDefaultCleanerProperties(configuration);
    final HtmlCleaner htmlCleaner = new HtmlCleaner(properties);

    final TagNode rootNode;
    try {
        rootNode = htmlCleaner.clean(originalHtmlContent);
    } catch (Exception e) {
        throw new RuntimeException("Unhandled error when cleaning HTML", e);
    }

    final Document document;
    try {
        // Ideally SF's HTMLCleaner DomSerializer would be used, but there are outstanding issues
        // with it, so a custom XWikiDOMSerializer is used (see its javadoc for more details).
        // Replace by the following when fixed:
        // result = new DomSerializer(cleanerProperties, false).createDOM(cleanedNode);
        rootNode.setDocType(new DoctypeToken("html", "PUBLIC", "-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"));
        document = new XWikiDOMSerializer(properties, false).createDOM(getAvailableDocumentBuilder(), rootNode);
    } catch (ParserConfigurationException ex) {
        throw new RuntimeException("Error while serializing TagNode into w3c dom.", ex);
    }

    // Finally apply filters.
    for (HTMLFilter htmlFilter : configuration.getFilters()) {
        htmlFilter.filter(document, configuration.getParameters());
    }
    return document;
}Example 36
| Project: fenixedu-academic-master File: ProcessCandidacyPrintAllDocumentsFilter.java View source code |
/**
 * Cleans the given markup with HtmlCleaner and returns it serialized as HTML.
 *
 * @param dirtyHtml markup to clean
 * @return the serialized cleaned markup, or an empty string when HtmlCleaner reports an
 *         {@code HtmlCleanerException} (which is logged)
 */
private String clean(String dirtyHtml) {
    String result = StringUtils.EMPTY;
    try {
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        TagNode cleanedRoot = htmlCleaner.clean(dirtyHtml);
        SimpleHtmlSerializer htmlSerializer = new SimpleHtmlSerializer(htmlCleaner.getProperties());
        result = htmlSerializer.getAsString(cleanedRoot);
    } catch (HtmlCleanerException e) {
        logger.error(e.getMessage(), e);
    }
    return result;
}Example 37
| Project: LimeWire-Pirate-Edition-master File: TorrentWebSearch.java View source code |
/**
 * Collects the targets of all anchor elements in <code>htmlFile</code> that could be torrent
 * uris. A link that does not qualify as written is resolved against <code>referrer</code> and
 * checked once more; links that cannot be parsed are skipped.
 *
 * @param htmlFile file containing the html to scan
 * @param referrer uri that relative links are resolved against
 * @return the candidate uris in document order
 * @throws IOException if the file cannot be read
 */
List<URI> extractTorrentUriCandidates(File htmlFile, URI referrer) throws IOException {
    TagNode root = new HtmlCleaner().clean(htmlFile);
    @SuppressWarnings("unchecked") List<TagNode> elementsWithHref = root.getElementListHavingAttribute("href", true);
    List<URI> candidates = new ArrayList<URI>(elementsWithHref.size());
    for (TagNode element : elementsWithHref) {
        // Other elements (e.g. link) carry href too; only anchors are of interest.
        if (!"a".equalsIgnoreCase(element.getName())) {
            continue;
        }
        String href = element.getAttributeByName("href");
        LOG.debugf("resolving: {0} with {1}", href, referrer);
        try {
            URI link = URIUtils.toURI(href);
            if (!canBeTorrentUri(link)) {
                link = org.apache.http.client.utils.URIUtils.resolve(referrer, link);
                if (!canBeTorrentUri(link)) {
                    LOG.debugf("not a potential torrent link: {0}", link);
                    continue;
                }
            }
            candidates.add(link);
        } catch (URISyntaxException e) {
            LOG.debug("error parsing", e);
        }
    }
    return candidates;
}Example 38
| Project: mitbbs-android-master File: FetchWebpage.java View source code |
/**
 * Gets the previous-page and next-page links of a board. The links are taken from the
 * anchors inside the SECOND element whose {@code id} attribute matches {@code tagName}
 * (index 1 of the matches). Any failure is logged and the links collected so far are returned.
 *
 * @param htmlCleaner   cleaner used to parse the page
 * @param mitbbspageURL page to download
 * @param encoding      character encoding of the page
 * @param tagName       value of the {@code id} attribute to look for
 * @return the link urls, each prefixed with the mobile base url; when exactly two links are
 *         found the list is padded with two leading {@code null} entries
 */
public ArrayList<URL> getBoardLinks(HtmlCleaner htmlCleaner, URL mitbbspageURL, String encoding, String tagName) {
    Log.i(TAG, "getBoardLinks");
    ArrayList<URL> links = new ArrayList<URL>();
    try {
        URLConnection connection = mitbbspageURL.openConnection();
        TagNode root = htmlCleaner.clean(new InputStreamReader(connection.getInputStream(), encoding));
        List matches = root.getElementListByAttValue("id", tagName, true, true);
        Log.i(TAG, "getBoardLinks nodes.length=" + String.valueOf(matches.size()));
        TagNode container = (TagNode) matches.get(1);
        for (Object anchor : container.getElementListByName("a", true)) {
            links.add(new URL(moibleBaseURL + ((TagNode) anchor).getAttributeByName("href")));
        }
        if (links.size() == 2) {
            URL first = links.get(0);
            URL second = links.get(1);
            links.set(0, null);
            links.add(1, null);
            links.add(2, first);
            // NOTE(review): this is an insert, not a replace, so the list ends up as
            // [null, null, first, second, second] - confirm whether the fifth entry is intended.
            links.add(3, second);
        }
    } catch (Exception ex) {
        Log.i(TAG, "getBoardLinks" + ex.toString());
    }
    return links;
}
Example 39
| Project: stanbol-master File: DomSerializer2.java View source code |
/**
 * Builds a new W3C DOM document from the given HtmlCleaner tree.
 *
 * <p>The document element is named after the root node and receives its attributes; the root
 * node's children are converted via {@code createSubnodes}. Namespace awareness of the
 * document builder follows the cleaner properties.
 *
 * @param rootNode root of the cleaned HTML tree
 * @return the newly created document
 * @throws ParserConfigurationException if no document builder can be created
 */
public Document createDOM(TagNode rootNode) throws ParserConfigurationException {
    DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
    builderFactory.setNamespaceAware(props.isNamespacesAware());
    Document dom = builderFactory.newDocumentBuilder().newDocument();

    Element documentElement = dom.createElement(rootNode.getName());
    dom.appendChild(documentElement);
    setAttributes(rootNode, documentElement);
    createSubnodes(dom, documentElement, rootNode.getChildren());
    return dom;
}Example 40
| Project: fastcatsearch-master File: ReadabilityExtractor.java View source code |
/**
 * Converts an HTML string into an XML string using HtmlCleaner.
 *
 * <p>Comments are omitted and {@code script} and {@code style} elements are pruned.
 *
 * @param source the HTML markup to convert
 * @return the pretty-printed XML, or {@code source} unchanged if serialization fails with an
 *         {@code IOException} (which is logged)
 */
private String toXML(String source) {
    try {
        CleanerProperties props = new CleanerProperties();
        props.setTranslateSpecialEntities(true);
        props.setOmitComments(true);
        props.setPruneTags("script,style");
        // Ignore namespaces.
        props.setNamespacesAware(false);
        props.setAdvancedXmlEscape(true);
        HtmlCleaner cl = new HtmlCleaner(props);
        TagNode tagNode = cl.clean(source);
        return new PrettyXmlSerializer(props).getXmlAsString(tagNode);
    } catch (IOException e) {
        logger.error("", e);
        return source;
    }
}Example 41
| Project: weblounge-master File: XhtmlRendererPagePreviewGenerator.java View source code |
/**
 * {@inheritDoc}
 *
 * <p>This implementation renders the page to HTML, converts it to XHTML in a temporary file,
 * renders that file to a temporary image and finally delegates scaling to the first registered
 * image preview generator. Both temporary files are removed before the method returns. The
 * {@code is} argument is not read.
 *
 * @see ch.entwine.weblounge.common.content.PreviewGenerator#createPreview(ch.entwine.weblounge.common.content.Resource,
 *      ch.entwine.weblounge.common.site.Environment,
 *      ch.entwine.weblounge.common.language.Language,
 *      ch.entwine.weblounge.common.content.image.ImageStyle, String,
 *      java.io.InputStream, java.io.OutputStream)
 */
public void createPreview(Resource<?> resource, Environment environment, Language language, ImageStyle style, String format, InputStream is, OutputStream os) throws IOException {
    if (!isRenderingEnvironmentSane) {
        logger.debug("Skipping page preview rendering as environment is not sane");
        return;
    }
    if (resource == null)
        throw new IllegalArgumentException("Resource cannot be null");
    ImagePreviewGenerator imagePreviewGenerator = null;
    synchronized (previewGenerators) {
        if (previewGenerators.size() == 0) {
            logger.debug("Unable to generate page previews since no image renderer is available");
            return;
        }
        imagePreviewGenerator = previewGenerators.get(0);
    }
    ResourceURI uri = resource.getURI();
    Site site = uri.getSite();
    String html = null;
    try {
        URL pageURL = new URL(UrlUtils.concat(site.getHostname(environment).toExternalForm(), PAGE_HANDLER_PREFIX, uri.getIdentifier()));
        html = render(pageURL, site, environment, language, resource.getVersion());
        if (StringUtils.isBlank(html)) {
            logger.warn("Error rendering preview of page " + uri.getPath());
            return;
        }
        html = HTMLUtils.escapeHtml(HTMLUtils.unescape(html));
    } catch (ServletException e) {
        logger.warn("Error rendering page " + uri.getPath(), e);
        throw new IOException(e);
    }
    // Try to convert html to xhtml
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties xhtmlProperties = cleaner.getProperties();
    TagNode xhtmlNode = cleaner.clean(html);
    if (xhtmlNode == null) {
        logger.warn("Error creating well-formed document from page {}", resource);
        return;
    }
    File xhtmlFile = null;
    // generator can only handle files.
    try {
        xhtmlFile = File.createTempFile("xhtml", ".xml");
        Serializer xhtmlSerializer = new SimpleXmlSerializer(xhtmlProperties);
        xhtmlSerializer.writeToFile(xhtmlNode, xhtmlFile.getAbsolutePath(), "UTF-8");
    } catch (IOException e) {
        logger.error("Error creating temporary copy of file content at " + xhtmlFile, e);
        FileUtils.deleteQuietly(xhtmlFile);
        throw e;
    }
    File imageFile = File.createTempFile("xhtml-preview", "." + PREVIEW_FORMAT);
    FileOutputStream imageFos = null;
    // Tracks whether the image was produced, so that the temporary file is removed on every
    // path that leaves the rendering block without a usable image.
    boolean imageWritten = false;
    // Render the page and write back to client
    try {
        int screenshotWidth = DEFAULT_SCREENSHOT_WIDTH;
        int screenshotHeight = DEFAULT_SCREENSHOT_HEIGHT;
        if (style != null && style.getWidth() > 0 && style.getHeight() > 0) {
            screenshotHeight = (int) ((float) screenshotWidth / (float) style.getWidth() * style.getHeight());
        }
        // Create the renderer. Due to a synchronization bug in the software,
        // this needs to be synchronized
        Java2DRenderer renderer = null;
        try {
            synchronized (this) {
                renderer = new Java2DRenderer(xhtmlFile, screenshotWidth, screenshotHeight);
            }
        } catch (Throwable t) {
            if (isRenderingEnvironmentSane) {
                logger.warn("Error creating Java 2D renderer for previews: {}", t.getMessage());
                logger.warn("Page preview rendering will be switched off");
                isRenderingEnvironmentSane = false;
            }
            logger.debug("Error creating Java 2D renderer for preview of page {}: {}", uri.getPath(), t.getMessage());
            return;
        }
        // Configure the renderer
        renderer.getSharedContext().setBaseURL(site.getHostname().toExternalForm());
        renderer.getSharedContext().setInteractive(false);
        // Make sure the renderer is using a user agent that will correctly
        // resolve urls
        WebloungeUserAgent agent = userAgents.get(site.getIdentifier());
        if (agent == null) {
            agent = new WebloungeUserAgent(site.getHostname().getURL());
            userAgents.put(site.getIdentifier(), agent);
        }
        renderer.getSharedContext().setUserAgentCallback(agent);
        // Render the page to an image
        BufferedImage img = renderer.getImage();
        FSImageWriter imageWriter = new FSImageWriter(PREVIEW_FORMAT);
        imageFos = new FileOutputStream(imageFile);
        imageWriter.write(img, imageFos);
        imageWritten = true;
    } catch (IOException e) {
        logger.error("Error creating temporary copy of file content at " + xhtmlFile, e);
        throw e;
    } catch (XRRuntimeException e) {
        logger.warn("Error rendering page content at " + uri + ": " + e.getMessage());
        throw e;
    } catch (HeadlessException e) {
        logger.warn("Headless error while trying to render page preview: " + e.getMessage());
        logger.warn("Page preview rendering will be switched off");
        isRenderingEnvironmentSane = false;
        throw e;
    } catch (Throwable t) {
        logger.warn("Error rendering page content at " + uri + ": " + t.getMessage(), t);
        throw new IOException(t);
    } finally {
        IOUtils.closeQuietly(imageFos);
        FileUtils.deleteQuietly(xhtmlFile);
        if (!imageWritten) {
            FileUtils.deleteQuietly(imageFile);
        }
    }
    FileInputStream imageIs = null;
    // Scale the image to the correct size
    try {
        imageIs = new FileInputStream(imageFile);
        imagePreviewGenerator.createPreview(resource, environment, language, style, PREVIEW_FORMAT, imageIs, os);
    } catch (IOException e) {
        logger.error("Error creating temporary copy of file content at " + xhtmlFile, e);
        throw e;
    } catch (Throwable t) {
        logger.warn("Error scaling page preview at " + uri + ": " + t.getMessage(), t);
        throw new IOException(t);
    } finally {
        IOUtils.closeQuietly(imageIs);
        FileUtils.deleteQuietly(imageFile);
    }
}Example 42
| Project: book_reader_lib-master File: BookView.java View source code |
/**
 * Attaches a link span for the node's {@code href} attribute to the given text range.
 *
 * <p>An href starting (case-insensitively) with one of the external protocols gets a
 * {@code URLSpan}; any other href is treated as a link inside the book and gets a clickable
 * span that navigates to the href resolved against the spine. Nodes without an href are
 * left untouched.
 */
@Override
public void handleTagNode(TagNode node, SpannableStringBuilder builder, int start, int end) {
    final String href = node.getAttributeByName("href");
    if (href == null) {
        return;
    }

    // External link: matches one of the known protocols.
    final String lowerCaseHref = href.toLowerCase();
    for (String protocol : this.externalProtocols) {
        if (lowerCaseHref.startsWith(protocol)) {
            builder.setSpan(new URLSpan(href), start, end, Spannable.SPAN_EXCLUSIVE_EXCLUSIVE);
            return;
        }
    }

    // Everything else is an internal navigation link.
    builder.setSpan(new ClickableSpan() {

        @Override
        public void onClick(View widget) {
            navigateTo(spine.resolveHref(href));
        }
    }, start, end, Spannable.SPAN_EXCLUSIVE_EXCLUSIVE);
}Example 43
| Project: feedscribe-master File: FeedManager.java View source code |
/**
 * Extracts the text of a description: walks the whole tree below the node, unescapes the
 * content of every content node into one buffer and returns the trimmed result.
 *
 * @param node root of the description markup
 * @return the concatenated, unescaped and trimmed text content
 */
public String cleanDescription(TagNode node) {
    final StringBuilder text = new StringBuilder();
    node.traverse(new TagNodeVisitor() {

        @Override
        public boolean visit(TagNode parent, HtmlNode current) {
            if (current instanceof ContentNode) {
                htmlUnescapeInto(((ContentNode) current).getContent(), text);
            }
            // Always continue the traversal.
            return true;
        }
    });
    return text.toString().trim();
}
Example 44
| Project: kolmafia-master File: RuntimeLibrary.java View source code |
/**
 * Evaluates an XPath expression against HTML text and returns the matches as an array of
 * strings. Matches that are tag nodes are serialized with the XML serializer; every other
 * match contributes its {@code toString()} value.
 *
 * @param interpreter used to raise runtime exceptions
 * @param html        the markup to parse
 * @param xpath       the expression to evaluate
 * @return an array value holding one string per match
 */
public static Value xpath(Interpreter interpreter, final Value html, final Value xpath) {
    HtmlCleaner parser = HTMLParserUtils.configureDefaultParser();

    TagNode root;
    try {
        root = parser.clean(html.toString());
    } catch (IOException e) {
        StaticEntity.printStackTrace(e);
        throw interpreter.runtimeException("something went wrong while cleaning html");
    }

    Object[] matches;
    try {
        matches = root.evaluateXPath(xpath.toString());
    } catch (XPatherException e) {
        throw interpreter.runtimeException("invalid xpath expression");
    }

    ArrayValue value = new ArrayValue(new AggregateType(DataTypes.STRING_TYPE, matches.length));
    // tag nodes are turned into strings by serializing them
    SimpleXmlSerializer serializer = new SimpleXmlSerializer(parser.getProperties());
    for (int index = 0; index < matches.length; index++) {
        if (matches[index] instanceof TagNode) {
            try {
                matches[index] = serializer.getXmlAsString((TagNode) matches[index]);
            } catch (IOException e) {
                StaticEntity.printStackTrace(e);
                throw interpreter.runtimeException("something went wrong while serializing to html");
            }
        }
        value.aset(new Value(index), new Value(matches[index].toString()));
    }
    return value;
}Example 45
| Project: ttr-master File: ArticleFragment.java View source code |
/**
 * Looks up the alt-text that goes with the given string in the article content. The content is
 * parsed with a small html parser and traversed by a visitor that records the alt-attribute.
 * If nothing is found the result is null and the menu shouldn't contain the item to display
 * the caption.
 *
 * @param extra the string that must occur in the content and that the visitor is created with
 *              (presumably the image url - verify against the caller)
 * @return the alt-text converted through {@code Html.fromHtml}, or null if none was found
 */
private String getAltTextForImageUrl(String extra) {
    boolean mentionsExtra = content != null && content.contains(extra);
    if (!mentionsExtra) {
        return null;
    }
    TagNode root = new HtmlCleaner().clean(content);
    MyTagNodeVisitor visitor = new MyTagNodeVisitor(extra);
    root.traverse(visitor);
    return (visitor.alt == null) ? null : Html.fromHtml(visitor.alt).toString();
}Example 46
| Project: ttrss-reader-fork-master File: ArticleFragment.java View source code |
/**
 * Finds the alt-text belonging to the given string within the article content, using a small
 * html parser and a visitor that extracts the alt-attribute while walking the parsed tree.
 * When no alt-text is found null is returned, in which case the menu shouldn't offer the item
 * that displays the caption.
 *
 * @param extra the string the content must contain and that is handed to the visitor
 *              (presumably the image url - verify against the caller)
 * @return the alt-text after conversion by {@code Html.fromHtml}, or null if none was found
 */
private String getAltTextForImageUrl(String extra) {
    if (content != null && content.contains(extra)) {
        TagNode parsedContent = new HtmlCleaner().clean(content);
        MyTagNodeVisitor altCollector = new MyTagNodeVisitor(extra);
        parsedContent.traverse(altCollector);
        if (altCollector.alt != null) {
            return Html.fromHtml(altCollector.alt).toString();
        }
    }
    return null;
}Example 47
| Project: TweetTopics2.0-master File: LinksUtils.java View source code |
public static InfoLink getInfoTweet(String link) {
String originalLink = link;
// acortadores
link = largeLink(link);
// si es un url media
if (CacheData.getInstance().existURLMedia(link)) {
Utils.URLContent content = CacheData.getInstance().getURLMedia(link);
InfoLink il = new InfoLink();
il.setService("Twitter Pic");
il.setExtensiveInfo(true);
il.setType(0);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setLinkImageThumb(content.linkMediaThumb);
il.setLinkImageLarge(content.linkMediaLarge);
return il;
}
// es una busqueda
if (link.startsWith(Utils.URL_QR)) {
InfoLink il = new InfoLink();
il.setService("tweettopics-qr");
il.setType(Utils.TYPE_LINK_TWEETOPICS_QR);
il.setLink(link);
il.setOriginalLink(originalLink);
return il;
}
if (link.startsWith(Utils.URL_SHARE_THEME_QR)) {
InfoLink il = new InfoLink();
il.setService("tweettopics-theme");
il.setType(Utils.TYPE_LINK_TWEETOPICS_THEME);
il.setLink(link);
il.setOriginalLink(originalLink);
return il;
}
if ((link.endsWith(".jpg")) || (link.endsWith(".png")) || (link.endsWith(".gif")) || (link.endsWith(".bmp"))) {
InfoLink il = new InfoLink();
il.setExtensiveInfo(true);
il.setService("Web");
il.setType(0);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setLinkImageThumb(link);
il.setLinkImageLarge(link);
return il;
}
if (link.contains("imgur.com")) {
String id = link.substring(link.lastIndexOf("/") + 1);
String imgThumb = "http://i.imgur.com/" + id + "b.jpg";
String imgLarge = "http://i.imgur.com/" + id + ".jpg";
InfoLink il = new InfoLink();
il.setService("Imgur");
il.setExtensiveInfo(true);
il.setType(0);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setLinkImageThumb(imgThumb);
il.setLinkImageLarge(imgLarge);
return il;
}
// lightbox
if (link.contains("lightbox")) {
String id = link.substring(link.lastIndexOf("/") + 1);
InfoLink il = new InfoLink();
il.setService("Lightbox");
il.setExtensiveInfo(true);
il.setType(0);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setLinkImageThumb("http://lightbox.com/show/thumb/" + id);
il.setLinkImageLarge("http://lightbox.com/show/large/" + id);
return il;
}
// twitpic
if (link.contains("twitpic")) {
String id = link.substring(link.lastIndexOf("/") + 1);
InfoLink il = new InfoLink();
il.setService("Twitpic");
il.setExtensiveInfo(true);
il.setType(0);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setLinkImageThumb("http://twitpic.com/show/mini/" + id);
il.setLinkImageLarge("http://twitpic.com/show/large/" + id);
return il;
}
// picplz
if (link.contains("picplz")) {
String id = link.substring(link.lastIndexOf("/") + 1);
InfoLink il = new InfoLink();
il.setService("Picplz");
il.setExtensiveInfo(true);
il.setType(0);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setLinkImageThumb("http://picplz.com/" + id + "/thumb/200");
il.setLinkImageLarge("http://picplz.com/" + id + "/thumb/400");
return il;
}
// img.ly
if (link.contains("img.ly")) {
String id = link.substring(link.lastIndexOf("/") + 1);
InfoLink il = new InfoLink();
il.setService("Img.ly");
il.setExtensiveInfo(true);
il.setType(0);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setLinkImageThumb("http://img.ly/show/thumb/" + id);
il.setLinkImageLarge("http://img.ly/show/medium/" + id);
return il;
}
if (link.contains("vvcap")) {
String image = link.replace(".htp", ".png");
InfoLink il = new InfoLink();
il.setExtensiveInfo(true);
il.setService("Vvcap.net");
il.setType(0);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setLinkImageThumb(image);
il.setLinkImageLarge(image);
return il;
}
if (link.contains("yfrog")) {
InfoLink il = new InfoLink();
il.setService("Yfrog");
il.setExtensiveInfo(true);
il.setType(0);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setLinkImageThumb(link + ".th.jpg");
il.setLinkImageLarge(link + ":android");
Log.d(Utils.TAG, "yfrog (\"+link+\"): " + link + ".th.jpg" + " -- " + link + ":android");
return il;
}
// twitvid
if (link.contains("twitvid")) {
String id = link.substring(link.lastIndexOf("/") + 1);
InfoLink il = new InfoLink();
il.setService("twitvid");
il.setExtensiveInfo(true);
il.setType(1);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setTitle("Twitvid");
il.setDurationVideo(0);
il.setLinkImageThumb("http://images2.twitvid.com/" + id + ".jpg");
il.setLinkImageLarge("http://images2.twitvid.com/" + id + ".jpg");
return il;
}
/*
if (link.contains("flic.kr")) {
String idbase58 = link.substring(link.lastIndexOf("/")+1);
String id = String.valueOf(alphaToNumber(idbase58));
String urlApi = "http://api.flickr.com/services/rest/?method=flickr.photos.getInfo&api_key=6ce2af123df7dd2a7dab086f086e9824&photo_id="+id+"&format=json&nojsoncallback=1";
Log.d(Utils.TAG, "urlApi: (" + link + ") " + urlApi);
String farmId="";
String serverId="";
String secret="";
HttpGet request = new HttpGet(urlApi);
HttpClient client = new DefaultHttpClient();
HttpResponse httpResponse;
try {
httpResponse = client.execute(request);
String xml = EntityUtils.toString(httpResponse.getEntity());
JSONObject jsonObject = new JSONObject(xml);
if (jsonObject!=null) {
if (jsonObject.getJSONObject("photo")!=null) {
farmId = jsonObject.getJSONObject("photo").getString("farm");
serverId = jsonObject.getJSONObject("photo").getString("server");
secret = jsonObject.getJSONObject("photo").getString("secret");
}
}
} catch (Exception e) {
e.printStackTrace();
}
if (farmId!="") {
String imgThumb = "http://farm"+farmId+".static.flickr.com/"+serverId+"/"+id+"_"+secret+"_s.jpg";
String imgLarge = "http://farm"+farmId+".static.flickr.com/"+serverId+"/"+id+"_"+secret+".jpg";
Bitmap bmp = getBitmap(imgThumb, HEIGHT_THUMB);
if (bmp!=null) {
InfoLink il = new InfoLink();
il.setBitmapThumb(bmp);
il.setService("Flickr");
il.setType(0);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setLinkImageThumb(imgThumb);
il.setLinkImageLarge(imgLarge);
return il;
}
}
}
*/
if (link.contains("mytubo.net")) {
String image = "";
try {
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setAllowHtmlInsideAttributes(true);
props.setAllowMultiWordAttributes(true);
props.setRecognizeUnicodeChars(true);
props.setOmitComments(true);
URL url = new URL(link);
URLConnection conn;
conn = url.openConnection();
InputStreamReader isr = new InputStreamReader(conn.getInputStream());
TagNode node = cleaner.clean(isr);
Object[] objMeta = node.evaluateXPath("//img[@id='originPic']");
if (objMeta.length > 0) {
TagNode info_node = (TagNode) objMeta[0];
image = URLDecoder.decode(info_node.getAttributeByName("src").toString().trim());
}
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (XPatherException e) {
e.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}
InfoLink il = new InfoLink();
il.setExtensiveInfo(true);
il.setService("Mytubo.net");
il.setType(0);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setLinkImageThumb(image);
il.setLinkImageLarge(image);
return il;
}
// instagr.am
if (link.contains("instagr.am")) {
String image = "";
try {
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setAllowHtmlInsideAttributes(true);
props.setAllowMultiWordAttributes(true);
props.setRecognizeUnicodeChars(true);
props.setOmitComments(true);
props.setUseEmptyElementTags(true);
/*
URL url = new URL(link);
URLConnection conn;
conn = url.openConnection();
InputStreamReader isr = new InputStreamReader(conn.getInputStream()); */
TagNode node = cleaner.clean(getURIContent(link));
Object[] objMeta = node.evaluateXPath("//img[@class='photo']");
if (objMeta.length > 0) {
TagNode info_node = (TagNode) objMeta[0];
image = URLDecoder.decode(info_node.getAttributeByName("src").toString().trim());
}
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (XPatherException e) {
e.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}
InfoLink il = new InfoLink();
il.setExtensiveInfo(true);
il.setService("Instagr.am");
il.setType(0);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setLinkImageThumb(image);
il.setLinkImageLarge(image);
Log.d(Utils.TAG, "Instagr.am (" + link + "): " + image);
return il;
}
if (link.contains("plixi") || link.contains("lockerz")) {
String strURL = "http://api.plixi.com/api/tpapi.svc/metadatafromurl?url=" + link;
try {
Document doc = null;
try {
URL url;
URLConnection urlConn = null;
url = new URL(strURL);
urlConn = url.openConnection();
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
DocumentBuilder db = dbf.newDocumentBuilder();
doc = db.parse(urlConn.getInputStream());
} catch (IOException ioe) {
} catch (ParserConfigurationException pce) {
} catch (SAXException se) {
}
if (doc != null) {
try {
String imgThumb = doc.getElementsByTagName("ThumbnailUrl").item(0).getFirstChild().getNodeValue();
String imgLarge = doc.getElementsByTagName("MediumImageUrl").item(0).getFirstChild().getNodeValue();
if (!imgThumb.equals("")) {
InfoLink il = new InfoLink();
il.setService("Lockerz");
il.setType(0);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setLinkImageThumb(imgThumb);
il.setLinkImageLarge(imgLarge);
return il;
}
} catch (Exception e) {
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
if (link.contains("twitgoo")) {
String id = link.substring(link.lastIndexOf("/") + 1);
String strURL = "http://twitgoo.com/api/message/info/" + id;
Document doc = null;
try {
URL url;
URLConnection urlConn = null;
url = new URL(strURL);
urlConn = url.openConnection();
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
DocumentBuilder db = dbf.newDocumentBuilder();
doc = db.parse(urlConn.getInputStream());
} catch (IOException ioe) {
} catch (ParserConfigurationException pce) {
} catch (SAXException se) {
}
if (doc != null) {
try {
String imgThumb = doc.getElementsByTagName("thumburl").item(0).getFirstChild().getNodeValue();
String imgLarge = doc.getElementsByTagName("imageurl").item(0).getFirstChild().getNodeValue();
if (!imgThumb.equals("")) {
InfoLink il = new InfoLink();
il.setService("Twitgoo");
il.setExtensiveInfo(true);
il.setType(0);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setLinkImageThumb(imgThumb);
il.setLinkImageLarge(imgLarge);
return il;
}
} catch (Exception e) {
}
}
}
if (link.contains("vimeo")) {
String id = link.substring(link.lastIndexOf("/") + 1);
String strURL = "http://vimeo.com/api/v2/video/" + id + ".xml";
Document doc = null;
try {
URL url;
URLConnection urlConn = null;
url = new URL(strURL);
urlConn = url.openConnection();
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
DocumentBuilder db = dbf.newDocumentBuilder();
doc = db.parse(urlConn.getInputStream());
} catch (IOException ioe) {
} catch (ParserConfigurationException pce) {
} catch (SAXException se) {
}
if (doc != null) {
try {
String imgThumb = doc.getElementsByTagName("thumbnail_small").item(0).getFirstChild().getNodeValue();
String imgLarge = doc.getElementsByTagName("thumbnail_large").item(0).getFirstChild().getNodeValue();
String title = doc.getElementsByTagName("title").item(0).getFirstChild().getNodeValue();
int duration = Integer.parseInt(doc.getElementsByTagName("duration").item(0).getFirstChild().getNodeValue());
if (!imgThumb.equals("")) {
InfoLink il = new InfoLink();
il.setService("Vimeo");
il.setExtensiveInfo(true);
il.setType(1);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setTitle(title);
il.setDurationVideo(duration);
il.setLinkImageThumb(imgThumb);
il.setLinkImageLarge(imgLarge);
return il;
}
} catch (Exception e) {
}
}
}
if ((link.contains("youtube")) || (link.contains("youtu.be"))) {
String id = "";
if (link.contains("youtube")) {
id = link.substring(link.lastIndexOf("v=") + 2);
if (id.contains("&")) {
id = id.substring(0, id.indexOf("&"));
}
}
if (link.contains("youtu.be")) {
id = link.substring(link.lastIndexOf("/") + 1);
if (id.contains("?")) {
id = id.substring(0, id.indexOf("?"));
}
}
String imgThumb = "http://img.youtube.com/vi/" + id + "/2.jpg";
String imgLarge = "http://img.youtube.com/vi/" + id + "/0.jpg";
String strURL = "http://gdata.youtube.com/feeds/api/videos/" + id;
Document doc = null;
try {
URL url;
URLConnection urlConn = null;
url = new URL(strURL);
urlConn = url.openConnection();
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
DocumentBuilder db = dbf.newDocumentBuilder();
doc = db.parse(urlConn.getInputStream());
} catch (IOException ioe) {
} catch (ParserConfigurationException pce) {
} catch (SAXException se) {
}
String title = "Youtube";
int duration = 0;
try {
if (doc != null) {
title = doc.getElementsByTagName("title").item(0).getFirstChild().getNodeValue();
duration = Integer.parseInt(doc.getElementsByTagName("yt:duration").item(0).getAttributes().getNamedItem("seconds").getNodeValue());
}
} catch (Exception e) {
}
InfoLink il = new InfoLink();
il.setService("Youtube");
il.setExtensiveInfo(true);
il.setType(1);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setTitle(title);
il.setDurationVideo(duration);
il.setLinkImageThumb(imgThumb);
il.setLinkImageLarge(imgLarge);
return il;
}
// si no es una imagen, es un enlace web
InfoLink il = new InfoLink();
il.setService("web");
il.setType(2);
il.setLink(link);
il.setOriginalLink(originalLink);
il.setTitle(originalLink);
return il;
}Example 48
| Project: BotLibre-master File: Http.java View source code |
/**
 * Cleans an HTML stream and re-serializes it as XML so that it can be fed to a DOM parser.
 *
 * @param input the raw HTML stream; it is read as UTF-8
 * @return a reader positioned at the start of the serialized XHTML text
 * @throws IOException if reading the input or writing the serialized form fails
 */
public StringReader convertToXHTML(InputStream input) throws IOException {
    // Parse first, then build the serializer from the cleaner's own properties.
    TagNode root = getHtmlCleaner().clean(input, "UTF-8");
    SimpleXmlSerializer serializer = new SimpleXmlSerializer(getHtmlCleaner().getProperties());
    StringWriter buffer = new StringWriter();
    root.serialize(serializer, buffer);
    buffer.flush();
    return new StringReader(buffer.toString());
}Example 49
| Project: concourse-connect-master File: HTMLToWikiUtils.java View source code |
/**
 * Converts an HTML string into its wiki-markup equivalent.
 *
 * <p>The HTML is cleaned with HtmlCleaner, turned into a DOM document, and every
 * top-level node is handed to {@code processChildNodes}, which appends the wiki text.
 *
 * @param html        the HTML to convert
 * @param contextPath passed through unchanged to {@code processChildNodes}
 * @param projectId   passed through unchanged to {@code processChildNodes}
 * @return the trimmed wiki text terminated by {@code CRLF}, or an empty string when
 *         nothing was produced
 * @throws Exception if cleaning, DOM creation, or node processing fails
 */
public static String htmlToWiki(String html, String contextPath, int projectId) throws Exception {
    // Strip the &nbsp; entity up front because the cleaner would otherwise convert it
    // to a unicode character. The literal must be the entity itself; replacing a plain
    // space with a plain space is a no-op.
    html = StringUtils.replace(html, "&nbsp;", " ");
    // Take the html and create a DOM for parsing
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    TagNode node = cleaner.clean(html);
    Document document = new DomSerializer(props, true).createDOM(node);
    if (LOG.isTraceEnabled()) {
        LOG.trace(html);
    }
    // Process each node and output the wiki equivalent
    StringBuffer sb = new StringBuffer();
    ArrayList<Node> nodeList = new ArrayList<Node>();
    for (int i = 0; i < document.getChildNodes().getLength(); i++) {
        nodeList.add(document.getChildNodes().item(i));
    }
    processChildNodes(nodeList, sb, 0, true, true, false, "", contextPath, projectId);
    if (sb.length() == 0) {
        return "";
    }
    String content = sb.toString().trim();
    if (content.contains("&apos;")) {
        // The serialized DOM can introduce &apos; entities; turn them back into
        // plain apostrophes for the wiki text.
        content = StringUtils.replace(content, "&apos;", "'");
    }
    if (!content.endsWith(CRLF)) {
        return content + CRLF;
    }
    return content;
}Example 50
| Project: iaf-master File: XmlUtils.java View source code |
/**
 * Turns an HTML string into XHTML.
 *
 * <p>The input is trimmed and passed through {@code skipDocTypeDeclaration}. Only when the
 * result starts with an {@code <html>} or {@code <html } tag is it run through HtmlCleaner
 * and re-serialized; otherwise that intermediate result is returned as is.
 *
 * @param htmlString the HTML text, may be {@code null} or empty
 * @return the converted text, or {@code null} when the input is {@code null} or empty
 */
public static String toXhtml(String htmlString) {
    if (!StringUtils.isNotEmpty(htmlString)) {
        return null;
    }
    String stripped = XmlUtils.skipDocTypeDeclaration(htmlString.trim());
    boolean startsWithHtmlTag = stripped.startsWith("<html>") || stripped.startsWith("<html ");
    if (!startsWithHtmlTag) {
        return stripped;
    }
    // The same properties object configures both the cleaner and the serializer.
    CleanerProperties cleanerProps = new CleanerProperties();
    TagNode root = new HtmlCleaner(cleanerProps).clean(stripped);
    return new SimpleXmlSerializer(cleanerProps).getXmlAsString(root);
}Example 51
| Project: cos598b-master File: MessageCompose.java View source code |
/**
* Build and populate the UI with the quoted message.
*
* <p>If the message format is still {@code AUTO} it is resolved first: HTML when the source
* message has a {@code text/html} part, TEXT otherwise. For HTML the quoted content is loaded
* into both the HTML and the plain-text quote views; for TEXT only the plain-text view is set.
* When the account strips signatures and the action is reply or reply-all, a dash signature
* is removed from the quoted content before it is displayed.
*
* @param showQuotedText
* {@code true} if the quoted text should be shown, {@code false} otherwise.
*
* @throws MessagingException
* declared by this method; NOTE(review): presumably propagated from the message part
* lookups and body extraction helpers - confirm against those helpers.
*/
private void populateUIWithQuotedMessage(boolean showQuotedText) throws MessagingException {
// Resolve AUTO to a concrete format based on whether a text/html part exists.
if (mMessageFormat == MessageFormat.AUTO) {
mMessageFormat = MimeUtility.findFirstPartByMimeType(mSourceMessage, "text/html") == null ? MessageFormat.TEXT : MessageFormat.HTML;
}
// TODO -- I am assuming that mSourceMessageBody will always be a text part. Is this a safe assumption?
// Handle the original message in the reply
// If we already have mSourceMessageBody, use that. It's pre-populated if we've got crypto going on.
String content = mSourceMessageBody != null ? mSourceMessageBody : getBodyTextFromMessage(mSourceMessage, mMessageFormat);
if (mMessageFormat == MessageFormat.HTML) {
// Strip the signature for replies. Everything from the signature onwards is dropped, so
// closing tags such as </div>, </span>, </table>, </pre> will be cut off; the HtmlCleaner
// pass further down repairs the markup.
if (mAccount.isStripSignature() && (ACTION_REPLY_ALL.equals(getIntent().getAction()) || ACTION_REPLY.equals(getIntent().getAction()))) {
Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content);
if (dashSignatureHtml.find()) {
// Record the offsets of every opening and closing blockquote tag so that signatures
// inside quoted sections can be left alone.
Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content);
Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content);
List<Integer> start = new ArrayList<Integer>();
List<Integer> end = new ArrayList<Integer>();
while (blockquoteStart.find()) {
start.add(blockquoteStart.start());
}
while (blockquoteEnd.find()) {
end.add(blockquoteEnd.start());
}
if (start.size() != end.size()) {
// Unbalanced tags: the offsets cannot be paired up, so leave the content untouched.
Log.d(K9.LOG_TAG, "There are " + start.size() + " <blockquote> tags, but " + end.size() + " </blockquote> tags. Refusing to strip.");
} else if (start.size() > 0) {
// Ignore quoted signatures in blockquotes.
// First restrict the search to the text before the first <blockquote>.
dashSignatureHtml.region(0, start.get(0));
if (dashSignatureHtml.find()) {
// before first <blockquote>.
content = content.substring(0, dashSignatureHtml.start());
} else {
// Next, look in each gap between a closing tag and the following opening tag.
for (int i = 0; i < start.size() - 1; i++) {
// within blockquotes.
if (end.get(i) < start.get(i + 1)) {
dashSignatureHtml.region(end.get(i), start.get(i + 1));
if (dashSignatureHtml.find()) {
content = content.substring(0, dashSignatureHtml.start());
break;
}
}
}
// Finally, look after the last closing tag, provided that offset is still inside
// the (possibly already truncated) content.
if (end.get(end.size() - 1) < content.length()) {
// after last </blockquote>.
dashSignatureHtml.region(end.get(end.size() - 1), content.length());
if (dashSignatureHtml.find()) {
content = content.substring(0, dashSignatureHtml.start());
}
}
}
} else {
// No blockquotes found.
// Cut at the signature located by the initial find() above.
content = content.substring(0, dashSignatureHtml.start());
}
}
// Fix the stripping off of closing tags if a signature was stripped,
// as well as clean up the HTML of the quoted message.
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties properties = cleaner.getProperties();
// see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
properties.setNamespacesAware(false);
properties.setAdvancedXmlEscape(false);
properties.setOmitXmlDeclaration(true);
properties.setOmitDoctypeDeclaration(false);
properties.setTranslateSpecialEntities(false);
properties.setRecognizeUnicodeChars(false);
TagNode node = cleaner.clean(content);
SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties);
try {
content = htmlSerialized.getAsString(node, "UTF8");
} catch (java.io.IOException ioe) {
// Serialization failed: keep the un-cleaned content and only log the problem.
Log.e(K9.LOG_TAG, "Problem cleaning quoted message.", ioe);
}
}
// Add the HTML reply header to the top of the content.
mQuotedHtmlContent = quoteOriginalHtmlMessage(mSourceMessage, content, mQuoteStyle);
// Load the message with the reply header.
mQuotedHTML.setText(mQuotedHtmlContent.getQuotedContent(), "text/html");
// The plain-text quote view is filled from the message's TEXT body as well.
mQuotedText.setText(quoteOriginalTextMessage(mSourceMessage, getBodyTextFromMessage(mSourceMessage, MessageFormat.TEXT), mQuoteStyle));
} else if (mMessageFormat == MessageFormat.TEXT) {
// Plain text: replace the first dash-signature match with a single line break.
if (mAccount.isStripSignature() && (ACTION_REPLY_ALL.equals(getIntent().getAction()) || ACTION_REPLY.equals(getIntent().getAction()))) {
if (DASH_SIGNATURE_PLAIN.matcher(content).find()) {
content = DASH_SIGNATURE_PLAIN.matcher(content).replaceFirst("\r\n");
}
}
mQuotedText.setText(quoteOriginalTextMessage(mSourceMessage, content, mQuoteStyle));
}
// Apply the requested visibility of the quoted section.
if (showQuotedText) {
showOrHideQuotedText(QuotedTextMode.SHOW);
} else {
showOrHideQuotedText(QuotedTextMode.HIDE);
}
}Example 52
| Project: xMail-master File: HtmlSanitizer.java View source code |
/**
 * Cleans the given HTML, runs {@code removeMetaRefresh} on the resulting tree, and
 * serializes the tree back to a string.
 *
 * @param html the HTML text to sanitize
 * @return the serialized, sanitized HTML
 */
public static String sanitize(String html) {
    final TagNode cleanedRoot = HTML_CLEANER.clean(html);
    removeMetaRefresh(cleanedRoot);
    final String serialized = HTML_SERIALIZER.getAsString(cleanedRoot, "UTF8");
    return serialized;
}Example 53
| Project: XDA-One-master File: XDATagHandlers.java View source code |
/**
 * Handles an image tag by appending a one-character placeholder to the output and
 * attaching an {@code ImageSpan} for the tag's {@code src} attribute to that character.
 *
 * @param node    the tag being handled; its {@code src} attribute is read
 * @param builder the text being built; the placeholder is appended to it
 * @param start   unused here; the span is anchored at the builder's current end instead
 * @param end     unused here
 * @param stack   receives the image span
 */
@Override
public void handleTagNode(final TagNode node, final SpannableStringBuilder builder, final int start, final int end, final SpanStack stack) {
    final String src = node.getAttributeByName("src");
    final int newStart = builder.length();
    // Append U+FFFC (OBJECT REPLACEMENT CHARACTER) so the span covers exactly one
    // character. Appending an empty string would leave a zero-length span with no
    // text for the image to replace.
    builder.append("\uFFFC");
    stack.pushSpan(new ImageSpan(src), newStart, builder.length());
}Example 54
| Project: commcare-master File: MarkupUtil.java View source code |
/**
 * Underlines the text produced for this tag by pushing an {@code UnderlineSpan}
 * over the range {@code [start, end)} given by the caller.
 */
@Override
public void handleTagNode(TagNode node, SpannableStringBuilder builder, int start, int end, SpanStack spanStack) {
    final UnderlineSpan underline = new UnderlineSpan();
    spanStack.pushSpan(underline, start, end);
}Example 55
| Project: WotCrawler-master File: Parser.java View source code |
// -------------------- html parsing --------------------
/**
 * Reads an HTML file, cleans it with the htmlcleaner library, serializes the
 * cleaned tree as XML text and builds an org.w3c.dom.Document from that text.
 *
 * @param file the html file to parse
 * @return org.w3c.dom.Document representation of the cleaned HTML file
 * @throws IOException cannot access the file
 * @throws ParserConfigurationException parser configuration invalid
 * @throws SAXException error while parsing (usually invalid xml)
 */
public static Document parseHTML(File file) throws IOException, ParserConfigurationException, SAXException {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    TagNode root = htmlCleaner.clean(file);
    // Serialize with the same properties the cleaner used while parsing.
    SimpleXmlSerializer serializer = new SimpleXmlSerializer(htmlCleaner.getProperties());
    return buildDOM(serializer.getAsString(root));
}