Java Examples for org.apache.tika.parser.html.HtmlParser
The following Java examples will help you understand the usage of org.apache.tika.parser.html.HtmlParser. These source code samples are taken from various open source projects.
Example 1
| Project: trombone-master File: XmlOrHtmlTikaParser.java View source code |
/* (non-Javadoc)
* @see org.apache.tika.parser.Parser#parse(java.io.InputStream, org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata, org.apache.tika.parser.ParseContext)
*/
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Sniff the media type first, then dispatch to the HTML or XML parser.
    // NOTE(review): detect() reads from the stream and rewinds it, so this
    // assumes the stream supports mark/reset (e.g. TikaInputStream or a
    // BufferedInputStream) -- confirm what callers pass in.
    Detector detector = new DefaultDetector();
    MediaType mediaType = detector.detect(stream, metadata);
    // FIX: compare by value, not identity. detect() is not guaranteed to
    // return the exact MediaType.TEXT_HTML singleton instance, so '=='
    // could silently route HTML content to the XML parser.
    if (MediaType.TEXT_HTML.equals(mediaType)) {
        new HtmlParser().parse(stream, handler, metadata, context);
    } else {
        new XMLParser().parse(stream, handler, metadata, context);
    }
}
Example 2
| Project: tika-master File: OutlookExtractor.java View source code |
/**
 * Walks the MAPI message held in the {@code msg} field, emitting an XHTML view
 * of it (subject, address headers, body, attachments) and populating Tika
 * metadata (title, dates, recipients, raw headers).
 *
 * @param xhtml receives the XHTML rendering of the message
 * @param metadata populated with the message's metadata
 * @throws TikaException if POI still throws ChunkNotFoundException despite
 *         being told to return null for missing chunks
 * @throws SAXException on XHTML generation errors
 * @throws IOException on read errors from the message
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
try {
// Have POI return null for absent chunks instead of throwing.
msg.setReturnNullOnMissingChunk(true);
try {
metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
} catch (ChunkNotFoundException e) {
// message class chunk absent -- leave that property unset
}
// If the message contains strings that aren't stored
// as Unicode, try to sort out an encoding for them
if (msg.has7BitEncodingStrings()) {
guess7BitEncoding(msg);
}
// Start with the metadata
String subject = msg.getSubject();
Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
String from = msg.getDisplayFrom();
handleFromTo(headers, metadata);
metadata.set(TikaCoreProperties.TITLE, subject);
// TODO: Move to description in Tika 2.0
metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());
try {
for (String recipientAddress : msg.getRecipientEmailAddressList()) {
if (recipientAddress != null)
metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
}
} catch (ChunkNotFoundException he) {
// no recipient chunks -- nothing to record
}
// Copy every raw header through under a prefixed key so nothing is lost.
for (Map.Entry<String, String[]> e : headers.entrySet()) {
String headerKey = e.getKey();
for (String headerValue : e.getValue()) {
metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
}
}
// Date handling: first try via the proper chunk
if (msg.getMessageDate() != null) {
metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
} else {
// Fall back to a "Date:" mail header, if any.
if (headers != null && headers.size() > 0) {
for (Map.Entry<String, String[]> header : headers.entrySet()) {
String headerKey = header.getKey();
if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
// NOTE(review): the date text is taken from the header *key*, which
// assumes normalizeHeaders() keeps whole "Date: ..." lines as keys
// -- confirm against normalizeHeaders().
String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
// See if we can parse it as a normal mail date
try {
Date d = MboxParser.parseDate(date);
metadata.set(TikaCoreProperties.CREATED, d);
metadata.set(TikaCoreProperties.MODIFIED, d);
} catch (ParseException e) {
// unparseable -- store the raw string instead
metadata.set(TikaCoreProperties.CREATED, date);
metadata.set(TikaCoreProperties.MODIFIED, date);
}
break;
}
}
}
}
xhtml.element("h1", subject);
// Output the from and to details in text, as you
// often want them in text form for searching
xhtml.startElement("dl");
if (from != null) {
header(xhtml, "From", from);
}
header(xhtml, "To", msg.getDisplayTo());
header(xhtml, "Cc", msg.getDisplayCC());
header(xhtml, "Bcc", msg.getDisplayBCC());
try {
header(xhtml, "Recipients", msg.getRecipientEmailAddress());
} catch (ChunkNotFoundException e) {
// recipient address chunk absent -- skip that row
}
xhtml.endElement("dl");
// Get the message body. Preference order is: html, rtf, text
Chunk htmlChunk = null;
Chunk rtfChunk = null;
Chunk textChunk = null;
for (Chunk chunk : msg.getMainChunks().getChunks()) {
if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
htmlChunk = chunk;
}
if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
rtfChunk = chunk;
}
if (chunk.getChunkId() == MAPIProperty.BODY.id) {
textChunk = chunk;
}
}
// Render at most one body representation, in the preference order above.
boolean doneBody = false;
xhtml.startElement("div", "class", "message-body");
if (htmlChunk != null) {
byte[] data = null;
if (htmlChunk instanceof ByteChunk) {
data = ((ByteChunk) htmlChunk).getValue();
} else if (htmlChunk instanceof StringChunk) {
data = ((StringChunk) htmlChunk).getRawValue();
}
if (data != null) {
// Reuse a caller-configured HtmlParser from the ParseContext when present.
Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
if (htmlParser == null) {
htmlParser = new HtmlParser();
}
htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
doneBody = true;
}
}
if (rtfChunk != null && !doneBody) {
ByteChunk chunk = (ByteChunk) rtfChunk;
// Unwrap the MAPI-compressed RTF before handing it to the RTF parser.
MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue());
Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
if (rtfParser == null) {
rtfParser = new RTFParser();
}
rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
doneBody = true;
}
if (textChunk != null && !doneBody) {
xhtml.element("p", ((StringChunk) textChunk).getValue());
}
xhtml.endElement("div");
// Process the attachments
for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
xhtml.startElement("div", "class", "attachment-entry");
// Prefer the long file name; fall back to the short one.
String filename = null;
if (attachment.getAttachLongFileName() != null) {
filename = attachment.getAttachLongFileName().getValue();
} else if (attachment.getAttachFileName() != null) {
filename = attachment.getAttachFileName().getValue();
}
if (filename != null && filename.length() > 0) {
xhtml.element("h1", filename);
}
if (attachment.getAttachData() != null) {
// Plain binary attachment payload.
handleEmbeddedResource(TikaInputStream.get(attachment.getAttachData().getValue()), filename, null, null, xhtml, true);
}
if (attachment.getAttachmentDirectory() != null) {
// Embedded OLE/Office document attachment.
handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
}
xhtml.endElement("div");
}
} catch (ChunkNotFoundException e) {
throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
} finally {
//You'd think you'd want to call msg.close().
//Don't do that. That closes down the file system.
//If an msg has multiple msg attachments, some of them
//can reside in the same file system. After the first
//child is read, the fs is closed, and the other children
//get a java.nio.channels.ClosedChannelException
}
}Example 3
| Project: openalexis-master File: WebContentParserImpl.java View source code |
/**
 * Extracts the text content of the given HTML string.
 *
 * @param string the HTML markup to convert to text
 * @return the extracted full text
 * @throws IOException if reading or parsing the HTML fails
 */
public String parseHTML(final String string) throws IOException {
    // Text accumulates here; FullTextContentHandler appends into this buffer.
    final StringBuffer textSink = new StringBuffer();
    // KeepEverythingExtractor: convert HTML to text without dropping any
    // "boilerplate" regions.
    final BoilerpipeContentHandler contentHandler =
            new BoilerpipeContentHandler(new FullTextContentHandler(textSink), KeepEverythingExtractor.INSTANCE);
    return parse(new HtmlParser(), new ReaderInputStream(new StringReader(string)), contentHandler, textSink);
}
Example 4
| Project: streamflow-core-master File: Translator.java View source code |
/**
 * Converts an HTML document to plain text using Tika's HtmlParser.
 * If the HTML declares its own encoding in a {@code meta http-equiv} tag that
 * encoding is used; otherwise the encoding is guessed. On any failure the
 * original HTML string is returned unchanged (deliberate best effort).
 *
 * @param html the HTML markup to convert
 * @return the extracted plain text, or the input unchanged on failure
 */
public static String htmlToText(String html) {
    String result = html;
    String encoding = "";
    // if HTML contains encoding information we do not have to guess encoding!
    Document doc = Jsoup.parse(result);
    Element meta = doc.select("meta[http-equiv]").first();
    if (meta != null) {
        String contentString = meta.attr("content");
        // ContentType expects ';'-separated parameters; tolerate space-separated input.
        contentString = contentString.indexOf(' ') != -1 ? contentString.replace(' ', ';') : contentString;
        ContentType contentType = new ContentType(contentString);
        encoding = contentType.getCharacterSet() != null ? contentType.getCharacterSet().getName() : "";
    }
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try {
        // if we already found an encoding - don't guess
        new HtmlParser().parse(IOUtils.toInputStream(result, (!Strings.empty(encoding) ? encoding : guessEncoding(result))), handler, metadata, new ParseContext());
        result = handler.toString();
        // Replace en dashes since Pdfbox COSString would interpret a string
        // containing the dash as UTF-16.
        // FIX: the original condition was indexOf("–") != 1 (typo for -1),
        // which skipped the replacement exactly when the dash sat at index 1;
        // test for presence correctly.
        if (result.indexOf("–") != -1) {
            result = result.replace("–", "-");
        }
    } catch (Exception ignored) {
        // Deliberate best effort: on any parse failure fall through and return
        // the raw HTML unchanged rather than propagating.
    }
    return result;
}
Example 5
| Project: book-master File: TikaTest.java View source code |
/**
 * Demonstrates parsing an in-memory HTML document with Tika's HtmlParser,
 * collecting body text and hyperlinks in a single pass by teeing two
 * ContentHandlers together. The {@code //<co .../>} comments are callout
 * markers consumed by the book build and must not be altered.
 *
 * @throws Exception on any parse failure
 */
@Test
public void testHtml() throws Exception {
String html = "<html><head><title>The Big Brown Shoe</title></head><body><p>The best pizza place " + "in the US is <a href=\"http://antoniospizzas.com/\">Antonio's Pizza</a>.</p>" + "<p>It is located in Amherst, MA.</p></body></html>";
//<start id="tika-html"/>
InputStream input = new ByteArrayInputStream(html.getBytes(Charset.forName("UTF-8")));
//<co id="html.text.co"/>
ContentHandler text = new BodyContentHandler();
//<co id="html.link.co"/>
LinkContentHandler links = new LinkContentHandler();
//<co id="html.merge"/>
ContentHandler handler = new TeeContentHandler(links, text);
//<co id="html.store"/>
Metadata metadata = new Metadata();
//<co id="html.parser"/>
Parser parser = new HtmlParser();
ParseContext context = new ParseContext();
//<co id="html.parse"/>
parser.parse(input, handler, metadata, context);
System.out.println("Title: " + metadata.get(Metadata.TITLE));
System.out.println("Body: " + text.toString());
System.out.println("Links: " + links.getLinks());
/*
<calloutlist>
<callout arearefs="html.text.co"><para>Construct a ContentHandler that will just extract between the body tags.</para></callout>
<callout arearefs="html.link.co"><para>Construct ContentHandler that knows about HTML links</para></callout>
<callout arearefs="html.merge"><para>Wrap up our ContentHandlers into one</para></callout>
<callout arearefs="html.store"><para>Metadata is a simple storage mechanism where the extracted metadata gets stored</para></callout>
<callout arearefs="html.parser"><para>We know the input is HTML, so construct a Parser to parse it</para></callout>
<callout arearefs="html.parse"><para>Do the parse</para></callout>
</calloutlist>
*/
//<end id="tika-html"/>
}Example 6
| Project: collector-http-master File: TikaLinkExtractor.java View source code |
/**
 * Extracts hyperlinks from an HTML document using Tika's HtmlParser with a
 * LinkContentHandler, resolving relative URLs against {@code url}, and also
 * picks up any meta-refresh redirect target from the parse metadata.
 *
 * @param is HTML content stream (closed by this method after parsing)
 * @param url the document URL, used as referrer and base for URL resolution
 * @param contentType the document content type (not consulted here)
 * @return the set of extracted links, never null
 * @throws IOException if Tika fails to parse the document
 */
@Override
public Set<com.norconex.collector.http.url.Link> extractLinks(InputStream is, String url, ContentType contentType) throws IOException {
    LinkContentHandler linkHandler = new LinkContentHandler();
    Metadata metadata = new Metadata();
    ParseContext parseContext = new ParseContext();
    // NOTE(review): presumably a custom HtmlMapper so that link-bearing
    // elements survive Tika's default element mapping -- confirm.
    parseContext.set(HtmlMapper.class, fixedHtmlMapper);
    HtmlParser parser = new HtmlParser();
    try {
        parser.parse(is, linkHandler, metadata, parseContext);
        IOUtils.closeQuietly(is);
        List<Link> tikaLinks = linkHandler.getLinks();
        Set<com.norconex.collector.http.url.Link> nxLinks = new HashSet<>(tikaLinks.size());
        for (Link tikaLink : tikaLinks) {
            // Honor rel="nofollow" unless configured to ignore it.
            if (!isIgnoreNofollow() && "nofollow".equalsIgnoreCase(StringUtils.trim(tikaLink.getRel()))) {
                continue;
            }
            String extractedURL = tikaLink.getUri();
            if (StringUtils.isBlank(extractedURL)) {
                continue;
            } else if (extractedURL.startsWith("?") || extractedURL.startsWith("#")) {
                // Query-only or fragment-only link: relative to the page itself.
                // (Merged the two identical branches of the original.)
                extractedURL = url + extractedURL;
            } else {
                extractedURL = resolve(url, extractedURL);
            }
            if (StringUtils.isNotBlank(extractedURL)) {
                com.norconex.collector.http.url.Link nxLink = new com.norconex.collector.http.url.Link(extractedURL);
                nxLink.setReferrer(url);
                if (StringUtils.isNotBlank(tikaLink.getText())) {
                    nxLink.setText(tikaLink.getText());
                }
                if (tikaLink.isAnchor()) {
                    nxLink.setTag("a.href");
                } else if (tikaLink.isImage()) {
                    nxLink.setTag("img.src");
                }
                if (StringUtils.isNotBlank(tikaLink.getTitle())) {
                    nxLink.setTitle(tikaLink.getTitle());
                }
                nxLinks.add(nxLink);
            }
        }
        // grab refresh URL from metadata (if present)
        String refreshURL = getCaseInsensitive(metadata, "refresh");
        if (StringUtils.isNotBlank(refreshURL)) {
            Matcher matcher = META_REFRESH_PATTERN.matcher(refreshURL);
            if (matcher.find()) {
                refreshURL = matcher.group(URL_PATTERN_GROUP_URL);
            }
            refreshURL = resolve(url, refreshURL);
            if (StringUtils.isNotBlank(refreshURL)) {
                com.norconex.collector.http.url.Link nxLink = new com.norconex.collector.http.url.Link(refreshURL);
                nxLink.setReferrer(url);
                nxLinks.add(nxLink);
            }
        }
        return nxLinks;
    } catch (TikaException | SAXException e) {
        // FIX: the original catch clause was garbled
        // ("catch (TikaExceptionSAXException | e)") and did not compile;
        // catch both parser exception types and preserve the cause.
        throw new IOException("Could not parse to extract URLs: " + url, e);
    }
}
Example 7
| Project: lutece-core-master File: PageIndexer.java View source code |
/**
 * Builds a document which will be used by Lucene during the indexing of the
 * pages of the site, with the following fields: summary, uid, url, contents,
 * title and description.
 *
 * @param page the page to index
 * @param strUrl the base URL for documents
 * @return the built Document
 * @throws IOException on IO errors
 * @throws InterruptedException if indexing is interrupted
 * @throws SiteMessageException occurs when a site message needs to be displayed
 */
protected Document getDocument(Page page, String strUrl) throws IOException, InterruptedException, SiteMessageException {
    FieldType ft = new FieldType(StringField.TYPE_STORED);
    ft.setOmitNorms(false);
    FieldType ftNotStored = new FieldType(StringField.TYPE_NOT_STORED);
    ftNotStored.setOmitNorms(false);
    ftNotStored.setTokenized(false);
    // make a new, empty document
    Document doc = new Document();
    // Add the url as a field named "url": stored so the url travels with the
    // document, but not tokenized for search.
    doc.add(new Field(SearchItem.FIELD_URL, strUrl, ft));
    // Add the last modified date of the page as a field named "modified".
    // Indexed (i.e. searchable), but not tokenized into words.
    String strDate = DateTools.dateToString(page.getDateUpdate(), DateTools.Resolution.DAY);
    doc.add(new Field(SearchItem.FIELD_DATE, strDate, ft));
    // Add the uid as a field, so that index can be incrementally maintained.
    // This field is not stored with document, it is indexed, but it is not
    // tokenized prior to indexing.
    String strIdPage = String.valueOf(page.getId());
    doc.add(new Field(SearchItem.FIELD_UID, strIdPage, ftNotStored));
    String strPageContent = _pageService.getPageContent(page.getId(), 0, null);
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try {
        // Strip the HTML tags; the parser also decodes entities (e.g. &eacute;)
        // back into their characters so the index holds readable text.
        // NOTE(review): getBytes() uses the platform default charset -- confirm
        // the page encoding before making it explicit, since HtmlParser
        // re-detects the encoding from the bytes.
        new HtmlParser().parse(new ByteArrayInputStream(strPageContent.getBytes()), handler, metadata, new ParseContext());
    } catch (SAXException | TikaException e) {
        // FIX: multi-catch replaces two duplicated blocks, and the cause is
        // now preserved (the original discarded the underlying exception).
        throw new AppException("Error during page parsing.", e);
    }
    StringBuilder sb = new StringBuilder(handler.toString());
    // Add the tag-stripped contents as a tokenized, indexed field.
    StringBuilder sbFieldContent = new StringBuilder();
    StringBuilder sbFieldMetadata = new StringBuilder();
    sbFieldContent.append(page.getName()).append(" ").append(sb.toString());
    // Add the metadata description of the page if it exists
    if (page.getDescription() != null) {
        sbFieldContent.append(" ").append(page.getDescription());
    }
    // Add the metadata keywords of the page if it exists
    String strMetaKeywords = page.getMetaKeywords();
    if (StringUtils.isNotBlank(strMetaKeywords)) {
        sbFieldContent.append(" ").append(strMetaKeywords);
        sbFieldMetadata.append(strMetaKeywords);
    }
    doc.add(new Field(SearchItem.FIELD_CONTENTS, sbFieldContent.toString(), TextField.TYPE_NOT_STORED));
    if (StringUtils.isNotBlank(page.getMetaDescription())) {
        if (sbFieldMetadata.length() > 0) {
            sbFieldMetadata.append(" ");
        }
        sbFieldMetadata.append(page.getMetaDescription());
    }
    if (sbFieldMetadata.length() > 0) {
        doc.add(new StringField(SearchItem.FIELD_METADATA, sbFieldMetadata.toString(), Field.Store.NO));
    }
    // Add the title as a separate Text field, so that it can be searched
    // separately.
    doc.add(new Field(SearchItem.FIELD_TITLE, page.getName(), ft));
    if (StringUtils.isNotBlank(page.getDescription())) {
        // Add the summary as a stored-only field, so that it is returned
        // with hit documents for display.
        doc.add(new StoredField(SearchItem.FIELD_SUMMARY, page.getDescription()));
    }
    doc.add(new Field(SearchItem.FIELD_TYPE, INDEX_TYPE_PAGE, ft));
    doc.add(new Field(SearchItem.FIELD_ROLE, page.getRole(), ft));
    // return the document
    return doc;
}
Example 8
| Project: oodt-master File: ImapsProtocol.java View source code |
/**
 * Recursively extracts the plain-text content from a mail part, converting
 * any text/html parts to text with Tika's HtmlParser.
 *
 * @param p the mail part (possibly multipart) to extract from
 * @return the concatenated text content of all nested parts
 * @throws MessagingException on JavaMail access errors
 * @throws IOException on content read errors
 * @throws SAXException on HTML handler errors
 * @throws TikaException on HTML parse errors
 */
private String getContentFromHTML(Part p) throws MessagingException, IOException, SAXException, TikaException {
    // FIX: new StringBuilder("") allocated a pointless empty seed string.
    StringBuilder content = new StringBuilder();
    if (p.isMimeType("multipart/*")) {
        // Multipart: recurse into each body part and concatenate the results.
        Multipart mp = (Multipart) p.getContent();
        int count = mp.getCount();
        for (int i = 0; i < count; i++) {
            content.append(getContentFromHTML(mp.getBodyPart(i)));
        }
    } else if (p.isMimeType("text/html")) {
        HtmlParser parser = new HtmlParser();
        Metadata met = new Metadata();
        TextContentHandler handler = new TextContentHandler(new BodyContentHandler());
        // NOTE(review): getBytes() uses the platform default charset; the
        // part's declared charset would be safer -- confirm before changing.
        parser.parse(new ByteArrayInputStream(((String) p.getContent()).getBytes()), handler, met);
        content.append(handler.toString());
    } else {
        Object obj = p.getContent();
        if (obj instanceof Part) {
            // FIX: reuse the content already fetched above instead of calling
            // p.getContent() a second time as the original did.
            content.append(getContentFromHTML((Part) obj));
        }
    }
    return content.toString();
}
Example 9
| Project: carewebframework-core-master File: HelpSearchService.java View source code |
/**
 * Initialize the index writer.
 *
 * @throws IOException Unspecified IO exception.
 */
public void init() throws IOException {
    final File indexPath = resolveIndexDirectoryPath();
    indexTracker = new IndexTracker(indexPath);
    indexDirectory = FSDirectory.open(indexPath);
    // Tika facade configured with an HtmlParser for content extraction.
    tika = new Tika(null, new HtmlParser());
    final IndexWriterConfig writerConfig =
            new IndexWriterConfig(Version.LATEST, new StandardAnalyzer());
    writer = new IndexWriter(indexDirectory, writerConfig);
}