Java Examples for org.apache.tika.parser.html.HtmlParser

The following java examples will help you to understand the usage of org.apache.tika.parser.html.HtmlParser. These source code samples are taken from different open source projects.

Example 1
Project: trombone-master  File: XmlOrHtmlTikaParser.java View source code
/* (non-Javadoc)
	 * @see org.apache.tika.parser.Parser#parse(java.io.InputStream, org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata, org.apache.tika.parser.ParseContext)
	 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Sniff the stream to decide between the HTML and XML parsers.
    // NOTE(review): detect() requires a stream that supports mark/reset —
    // confirm callers pass one (e.g. a BufferedInputStream/TikaInputStream).
    Detector detector = new DefaultDetector();
    MediaType mediaType = detector.detect(stream, metadata);
    // BUG FIX: compare by value, not identity. detect() is not guaranteed
    // to return the exact MediaType.TEXT_HTML singleton instance, so the
    // original '==' comparison could silently route HTML to the XML parser.
    if (MediaType.TEXT_HTML.equals(mediaType)) {
        new HtmlParser().parse(stream, handler, metadata, context);
    } else {
        new XMLParser().parse(stream, handler, metadata, context);
    }
}
Example 2
Project: tika-master  File: OutlookExtractor.java View source code
/**
 * Extracts an Outlook MAPI message into XHTML: metadata (subject, sender,
 * recipients, headers, dates), the message body (preferring HTML, then
 * compressed RTF, then plain text), and any attachments.
 *
 * @param xhtml    the handler receiving the generated XHTML
 * @param metadata the Tika metadata container to populate
 * @throws TikaException if POI throws ChunkNotFoundException despite
 *         setReturnNullOnMissingChunk(true) — treated as a POI defect
 * @throws SAXException on XHTML generation failure
 * @throws IOException  on read failure
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
    try {
        // Ask POI to return null for absent chunks instead of throwing,
        // so optional fields can be probed without try/catch everywhere.
        msg.setReturnNullOnMissingChunk(true);
        try {
            metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
        } catch (ChunkNotFoundException e) {
            // Message class is optional; ignore when absent.
        }
        // If the message stores its strings with 7-bit encoding rather than
        //  as Unicode, try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }
        // Start with the metadata
        String subject = msg.getSubject();
        Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
        String from = msg.getDisplayFrom();
        handleFromTo(headers, metadata);
        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());
        try {
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null)
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
            }
        } catch (ChunkNotFoundException he) {
            // Recipient list is optional; ignore when absent.
        }
        // Record every raw header under a prefixed metadata key.
        for (Map.Entry<String, String[]> e : headers.entrySet()) {
            String headerKey = e.getKey();
            for (String headerValue : e.getValue()) {
                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
            }
        }
        // First try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else {
            // Fall back to scanning the raw headers for a Date: line.
            if (headers != null && headers.size() > 0) {
                for (Map.Entry<String, String[]> header : headers.entrySet()) {
                    String headerKey = header.getKey();
                    if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
                        // NOTE(review): the date text is taken from the map KEY
                        // after the colon, not from the entry's values — this
                        // assumes normalizeHeaders() keeps whole "Date: ..."
                        // lines as keys. Verify against normalizeHeaders().
                        String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
                        // See if we can parse it as a normal mail date
                        try {
                            Date d = MboxParser.parseDate(date);
                            metadata.set(TikaCoreProperties.CREATED, d);
                            metadata.set(TikaCoreProperties.MODIFIED, d);
                        } catch (ParseException e) {
                            // Unparseable: store the raw date string instead.
                            metadata.set(TikaCoreProperties.CREATED, date);
                            metadata.set(TikaCoreProperties.MODIFIED, date);
                        }
                        break;
                    }
                }
            }
        }
        xhtml.element("h1", subject);
        // Output the from and to details in text, as you
        //  often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException e) {
            // Recipient address is optional; ignore when absent.
        }
        xhtml.endElement("dl");
        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }
        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            // The HTML body may be stored as raw bytes or as a string chunk.
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                // Reuse a configured HtmlParser from the context if one exists.
                Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
                if (htmlParser == null) {
                    htmlParser = new HtmlParser();
                }
                htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
                doneBody = true;
            }
        }
        if (rtfChunk != null && !doneBody) {
            // Decompress the MAPI RTF attribute before handing it to the parser.
            ByteChunk chunk = (ByteChunk) rtfChunk;
            MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue());
            Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
            if (rtfParser == null) {
                rtfParser = new RTFParser();
            }
            rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
            doneBody = true;
        }
        if (textChunk != null && !doneBody) {
            // Last resort: emit the plain-text body directly.
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");
        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");
            // Prefer the long file name; fall back to the short (8.3) name.
            String filename = null;
            if (attachment.getAttachLongFileName() != null) {
                filename = attachment.getAttachLongFileName().getValue();
            } else if (attachment.getAttachFileName() != null) {
                filename = attachment.getAttachFileName().getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }
            if (attachment.getAttachData() != null) {
                handleEmbeddedResource(TikaInputStream.get(attachment.getAttachData().getValue()), filename, null, null, xhtml, true);
            }
            if (attachment.getAttachmentDirectory() != null) {
                // Embedded Office document stored as a nested directory.
                handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
            }
            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    } finally {
    //You'd think you'd want to call msg.close().
    //Don't do that.  That closes down the file system.
    //If an msg has multiple msg attachments, some of them
    //can reside in the same file system.  After the first
    //child is read, the fs is closed, and the other children
    //get a java.nio.channels.ClosedChannelException
    }
}
Example 3
Project: openalexis-master  File: WebContentParserImpl.java View source code
/**
 * Extracts the full text content of an HTML document using Tika's
 * HtmlParser with a Boilerpipe handler configured to keep everything.
 *
 * @param string the raw HTML markup to parse
 * @return the extracted text content
 * @throws IOException if reading or parsing fails
 */
public String parseHTML(final String string) throws IOException {
    final StringBuffer buffer = new StringBuffer();
    final HtmlParser htmlParser = new HtmlParser();
    final BoilerpipeContentHandler handler = new BoilerpipeContentHandler(new FullTextContentHandler(buffer), KeepEverythingExtractor.INSTANCE);
    // Specify the charset explicitly: the single-argument ReaderInputStream
    // constructor falls back to the platform default encoding, making the
    // byte stream (and thus parsing) platform-dependent.
    return parse(htmlParser, new ReaderInputStream(new StringReader(string), java.nio.charset.StandardCharsets.UTF_8), handler, buffer);
}
Example 4
Project: streamflow-core-master  File: Translator.java View source code
/**
 * Converts an HTML document to plain text with Tika's HtmlParser,
 * honoring a charset declared in a {@code meta http-equiv} tag when
 * present and guessing the encoding otherwise.
 *
 * @param html the HTML markup to convert
 * @return the extracted plain text, or the original input if parsing fails
 */
public static String htmlToText(String html) {
    String result = html;
    String encoding = "";
    // if HTML contains encoding information we do not have to guess encoding!
    Document doc = Jsoup.parse(result);
    Element meta = doc.select("meta[http-equiv]").first();
    if (meta != null) {
        String contentString = meta.attr("content");
        // Normalize "text/html charset=x" to "text/html;charset=x" so
        // ContentType can parse the parameter.
        contentString = contentString.indexOf(' ') != -1 ? contentString.replace(' ', ';') : contentString;
        ContentType contentType = new ContentType(contentString);
        encoding = contentType.getCharacterSet() != null ? contentType.getCharacterSet().getName() : "";
    }
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try {
        // if we already found an encoding - don't guess
        new HtmlParser().parse(IOUtils.toInputStream(result, (!Strings.empty(encoding) ? encoding : guessEncoding(result))), handler, metadata, new ParseContext());
        result = handler.toString();
        // Replace en dashes with plain hyphens: Pdfbox COSString would
        // interpret a string containing the dash as UTF-16.
        // BUG FIX: the original guarded this with indexOf("–") != 1, a typo
        // for != -1 that skipped the replacement exactly when the dash sat at
        // index 1. replace() is already a no-op when the dash is absent, so
        // no guard is needed.
        result = result.replace("–", "-");
    } catch (Exception e) {
        // Best-effort conversion: on any failure return the input unchanged.
        // NOTE(review): consider at least logging the exception.
    }
    return result;
}
Example 5
Project: book-master  File: TikaTest.java View source code
/**
 * Demonstrates parsing an HTML document with Tika: body text is captured
 * with a BodyContentHandler, links with a LinkContentHandler, and the two
 * are combined through a TeeContentHandler.
 */
@Test
public void testHtml() throws Exception {
    String html = "<html><head><title>The Big Brown Shoe</title></head><body><p>The best pizza place " + "in the US is <a href=\"http://antoniospizzas.com/\">Antonio's Pizza</a>.</p>" + "<p>It is located in Amherst, MA.</p></body></html>";
    //<start id="tika-html"/>
    InputStream htmlStream = new ByteArrayInputStream(html.getBytes(Charset.forName("UTF-8")));
    //<co id="html.text.co"/>
    ContentHandler bodyText = new BodyContentHandler();
    //<co id="html.link.co"/>
    LinkContentHandler linkCollector = new LinkContentHandler();
    //<co id="html.merge"/>
    ContentHandler combined = new TeeContentHandler(linkCollector, bodyText);
    //<co id="html.store"/>
    Metadata docMetadata = new Metadata();
    //<co id="html.parser"/>
    Parser htmlParser = new HtmlParser();
    ParseContext parseContext = new ParseContext();
    //<co id="html.parse"/>
    htmlParser.parse(htmlStream, combined, docMetadata, parseContext);
    System.out.println("Title: " + docMetadata.get(Metadata.TITLE));
    System.out.println("Body: " + bodyText.toString());
    System.out.println("Links: " + linkCollector.getLinks());
/*
    <calloutlist>
        <callout arearefs="html.text.co"><para>Construct a ContentHandler that will just extract between the body tags.</para></callout>
        <callout arearefs="html.link.co"><para>Construct ContentHandler that knows about HTML links</para></callout>
        <callout arearefs="html.merge"><para>Wrap up our ContentHandlers into one</para></callout>
        <callout arearefs="html.store"><para>Metadata is a simple storage mechanism where the extracted metadata gets stored</para></callout>
        <callout arearefs="html.parser"><para>We know the input is HTML, so construct a Parser to parse it</para></callout>
        <callout arearefs="html.parse"><para>Do the parse</para></callout>
    </calloutlist>
    */
//<end id="tika-html"/>
}
Example 6
Project: collector-http-master  File: TikaLinkExtractor.java View source code
@Override
public Set<com.norconex.collector.http.url.Link> extractLinks(InputStream is, String url, ContentType contentType) throws IOException {
    LinkContentHandler linkHandler = new LinkContentHandler();
    Metadata metadata = new Metadata();
    ParseContext parseContext = new ParseContext();
    parseContext.set(HtmlMapper.class, fixedHtmlMapper);
    HtmlParser parser = new HtmlParser();
    try {
        parser.parse(is, linkHandler, metadata, parseContext);
        IOUtils.closeQuietly(is);
        List<Link> tikaLinks = linkHandler.getLinks();
        Set<com.norconex.collector.http.url.Link> nxLinks = new HashSet<>(tikaLinks.size());
        for (Link tikaLink : tikaLinks) {
            if (!isIgnoreNofollow() && "nofollow".equalsIgnoreCase(StringUtils.trim(tikaLink.getRel()))) {
                continue;
            }
            String extractedURL = tikaLink.getUri();
            if (StringUtils.isBlank(extractedURL)) {
                continue;
            } else if (extractedURL.startsWith("?")) {
                extractedURL = url + extractedURL;
            } else if (extractedURL.startsWith("#")) {
                extractedURL = url + extractedURL;
            } else {
                extractedURL = resolve(url, extractedURL);
            }
            if (StringUtils.isNotBlank(extractedURL)) {
                com.norconex.collector.http.url.Link nxLink = new com.norconex.collector.http.url.Link(extractedURL);
                nxLink.setReferrer(url);
                if (StringUtils.isNotBlank(tikaLink.getText())) {
                    nxLink.setText(tikaLink.getText());
                }
                if (tikaLink.isAnchor()) {
                    nxLink.setTag("a.href");
                } else if (tikaLink.isImage()) {
                    nxLink.setTag("img.src");
                }
                if (StringUtils.isNotBlank(tikaLink.getTitle())) {
                    nxLink.setTitle(tikaLink.getTitle());
                }
                nxLinks.add(nxLink);
            }
        }
        //grab refresh URL from metadata (if present)
        String refreshURL = getCaseInsensitive(metadata, "refresh");
        if (StringUtils.isNotBlank(refreshURL)) {
            Matcher matcher = META_REFRESH_PATTERN.matcher(refreshURL);
            if (matcher.find()) {
                refreshURL = matcher.group(URL_PATTERN_GROUP_URL);
            }
            refreshURL = resolve(url, refreshURL);
            if (StringUtils.isNotBlank(refreshURL)) {
                com.norconex.collector.http.url.Link nxLink = new com.norconex.collector.http.url.Link(refreshURL);
                nxLink.setReferrer(url);
                nxLinks.add(nxLink);
            }
        }
        return nxLinks;
    } catch (TikaExceptionSAXException |  e) {
        throw new IOException("Could not parse to extract URLs: " + url, e);
    }
}
Example 7
Project: lutece-core-master  File: PageIndexer.java View source code
/**
     * Builds a document which will be used by Lucene during the indexing of the pages of the site with the following fields : summary, uid, url, contents,
     * title and description.
     * 
     * @return the built Document
     * @param strUrl
     *            The base URL for documents
     * @param page
     *            the page to index
     * @throws IOException
     *             The IO Exception
     * @throws InterruptedException
     *             The InterruptedException
     * @throws SiteMessageException
     *             occurs when a site message need to be displayed
     */
protected Document getDocument(Page page, String strUrl) throws IOException, InterruptedException, SiteMessageException {
    FieldType ft = new FieldType(StringField.TYPE_STORED);
    ft.setOmitNorms(false);
    FieldType ftNotStored = new FieldType(StringField.TYPE_NOT_STORED);
    ftNotStored.setOmitNorms(false);
    ftNotStored.setTokenized(false);
    // make a new, empty document
    Document doc = new Document();
    // Add the url as a field named "url". Use an UnIndexed field, so
    // that the url is just stored with the document, but is not searchable.
    doc.add(new Field(SearchItem.FIELD_URL, strUrl, ft));
    // Add the last modified date of the file a field named "modified".
    // Use a field that is indexed (i.e. searchable), but don't tokenize
    // the field into words.
    String strDate = DateTools.dateToString(page.getDateUpdate(), DateTools.Resolution.DAY);
    doc.add(new Field(SearchItem.FIELD_DATE, strDate, ft));
    // Add the uid as a field, so that index can be incrementally maintained.
    // This field is not stored with document, it is indexed, but it is not
    // tokenized prior to indexing.
    String strIdPage = String.valueOf(page.getId());
    doc.add(new Field(SearchItem.FIELD_UID, strIdPage, ftNotStored));
    String strPageContent = _pageService.getPageContent(page.getId(), 0, null);
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try {
        // Encode explicitly as UTF-8: the original used the platform default
        // charset, making indexing results platform-dependent.
        new HtmlParser().parse(new ByteArrayInputStream(strPageContent.getBytes(java.nio.charset.StandardCharsets.UTF_8)), handler, metadata, new ParseContext());
    } catch (SAXException | TikaException e) {
        // Merged the two identical catch blocks into one multi-catch.
        // NOTE(review): the cause is dropped here; pass 'e' along if
        // AppException has a (String, Throwable) constructor.
        throw new AppException("Error during page parsing.");
    }
    // the content of the article is recovered in the parser because this one
    // had replaced the encoded caracters (as &eacute;) by the corresponding special caracter
    StringBuilder sb = new StringBuilder(handler.toString());
    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    StringBuilder sbFieldContent = new StringBuilder();
    StringBuilder sbFieldMetadata = new StringBuilder();
    sbFieldContent.append(page.getName()).append(" ").append(sb.toString());
    // Add the metadata description of the page if it exists
    if (page.getDescription() != null) {
        sbFieldContent.append(" ").append(page.getDescription());
    }
    // Add the metadata keywords of the page if it exists
    String strMetaKeywords = page.getMetaKeywords();
    if (StringUtils.isNotBlank(strMetaKeywords)) {
        sbFieldContent.append(" ").append(strMetaKeywords);
        sbFieldMetadata.append(strMetaKeywords);
    }
    doc.add(new Field(SearchItem.FIELD_CONTENTS, sbFieldContent.toString(), TextField.TYPE_NOT_STORED));
    if (StringUtils.isNotBlank(page.getMetaDescription())) {
        if (sbFieldMetadata.length() > 0) {
            sbFieldMetadata.append(" ");
        }
        sbFieldMetadata.append(page.getMetaDescription());
    }
    if (sbFieldMetadata.length() > 0) {
        doc.add(new StringField(SearchItem.FIELD_METADATA, sbFieldMetadata.toString(), Field.Store.NO));
    }
    // Add the title as a separate Text field, so that it can be searched
    // separately.
    doc.add(new Field(SearchItem.FIELD_TITLE, page.getName(), ft));
    if (StringUtils.isNotBlank(page.getDescription())) {
        // Add the summary as an UnIndexed field, so that it is stored and returned
        // with hit documents for display.
        doc.add(new StoredField(SearchItem.FIELD_SUMMARY, page.getDescription()));
    }
    doc.add(new Field(SearchItem.FIELD_TYPE, INDEX_TYPE_PAGE, ft));
    doc.add(new Field(SearchItem.FIELD_ROLE, page.getRole(), ft));
    // return the document
    return doc;
}
Example 8
Project: oodt-master  File: ImapsProtocol.java View source code
/**
 * Recursively extracts textual content from a MIME part: descends into
 * multipart containers, parses text/html parts with Tika's HtmlParser,
 * and follows nested Part content.
 *
 * @param p the MIME part to extract from
 * @return the concatenated text content of the part tree
 * @throws MessagingException if the part's structure cannot be read
 * @throws IOException on read failure
 * @throws SAXException on content-handler failure
 * @throws TikaException if HTML parsing fails
 */
private String getContentFromHTML(Part p) throws MessagingException, IOException, SAXException, TikaException {
    StringBuilder content = new StringBuilder();
    if (p.isMimeType("multipart/*")) {
        // Recurse into each body part of the multipart container.
        Multipart mp = (Multipart) p.getContent();
        int count = mp.getCount();
        for (int i = 0; i < count; i++) {
            content.append(getContentFromHTML(mp.getBodyPart(i)));
        }
    } else if (p.isMimeType("text/html")) {
        HtmlParser parser = new HtmlParser();
        Metadata met = new Metadata();
        TextContentHandler handler = new TextContentHandler(new BodyContentHandler());
        // Encode explicitly as UTF-8 rather than the platform default so
        // the byte stream handed to Tika is deterministic across platforms.
        parser.parse(new ByteArrayInputStream(((String) p.getContent()).getBytes(java.nio.charset.StandardCharsets.UTF_8)), handler, met);
        content.append(handler.toString());
    } else {
        // Fetch the content once and reuse it (the original called
        // getContent() a second time for the recursive call).
        Object obj = p.getContent();
        if (obj instanceof Part) {
            content.append(getContentFromHTML((Part) obj));
        }
    }
    return content.toString();
}
Example 9
Project: carewebframework-core-master  File: HelpSearchService.java View source code
/**
     * Initialize the index writer: resolve the on-disk index location, set up
     * the tracker and FSDirectory over it, create a Tika facade that parses
     * help content with an HtmlParser, and open the Lucene IndexWriter with a
     * standard analyzer.
     * 
     * @throws IOException Unspecified IO exception.
     */
public void init() throws IOException {
    File indexPath = resolveIndexDirectoryPath();
    indexTracker = new IndexTracker(indexPath);
    indexDirectory = FSDirectory.open(indexPath);
    tika = new Tika(null, new HtmlParser());
    IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LATEST, new StandardAnalyzer());
    writer = new IndexWriter(indexDirectory, writerConfig);
}