WebContentParserTest.java example

Explorer

openalexis-master
- alexis-dao
  - src
    - main
      - java
        com
        mothsoft
        alexis
        dao
        DataSetDao.java
        DataSetDaoImpl.java
        DataSetPointDao.java
        DataSetPointDaoImpl.java
        DataSetTypeDao.java
        DataSetTypeDaoImpl.java
        DocumentDao.java
        DocumentDaoImpl.java
        ModelDao.java
        ModelDaoImpl.java
        RssFeedDao.java
        RssFeedDaoImpl.java
        SocialConnectionDao.java
        SocialConnectionDaoImpl.java
        SourceDao.java
        SourceDaoImpl.java
        TermDao.java
        TermDaoImpl.java
        TopicDao.java
        TopicDaoImpl.java
        TweetDao.java
        TweetDaoImpl.java
        UserDao.java
        UserDaoImpl.java
- alexis-domain
  - src
    - main
      - java
        com
        mothsoft
        alexis
        domain
        AssociationType.java
        ByteArrayAsStringFieldBridge.java
        Calculator.java
        DataRange.java
        DataSet.java
        DataSetAggregationAction.java
        DataSetPoint.java
        DataSetType.java
        DateAsLongFieldBridge.java
        DateConstants.java
        Document.java
        DocumentAssociation.java
        DocumentContent.java
        DocumentNamedEntity.java
        DocumentScore.java
        DocumentState.java
        DocumentStateFieldBridge.java
        DocumentTerm.java
        DocumentTermId.java
        DocumentType.java
        DocumentUser.java
        DocumentUserFieldBridge.java
        Edge.java
        FacebookSource.java
        Graph.java
        ImportantNamedEntity.java
        ImportantTerm.java
        Model.java
        ModelState.java
        ModelType.java
        Node.java
        ParsedContent.java
        PartOfSpeech.java
        RssFeed.java
        RssSource.java
        Sentiment.java
        SocialConnection.java
        SocialNetworkType.java
        SortOrder.java
        Source.java
        SourceType.java
        StopWords.java
        TFIDF.java
        Term.java
        TermComparator.java
        TimeUnits.java
        Topic.java
        TopicActivityDataSet.java
        TopicDocument.java
        TopicDocumentFieldBridge.java
        Tweet.java
        TweetFormatter.java
        TweetHashtag.java
        TweetLink.java
        TweetMention.java
        TwitterSource.java
        User.java
        UserApiToken.java
        UserAuthenticationDetails.java
        util
        HttpClientResponse.java
        NetworkingUtil.java
    - test
      - java
        com
        mothsoft
        alexis
        domain
        CalculatorTest.java
        ModelTest.java
- alexis-engine
  - src
    - main
      - java
        com
        mothsoft
        alexis
        engine
        CronTaskTrigger.java
        Task.java
        numeric
        CorrelationCalculator.java
        CorrelationCalculatorImpl.java
        DataSetImporter.java
        President2012DataSetImporter.java
        StockQuoteDataSetImporter.java
        TopicActivityDataSetImporter.java
        predictive
        AbstractModelTrainer.java
        ModelTrainer.java
        OpenNLPMaxentContextBuilder.java
        OpenNLPMaxentModelExecutorTask.java
        OpenNLPMaxentModelTrainerTask.java
        retrieval
        DocumentRetrievalTaskImpl.java
        IntelligentDelay.java
        RetrievalTask.java
        RssRetrievalTaskImpl.java
        TwitterRetrievalTaskImpl.java
        textual
        CompositeTaskImpl.java
        DocumentFeatureContext.java
        DocumentFeatures.java
        LuceneIndexerTask.java
        ParseResponseMessageListener.java
        TFIDFCalculatorImpl.java
        TopicDocumentMatcherImpl.java
        TransactionalCompositeTaskImpl.java
        WebContentParser.java
        WebContentParserImpl.java
    - test
      - java
        com
        mothsoft
        alexis
        engine
        numeric
        CorrelationCalculatorTest.java
        predictive
        OpenNLPMaxentTest.java
        textual
        WebContentParserTest.java
- alexis-rest-api
  - src
    - main
      - java
        com
        mothsoft
        alexis
        rest
        analysis
        v1
        AnalysisResource.java
        Edge.java
        Graph.java
        Node.java
        dataset
        v1
        Correlation.java
        DataSet.java
        DataSetPoint.java
        DataSetResource.java
        document
        v1
        Document.java
        DocumentResource.java
        ImportantTerm.java
        ImportantTerms.java
        Tweet.java
        source
        v1
        Source.java
        SourceResource.java
- alexis-security
  - src
    - main
      - java
        com
        mothsoft
        alexis
        security
        CurrentUserUtil.java
- alexis-service-api
  - src
    - main
      - java
        com
        mothsoft
        alexis
        service
        DataSetService.java
        DocumentService.java
        ModelService.java
        SourceService.java
        TopicService.java
        UserService.java
- alexis-service-impl
  - src
    - main
      - java
        com
        mothsoft
        alexis
        service
        impl
        DataSetServiceImpl.java
        DocumentServiceImpl.java
        ModelServiceImpl.java
        SourceServiceImpl.java
        TopicServiceImpl.java
        UserServiceImpl.java
        security
        AlexisUserDetailsService.java
- alexis-service-war
  - src
    - main
      - java
        com
        mothsoft
        alexis
        rest
        analysis
        v1
        impl
        AnalysisResourceImpl.java
        dataset
        v1
        impl
        DataSetResourceImpl.java
        document
        v1
        impl
        DocumentResourceImpl.java
        source
        v1
        impl
        SourceResourceImpl.java
        service
        exception
        DefaultExceptionMapper.java
        EmptyResultDataAccessExceptionMapper.java
        SecurityExceptionMapper.java
        monitoring
        RequestTimingFilter.java
        scheduler
        StartQuartzTask.java
        security
        AlexisApiAuthenticationProvider.java
- alexis-ui-war
  - src
    - main
      - java
        com
        mothsoft
        alexis
        web
        AddEditModelBackingBean.java
        AddEditSourceBackingBean.java
        AddEditTopicBackingBean.java
        ChartServlet.java
        ChartingBackingBean.java
        CorrelationBackingBean.java
        CurrentUser.java
        DashboardBackingBean.java
        ListDocumentsBackingBean.java
        ListModelsBackingBean.java
        ListSourcesBackingBean.java
        ListTopicsBackingBean.java
        Navigation.java
        SearchBackingBean.java
        SelectSeriesBackingBean.java
        TermPredictorsBackingBean.java
        TermsOfServiceBackingBean.java
        TwitterBackingBean.java
        ValueObject.java
        ViewDocumentDetailsBackingBean.java
        ViewTopicDetailsBackingBean.java
        faces
        HumanReadableBytesConverter.java
        LoginErrorPhaseListener.java
        LuceneSearchExpressionValidator.java
        TweetConverter.java
        logging
        JavaUtilLoggingFactory.java
        security
        AlexisWebAuthenticationProvider.java
        GoogleOauthAuthenticationFilter.java
        GoogleOauthAuthenticationProvider.java
        GoogleOauthServlet.java
        OutboundRestAuthenticationInterceptor.java
        StoreUsernameInSessionFilter.java
        TermsOfServiceFilter.java
- twitter-integration
  - src
    - main
      - java
        com
        mothsoft
        integration
        twitter
        TwitterService.java
        TwitterServiceException.java
        TwitterServiceImpl.java
    - test
      - java
        com
        mothsoft
        integration
        twitter
        TwitterServiceImplTest.java

/*   Copyright 2012 Tim Garrett, Mothsoft LLC
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package com.mothsoft.alexis.engine.textual;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;

import org.junit.Test;

/**
 * Unit tests for {@link WebContentParser}, exercised through
 * {@link WebContentParserImpl}.
 */
public class WebContentParserTest {

    /** Parser under test; JUnit creates a new test-class instance per test method. */
    private final WebContentParser wcp = new WebContentParserImpl();

    /**
     * Plain text supplied as a UTF-8 byte stream is expected to come back
     * unchanged.
     *
     * @throws IOException if the parser fails to read the stream
     */
    @Test
    public void testParseInputStreamText() throws IOException {
        final String text = "Hello, I am a document.";
        final InputStream is = new ByteArrayInputStream(text.getBytes(Charset.forName("UTF-8")));
        assertEquals("Hello, I am a document.", wcp.parse(is));
    }

    // FIXME - Boilerpipe isn't doing a very good job. The first sentence of
    // this article doesn't even make it in with the ArticleExtractor. Consider
    // writing a stack-based parser that tracks probable content tags and
    // discards based on inferred HTML semantic structure rather than trying to
    // do it with Boilerpipe's algorithms
    /**
     * Parses the {@code test-article.html} classpath resource and checks that
     * the extracted text contains a known phrase from the article.
     *
     * @throws IOException if the parser fails to read the stream or the stream
     *             cannot be closed
     */
    @Test
    public void testParseInputStreamHTML() throws IOException {
        final InputStream is = this.getClass().getClassLoader().getResourceAsStream("test-article.html");
        // getResourceAsStream returns null for a missing resource; fail here
        // with a clear message instead of an obscure error inside the parser.
        assertTrue("test-article.html not found on the test classpath", is != null);

        final String document;
        try {
            document = wcp.parse(is);
        } finally {
            // Release the resource stream even when parsing throws.
            is.close();
        }

        // Printed so the extracted text can be inspected by eye (see FIXME above).
        System.out.println(document);
        assertTrue(document.contains("The self-proclaimed mastermind of"));
    }

    /**
     * An HTML fragment passed as a string is expected to come back with its
     * inline tags removed and the text content kept.
     *
     * @throws IOException if the parser reports an I/O failure
     */
    @Test
    public void testParseHTML() throws IOException {
        final String html = "I hate <b>HTML</b> when I am expecting <em>only</em> plain text.";
        assertEquals("I hate HTML when I am expecting only plain text.", wcp.parseHTML(html));
    }

}