/** ========================================================================
* handytrowel: src/main/java/cli/Main.java
* Command-line interface that executes handytrowel.
* ========================================================================
* Copyright (c) 2014, Asim Ihsan, All rights reserved.
* <http://www.asimihsan.com>
* https://github.com/asimihsan/handytrowel/blob/master/LICENSE
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* ========================================================================
*/
package com.asimihsan.handytrowel.cli;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeoutException;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextDocument;
import de.l3s.boilerpipe.extractors.ArticleExtractor;
import de.l3s.boilerpipe.sax.BoilerpipeSAXInput;
import de.l3s.boilerpipe.sax.HTMLDocument;
import com.asimihsan.handytrowel.extraction.LinkExtractor;
import com.asimihsan.handytrowel.network.HTMLFetcher;
import com.asimihsan.handytrowel.network.HTMLFetcher.HTMLFetcherBuilder;
import com.asimihsan.handytrowel.nlp.TextAnalyzer;
import com.asimihsan.handytrowel.nlp.TextAnalyzer.TextAnalyzerBuilder;
import com.fasterxml.jackson.core.JsonGenerationException;
import com.fasterxml.jackson.databind.JsonMappingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import org.kohsuke.args4j.Argument;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
/**
 * Command-line entry point for handytrowel.
 *
 * <p>Takes a URL as the first positional argument, fetches its page source,
 * runs the boilerpipe article and link extractors over it, analyzes the
 * extracted body with {@link TextAnalyzer}, and writes the result to standard
 * output as indented JSON with the keys {@code extractedBody}, {@code links}
 * and {@code tokens}.
 *
 * <p>Failures are reported by letting the exception propagate out of
 * {@link #main(String[])}; nothing is caught merely to be printed and
 * rethrown, so each failure is reported once.
 */
public class Main {

    /**
     * Timeout handed to the HTML fetcher, in milliseconds (300,000).
     *
     * NOTE(review): the value is written as 30 * 10000, i.e. five minutes. It
     * may have been intended as 30 * 1000 (thirty seconds) -- confirm before
     * changing; the original value is preserved here.
     */
    private static final int FETCH_TIMEOUT_MILLIS = 30 * 10000;

    // Positional arguments
    @Argument
    private List<String> arguments = new ArrayList<>();

    /**
     * Program entry point; delegates to {@link #doMain(String[])} on a fresh instance.
     *
     * @param args raw command-line arguments; the first positional one is the URL
     * @throws SAXException if parsing the fetched HTML fails
     * @throws CmdLineException if the arguments are invalid or no URL was given
     * @throws TimeoutException if fetching the page times out
     * @throws BoilerpipeProcessingException if boilerpipe extraction fails
     * @throws IOException if an I/O error occurs, including while writing the JSON output
     */
    public static void main(String[] args) throws SAXException, CmdLineException, TimeoutException, BoilerpipeProcessingException, IOException {
        new Main().doMain(args);
    }

    /**
     * Runs the whole pipeline: parse arguments, fetch, extract, analyze, print.
     *
     * @param args raw command-line arguments; the first positional one is the URL
     * @throws SAXException if parsing the fetched HTML fails
     * @throws CmdLineException if the arguments are invalid or no URL was given
     * @throws TimeoutException if fetching the page times out
     * @throws BoilerpipeProcessingException if boilerpipe extraction fails
     * @throws IOException if an I/O error occurs, including while writing the JSON output
     */
    public void doMain(String[] args) throws SAXException, CmdLineException, TimeoutException, BoilerpipeProcessingException, IOException {
        final String url = parseUrl(args);
        final String pageSource = fetchPageSource(url);

        final HTMLDocument htmlDoc = new HTMLDocument(pageSource);
        final TextDocument doc = new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
        ArticleExtractor.INSTANCE.process(doc);
        // The link extractor gets its own InputSource over the same document.
        final InputSource is = htmlDoc.toInputSource();
        final List<String> links = LinkExtractor.INSTANCE.process(doc, is);
        // TODO(AI): this starts again from the raw page source; check whether the
        // TextDocument already processed above (or LinkExtractor) can be reused.
        final String extractedBody = ArticleExtractor.INSTANCE.getText(pageSource);

        final TextAnalyzer analyzer = new TextAnalyzerBuilder()
                .body(extractedBody)
                .build()
                .analyze();
        final List<String> tokens = analyzer.getTokens();

        writeJson(extractedBody, links, tokens);
    }

    /** Parses the command line and returns the first positional argument, printing usage to stderr on failure. */
    private String parseUrl(String[] args) throws CmdLineException {
        final CmdLineParser parser = new CmdLineParser(this);
        parser.setUsageWidth(80);
        try {
            parser.parseArgument(args);
            if (arguments.isEmpty()) {
                throw new CmdLineException(parser, "No arguments were given");
            }
        } catch (final CmdLineException e) {
            // This catch adds information (the usage text) before rethrowing,
            // so unlike a bare print-and-rethrow it is kept.
            System.err.println(e.getMessage());
            System.err.println("handytrowel [URL]");
            parser.printUsage(System.err);
            System.err.println();
            throw e;
        }
        // Any further positional arguments are ignored.
        return arguments.get(0);
    }

    /** Fetches the page source for {@code url} using a fetcher configured with {@link #FETCH_TIMEOUT_MILLIS}. */
    private String fetchPageSource(String url) throws TimeoutException {
        final HTMLFetcher htmlFetcher = new HTMLFetcherBuilder()
                .timeoutMillis(FETCH_TIMEOUT_MILLIS)
                .build();
        return htmlFetcher.getPageSource(url);
    }

    /** Serializes the extraction results to standard output as indented JSON. */
    private void writeJson(String extractedBody, List<String> links, List<String> tokens) throws IOException {
        final ObjectMapper mapper = new ObjectMapper();
        mapper.configure(SerializationFeature.INDENT_OUTPUT, true);

        final Map<String, Object> articleData = new HashMap<>();
        articleData.put("extractedBody", extractedBody);
        articleData.put("links", links);
        articleData.put("tokens", tokens);

        // JsonGenerationException and JsonMappingException are IOException
        // subtypes, so the single declared IOException covers all of them.
        mapper.writeValue(System.out, articleData);
    }
}