/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.parse.text; import org.apache.nutch.protocol.Content; import org.apache.nutch.parse.*; import org.apache.nutch.util.*; import org.apache.hadoop.conf.Configuration; public class TextParser implements Parser { private Configuration conf; /** * Encoding to be used when character set isn't specified * as HTTP header. */ private String defaultEncoding; /** * Parses plain text document. This code uses configured default encoding * {@code parser.character.encoding.default} if character set isn't specified * as HTTP header. */ public ParseResult getParse(Content content) { EncodingDetector detector = new EncodingDetector(conf); detector.autoDetectClues(content, false); String encoding = detector.guessEncoding(content, defaultEncoding); String text; try { text = new String(content.getContent(), encoding); } catch (java.io.UnsupportedEncodingException e) { return new ParseStatus(e) .getEmptyParseResult(content.getUrl(), getConf()); } ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", OutlinkExtractor.getOutlinks(text, getConf()), content.getMetadata()); return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData)); } public void setConf(Configuration conf) { this.conf = conf; defaultEncoding = conf.get("parser.character.encoding.default", "windows-1252"); } public Configuration getConf() { return this.conf; } }