/*
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.reuters;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.ParseException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Extract all the documents from a Reuters-21587 corpus in SGML format. The SGML files are expected
* to reside in a single directory.
* <p>
* This is an adaption of the {@code ExtractReuters} class in the {@code lucene-benchmarks} package.
*
* @see <a href="http://lucene.apache.org/core/5_3_1/benchmark/org/apache/lucene/benchmark/utils/ExtractReuters.html">ExtractReuters</a>
*/
public class ExtractReuters
{
private static Set<String> NESTED_TAGS = new HashSet<>(
Arrays.asList(new String[] { "TOPICS" }));
private static Pattern EXTRACTION_PATTERN = Pattern
.compile(
" (LEWISSPLIT)=\"(.*?)\"|(CGISPLIT)=\"(.*?)\"|(OLDID)=\"(.*?)\"|(NEWID)=\"(.*?)\"|<(TITLE)>(.*?)</TITLE>|<(DATE)>(.*?)</DATE>|<(BODY)>(.*?)</BODY>|<(TOPICS)>(.*?)</TOPICS>|<(PLACES)>(.*?)</PLACES>|<(PEOPLE)>(.*?)</PEOPLE>|<(ORGS)>(.*?)</ORGS>|<(EXCHANGES)>(.*?)</EXCHANGES>|<(COMPANIES)>(.*?)</COMPANIES>|<(UNKNOWN)>(.*?)</UNKNOWN>|<(DATELINE)>(.*?)</DATELINE>");
private static Pattern NESTED_EXTRACTION_PATTERN = Pattern.compile("<D>(.*?)</D>");
private static String[] META_CHARS = { "&", "<", ">", "\"", "'" };
private static String[] META_CHARS_SERIALIZATIONS = { "&", "<", ">", """,
"'" };
/**
* Read all the SGML file in the given directory.
*
* @param reutersDir
* the directory that contains the Reuters SGML files.
* @return a list of {@link ReutersDocument}s
* @throws IOException
* if any of the files cannot be read.
* @throws ParseException
* if there was a problem parsing a date
*/
public static List<ReutersDocument> extract(Path reutersDir)
throws IOException, ParseException
{
List<ReutersDocument> docs = new ArrayList<>();
DirectoryStream<Path> stream = Files.newDirectoryStream(reutersDir, "*.sgm");
for (Path sgmFile : stream) {
InputStream inputStream = Files.newInputStream(sgmFile);
docs.addAll(extractFile(inputStream, sgmFile.toUri()));
}
return docs;
}
/**
* Read the documents out of a single file. Each file contains approximately 1000 documents.
*
* @param sgmFile
* an {@link InputStream} of a Reuters SGML file.
* @param uri
* an {@link URI} pointing to the original SGML file location
* @return a list of {@link ReutersDocument}s extracted from the input stream
* @throws IOException
* if any of the files cannot be read.
* @throws ParseException
* if there was a problem parsing a date
*/
public static List<ReutersDocument> extractFile(InputStream sgmFile, URI uri)
throws IOException, ParseException
{
BufferedReader reader = new BufferedReader(
new InputStreamReader(sgmFile, StandardCharsets.ISO_8859_1));
List<ReutersDocument> entries = new ArrayList<>(); // collection of all documents in file
StringBuilder docBuffer = new StringBuilder(1024); // text of current document
String line;
while ((line = reader.readLine()) != null) {
// when we see a closing reuters tag, flush the file
if (!line.contains("</REUTERS")) {
/* document continuing */
docBuffer.append(line).append(' ');// accumulate the strings for now,
// then apply regular expression to
// get the pieces,
}
else {
/* document end reached in input file, parse content */
ReutersDocument reutersDocument = new ReutersDocument();
// Extract the relevant pieces and write to a map representing the document
Matcher matcher = EXTRACTION_PATTERN.matcher(docBuffer);
while (matcher.find()) {
/* iterate over outer tags */
for (int i = 1; i <= matcher.groupCount(); i += 2) {
if (matcher.group(i) != null) {
String tag = matcher.group(i).trim();
String value = matcher.group(i + 1).trim();
/* replace SGML characters */
for (int j = 0; j < META_CHARS_SERIALIZATIONS.length; j++) {
value = value
.replaceAll(META_CHARS_SERIALIZATIONS[j], META_CHARS[j]);
}
/* extract value(s) */
if (NESTED_TAGS.contains(tag)) {
extractNested(reutersDocument, tag, value);
}
else {
reutersDocument.set(tag, value);
}
}
}
}
/* add metadata information for current doc */
reutersDocument.setPath(uri);
entries.add(reutersDocument);
/* reset document buffer */
docBuffer.setLength(0);
}
}
return entries;
}
/**
* Find the {@code <D>} tags that are nested within another tag and add them to the given {@link ReutersDocument}.
*
* @param doc the current document represented as a {@link ReutersDocument}.
* @param tag the outer tag, e.g. {@code <TOPICS>}
* @param text the value of the outer tag from which nested tags are extracted
*/
private static void extractNested(ReutersDocument doc, String tag, String text)
throws ParseException
{
Matcher nestedMatcher = NESTED_EXTRACTION_PATTERN.matcher(text);
while (nestedMatcher.find()) {
/* iterate over <D> tags */
for (int j = 1; j <= nestedMatcher.groupCount(); j++) {
String d = nestedMatcher.group(j).trim();
doc.set(tag, d);
}
}
}
}