package uk.ac.shef.dcs.jate.util; import org.apache.commons.io.FileUtils; import org.apache.commons.lang.StringEscapeUtils; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; import org.apache.solr.common.SolrInputDocument; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.sax.BodyContentHandler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import uk.ac.shef.dcs.jate.JATEException; import uk.ac.shef.dcs.jate.JATEProperties; import uk.ac.shef.dcs.jate.model.JATEDocument; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import java.io.*; import java.nio.file.Files; import java.nio.file.LinkOption; import java.nio.file.Path; import java.text.Normalizer; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import static java.util.stream.Collectors.counting; import static java.util.stream.Collectors.groupingBy; public class JATEUtil { private static final Logger LOG = LoggerFactory.getLogger(JATEUtil.class); public static boolean isInteger(String s) { return isInteger(s, 10); } public static boolean isInteger(String s, int radix) { if (s.isEmpty()) return false; for (int i = 0; i < s.length(); i++) { if (i == 0 && s.charAt(i) == '-') { if (s.length() == 1) return false; else continue; } if (Character.digit(s.charAt(i), radix) < 0) return false; } return true; } /** * statistics of files and subdirectories * * @param corpusDir corpus directory path * @return Map {1|2: number} 1 stands for directory, 2 */ public static Map<Integer, Long> fileStatitics(Path corpusDir) throws IOException { Map<Integer, Long> stats = Files.walk(corpusDir) .parallel() .collect(groupingBy(n -> Files.isDirectory(n, LinkOption.NOFOLLOW_LINKS) ? 1 : 2, counting())); System.out.format("Files: %d, dirs: %d. ", stats.get(2), stats.get(1)); return stats; } public static JATEDocument loadACLRDTECDocument(InputStream fileInputStream) throws JATEException { return loadJATEDocFromXML(fileInputStream); } /** * load JATEDocument from any file. * * Raw text will be automatically extracted with Apache TIKA as document content * and Doc id will be set by file name. * * @param file any file format that is supported in Tika * @return JATEDocument return null if file name is null */ public static JATEDocument loadJATEDocument(Path file) throws JATEException { if (file.getFileName() == null){ return null; } String docId = file.getFileName().toString(); JATEDocument jateDocument = new JATEDocument(docId); jateDocument.setPath(file.toAbsolutePath().toString()); FileInputStream fileStream = null; try { fileStream = new FileInputStream(file.toFile()); jateDocument.setContent(parseToPlainText(fileStream)); return jateDocument; } catch (FileNotFoundException e) { throw new JATEException(String.format("File is not found from [%s]", file.toString())); } finally { if (fileStream != null) { try { fileStream.close(); } catch (IOException e) { e.printStackTrace(); } } } } /** * load ACL RD-TEC documents from raw text corpus * * @param rawTxtFile raw text file * @return JATEDocument */ public static JATEDocument loadACLRDTECDocumentFromRaw(File rawTxtFile) { JATEDocument jateDocument = new JATEDocument(rawTxtFile.toURI()); jateDocument.setId(rawTxtFile.getName()); StringBuilder rawTextBuffer = new StringBuilder(); try { List<String> lines = FileUtils.readLines(rawTxtFile); if (lines.size() > 0) { lines.forEach(rawTextBuffer::append); } } catch (IOException e) { e.printStackTrace(); } jateDocument.setContent(rawTextBuffer.toString()); return jateDocument; } /** * * @param fileInputStream ACL XML file * @return JATEDocument JATE Document Object * @throws JATEException */ private static JATEDocument loadJATEDocFromXML(InputStream fileInputStream) throws JATEException { SAXParserFactory factory = SAXParserFactory.newInstance(); SAXParser saxParser = null; JATEDocument jateDocument = null; try { saxParser = factory.newSAXParser(); StringBuffer paperParagraphs = new StringBuffer(); StringBuffer paperId = new StringBuffer(); StringBuffer paperTitle = new StringBuffer(); DefaultHandler handler = new DefaultHandler() { boolean paper = false; boolean title = false; boolean section = false; boolean sectionTitle = false; boolean paragraph = false; boolean reference = false; public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) throws SAXException { if (qName.equalsIgnoreCase("Paper")) { paper = true; paperId.append(attributes.getValue("id")); } //TODO: need to skip title of reference, test data:P06-1139_cln.xml if (qName.equalsIgnoreCase("title")) { title = true; } if (qName.equalsIgnoreCase("Section")) { section = true; } if (qName.equalsIgnoreCase("SectionTitle")) { sectionTitle = true; } if (qName.equalsIgnoreCase("Paragraph")) { paragraph = true; } if (qName.equalsIgnoreCase("Reference")) { reference = true; } } public void endElement(String uri, String localName, String qName) throws SAXException { if (qName.equalsIgnoreCase("Paragraph")) { paragraph = false; } } public void characters(char ch[], int start, int length) throws SAXException { if (paper) { paper = false; } if (title) { title = false; if (!reference) { paperTitle.append(new String(ch, start, length)).append("\n"); } reference = false; } if (section) { section = false; } if (sectionTitle) { sectionTitle = false; } if (paragraph) { String paragraph = new String(ch, start, length); paperParagraphs.append(paragraph); } } }; saxParser.parse(fileInputStream, handler); StringBuffer fullText = new StringBuffer(); fullText.append(paperTitle).append("\n").append(paperParagraphs); String normalizedText = Normalizer.normalize(fullText.toString(), Normalizer.Form.NFD); normalizedText = StringEscapeUtils.unescapeXml(normalizedText); String cleanedText = cleanText(normalizedText); jateDocument = new JATEDocument(paperId.toString()); jateDocument.setContent(cleanedText.trim()); } catch (ParserConfigurationException e) { throw new JATEException("Failed to initialise SAXParser!" + e.toString()); } catch (SAXException e) { throw new JATEException("Failed to initialise SAXParser!" + e.toString()); } catch (IOException ioe) { throw new JATEException("I/O Exception when parsing input file!" + ioe.toString()); } return jateDocument; } public static String cleanText(String normalizedText) { List<String> extractBrokenWords = extractBrokenWords(normalizedText); // extractBrokenWords.parallelStream().forEach((extractBrokenWord) -> // fixBrokenWords(normalizedParagraph, extractBrokenWord) ); String cleanedText = normalizedText; for (String extractBrokenWord : extractBrokenWords) { cleanedText = fixBrokenWords(cleanedText, extractBrokenWord); } return cleanedText; } /** * two regex pattern matching rules to extract broken words in ACL RD-TEC corpus caused by pdf converter * e.g., "P r e v i o u s" for "previous" * * @param paragraphs text * @return List<String> a list of matched "broken word" text */ public static List<String> extractBrokenWords(String paragraphs) { List<String> brokenWords = new ArrayList<>(); Pattern p1 = Pattern.compile("([A-Z]\\s([a-z]\\s){3,10})"); Matcher m = p1.matcher(paragraphs); while (m.find()) { brokenWords.add(m.group()); } Pattern p2 = Pattern.compile("([A-Z]\\s([A-Z]\\s){3,10})"); m = p2.matcher(paragraphs); while (m.find()) { brokenWords.add(m.group()); } return brokenWords; } /** * clean text for fixing broken words * * @param paragraph text * @param brokenWord matched broken word * @return String cleaned text */ public static String fixBrokenWords(String paragraph, String brokenWord) { return paragraph.replaceAll(brokenWord, brokenWord.replaceAll(" ", "").concat(" ")); } /** * load files from corpus directory recursively * * @param corpusDir corpus directory * @return List<Path> * @throws JATEException */ public static List<Path> loadFiles(Path corpusDir) throws JATEException { try { List<Path> files = new ArrayList<>(); Files.walk(corpusDir).parallel().forEach((n) -> { if (!Files.isDirectory(n, LinkOption.NOFOLLOW_LINKS)) { files.add(n); } }); return files; } catch (IOException e) { //LOG.error(String.format("Failed to access corpus path [%s]", corpusDir.toUri())); throw new JATEException(String.format("Failed to access corpus path [%s]", corpusDir.toUri())); } } public static String parseToPlainText(InputStream fileStream) { BodyContentHandler handler = new BodyContentHandler(); AutoDetectParser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); String rawContent = ""; try { parser.parse(fileStream, handler, metadata); rawContent = handler.toString(); } catch (IOException | SAXException | TikaException e) { LOG.debug("Parsing Exception while extracting content from current file. " + e.toString()); } return rawContent; } public static void addNewDoc(EmbeddedSolrServer server, String docId, String docTitle, String text, JATEProperties jateProperties, boolean commit) throws IOException, SolrServerException, JATEException { SolrInputDocument newDoc = new SolrInputDocument(); newDoc.addField("id", docId); newDoc.addField("title_s", docTitle); newDoc.addField("text", text); newDoc.addField(jateProperties.getSolrFieldNameJATENGramInfo(), text); newDoc.addField(jateProperties.getSolrFieldNameJATECTerms(), text); server.add(newDoc); if (commit) { server.commit(); } } }