package com.aspose.pdf.examples.AsposePdfExamples.Text;
import java.io.IOException;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

import com.aspose.pdf.Document;
import com.aspose.pdf.TextAbsorber;
import com.aspose.pdf.TextExtractionOptions;
import com.aspose.pdf.TextFragment;
import com.aspose.pdf.TextFragmentAbsorber;
import com.aspose.pdf.TextFragmentCollection;
/**
 * Example showing two ways of extracting text from a PDF with Aspose.PDF:
 * shrinking every text fragment's font before extraction, and setting a scale
 * factor on the text extraction options.
 */
public class ExtractTextBasedOnColumns {

    /** Factor applied to every fragment's font size (fonts end up at 70% of their original size). */
    private static final float FONT_SCALE = 0.7f;

    /** Intermediate PDF holding the font-reduced copy; written to the working directory. */
    private static final String TEMP_PDF = "TempOutput.pdf";

    /** Text file that receives the extracted text; written to the working directory. */
    private static final String OUTPUT_TEXT = "Extracted_text.txt";

    /**
     * Runs both extraction examples in sequence.
     *
     * @param args ignored
     * @throws IOException if the extracted text cannot be written
     */
    public static void main(String[] args) throws IOException {
        extractTextBasedOnColumns();
        usingSetScaleFactorMethod();
    }

    /**
     * Scales every text fragment's font to 70% of its size, saves the result to a
     * temporary PDF, re-opens that PDF, extracts its text and writes the text to
     * {@code Extracted_text.txt} encoded as UTF-8.
     *
     * @throws IOException if the output text file cannot be written
     */
    public static void extractTextBasedOnColumns() throws IOException {
        // Placeholder: replace with the input directory, including a trailing
        // separator, because the file name is appended directly to it.
        String path = "PathToDir";

        Document sourceDocument = new Document(path + "net_New-age NED's.pdf");
        try {
            // Collect every text fragment of every page.
            TextFragmentAbsorber fragmentAbsorber = new TextFragmentAbsorber();
            sourceDocument.getPages().accept(fragmentAbsorber);
            TextFragmentCollection fragments = fragmentAbsorber.getTextFragments();

            // Shrink each fragment's font; presumably this keeps text of
            // neighbouring columns apart during extraction -- confirm against
            // the Aspose documentation.
            for (TextFragment fragment : (Iterable<TextFragment>) fragments) {
                float originalSize = fragment.getTextState().getFontSize();
                fragment.getTextState().setFontSize(originalSize * FONT_SCALE);
            }

            // Persist the modified document so it can be re-read for extraction.
            sourceDocument.save(TEMP_PDF);
        } finally {
            sourceDocument.close();
        }

        String extractedText;
        Document reducedDocument = new Document(TEMP_PDF);
        try {
            TextAbsorber textAbsorber = new TextAbsorber();
            reducedDocument.getPages().accept(textAbsorber);
            extractedText = textAbsorber.getText();
        } finally {
            reducedDocument.close();
        }

        // try-with-resources closes the writer even when write() fails; the
        // charset is explicit so non-ASCII text does not depend on the platform default.
        try (Writer writer = Files.newBufferedWriter(Paths.get(OUTPUT_TEXT), StandardCharsets.UTF_8)) {
            writer.write(extractedText);
        }
    }

    /**
     * Extracts text from {@code inputFile.pdf} in pure formatting mode with a
     * scale factor of 0.5 and prints the extracted text to standard output.
     */
    public static void usingSetScaleFactorMethod() {
        Document pdfDocument = new Document("inputFile.pdf");
        try {
            TextAbsorber textAbsorber = new TextAbsorber();
            textAbsorber.setExtractionOptions(
                    new TextExtractionOptions(TextExtractionOptions.TextFormattingMode.Pure));
            // A scale factor of 0.5 is enough to split columns in the majority of documents.
            // A value of zero lets the algorithm choose the scale factor automatically.
            textAbsorber.getExtractionOptions().setScaleFactor(0.5);
            pdfDocument.getPages().accept(textAbsorber);

            // Emit the result so the example has an observable outcome.
            System.out.println(textAbsorber.getText());
        } finally {
            pdfDocument.close();
        }
    }
}