12
Jul

Extracting images from a PDF File – org.apache.pdfbox

Maven dependency configuration

<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.8</version>
</dependency>

This application extracts images from a PDF document. We loop over each page and get its resources. Next, we iterate over each XObject in those resources, keep only the ones that are images, and write each image to a PNG file.


import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import javax.imageio.ImageIO;
import java.io.File;
import java.io.IOException;
/**
 * Extracts the image XObjects referenced directly from each page's resources
 * in a PDF document and writes every one of them as a PNG file under
 * {@link #OUTPUT_DIR}.
 *
 * <p>Only XObjects listed in a page's own resources are inspected; images
 * nested inside other XObjects (for example form XObjects) are not visited.
 */
public class ExtractImages {
    /** Path prefix (including the trailing separator) for the extracted PNG files. */
    private static final String OUTPUT_DIR = "/tmp/";

    /**
     * Loads the PDF, walks all pages, and saves each image XObject as
     * {@code extracted-image-<n>.png}, where {@code n} counts images across
     * the whole document starting at 1.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if a non-I/O failure occurs; I/O failures are caught
     *                   and reported on standard error
     */
    public static void main(String[] args) throws Exception {
        try (final PDDocument document = PDDocument.load(new File("c:\\abc.pdf"))) {
            // The counter lives outside the page loop so file names stay unique
            // across pages; resetting it per page would overwrite earlier images.
            int i = 1;
            for (PDPage page : document.getPages()) {
                PDResources pdResources = page.getResources();
                if (pdResources == null) {
                    // getResources() can return null when a page has no resource
                    // dictionary; such a page has no images to extract.
                    continue;
                }
                for (COSName name : pdResources.getXObjectNames()) {
                    PDXObject o = pdResources.getXObject(name);
                    if (!(o instanceof PDImageXObject)) {
                        continue;
                    }
                    PDImageXObject image = (PDImageXObject) o;
                    String filename = OUTPUT_DIR + "extracted-image-" + i + ".png";
                    // ImageIO.write returns false when no suitable writer is found;
                    // report it instead of silently skipping the image.
                    if (ImageIO.write(image.getImage(), "png", new File(filename))) {
                        i++;
                    } else {
                        System.err.println("No PNG writer available for image " + name.getName());
                    }
                }
            }
        } catch (IOException e) {
            System.err.println("Exception while trying to create pdf document - " + e);
        }
    }
}