Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 14 additions & 19 deletions src/main/java/com/mindee/image/ImageExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import com.mindee.geometry.PositionDataField;
import com.mindee.input.InputSourceUtils;
import com.mindee.input.LocalInputSource;
import com.mindee.pdf.PDFUtils;
import com.mindee.pdf.PDFBoxApi;
import com.mindee.pdf.PDFOperation;
import com.mindee.pdf.PdfPageImage;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
Expand All @@ -21,29 +22,13 @@ public class ImageExtractor {
private final String filename;
private final String saveFormat;

/**
* Init from a path.
*
* <p>Wraps the path in a new {@link LocalInputSource} and delegates to the constructor
* taking a {@link LocalInputSource}.
*
* @param filePath Path to the file.
* @throws IOException Throws if the file can't be accessed.
*/
public ImageExtractor(String filePath) throws IOException {
this(new LocalInputSource(filePath));
}

/**
* Init from a {@link LocalInputSource}.
*
* @param source The local source.
* @throws IOException Throws if the file can't be accessed.
*/
public ImageExtractor(LocalInputSource source) throws IOException {
public ImageExtractor(LocalInputSource source, PDFOperation pdfOperation) throws IOException {
this.filename = source.getFilename();
this.pageImages = new ArrayList<>();

if (source.isPdf()) {
this.saveFormat = "jpg";
var pdfPageImages = PDFUtils.pdfToImages(source);
var pdfPageImages = pdfOperation.pdfToImages(source);
for (PdfPageImage pdfPageImage : pdfPageImages) {
this.pageImages.add(pdfPageImage.getImage());
}
Expand All @@ -56,6 +41,16 @@ public ImageExtractor(LocalInputSource source) throws IOException {
}
}

/**
* Init from a {@link LocalInputSource}, using a new {@link PDFBoxApi} instance as the
* PDF operation implementation.
*
* @param source The local source.
* @throws IOException Throws if the file can't be accessed.
*/
public ImageExtractor(LocalInputSource source) throws IOException {
// Delegate to the constructor that accepts an explicit PDFOperation.
this(source, new PDFBoxApi());
}

/**
* Get the number of pages in the file.
*
Expand Down
6 changes: 2 additions & 4 deletions src/main/java/com/mindee/input/LocalInputSource.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
import com.mindee.pdf.PDFBoxApi;
import com.mindee.pdf.PDFCompressor;
import com.mindee.pdf.PDFOperation;
import com.mindee.pdf.PDFUtils;
import com.mindee.pdf.SplitQuery;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
Expand Down Expand Up @@ -76,7 +74,7 @@ public int getPageCount() throws IOException {
if (!this.isPdf()) {
return 1;
}
return PDFUtils.getNumberOfPages(this.file);
return getPdfOperation().getNumberOfPages(this.file);
}

/**
Expand All @@ -87,7 +85,7 @@ public int getPageCount() throws IOException {
*/
public void applyPageOptions(PageOptions pageOptions) throws IOException {
if (pageOptions != null && this.isPdf()) {
this.file = getPdfOperation().split(new SplitQuery(this.file, pageOptions)).getFile();
this.file = getPdfOperation().split(this.file, pageOptions).getFile();
}
}

Expand Down
66 changes: 64 additions & 2 deletions src/main/java/com/mindee/pdf/BasePDFExtractor.java
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
package com.mindee.pdf;

import static com.mindee.pdf.PDFUtils.mergePdfPages;

import com.mindee.MindeeException;
import com.mindee.input.InputSourceUtils;
import com.mindee.input.LocalInputSource;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import javax.imageio.ImageIO;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
Expand Down Expand Up @@ -112,4 +114,64 @@ public List<ExtractedPDF> extractSubDocuments(
}
return extractedPDFs;
}

/**
 * Build a new page from a dictionary constructed out of the given page's dictionary,
 * with the annotations entry removed.
 *
 * @param page The page to duplicate.
 * @return A new page backed by the new dictionary.
 */
private static PDPage clonePage(PDPage page) {
  var duplicatedDict = new COSDictionary(page.getCOSObject());
  // The annotations entry is dropped from the duplicate only; the source page is untouched.
  duplicatedDict.removeItem(COSName.ANNOTS);
  return new PDPage(duplicatedDict);
}

/**
 * Create a new PDF made of the requested pages of an existing document.
 *
 * <p>All resources opened here are released even when copying or saving fails, and the
 * original document is closed on every exit path when {@code closeOriginal} is set.
 *
 * @param document The source document.
 * @param pageNumbers Zero-based page indexes to copy, in the given order. Indexes greater
 *     than or equal to the source page count are ignored.
 * @param closeOriginal Whether to close {@code document} before returning.
 * @return The bytes of the newly created PDF.
 * @throws IOException Throws if the new PDF can't be saved or a document can't be closed.
 */
private static byte[] createPdfFromExistingPdf(
    PDDocument document,
    List<Integer> pageNumbers,
    boolean closeOriginal
) throws IOException {
  try (
      var outputStream = new ByteArrayOutputStream();
      var newDocument = new PDDocument()
  ) {
    int pageCount = document.getNumberOfPages();
    pageNumbers
      .stream()
      .filter(i -> i < pageCount)
      .forEach(i -> newDocument.addPage(clonePage(document.getPage(i))));

    newDocument.save(outputStream);
    return outputStream.toByteArray();
  } finally {
    // Runs after the new document is closed, so the source outlives every use of its pages.
    if (closeOriginal) {
      document.close();
    }
  }
}

/**
 * Merge specified PDF pages together.
 *
 * <p>The file is loaded into a document that is always closed before returning, including
 * when the merge fails.
 *
 * @param file The PDF file.
 * @param pageNumbers List of page numbers to merge together.
 * @return The bytes of the merged PDF.
 * @throws IOException Throws if the file can't be loaded or the merged PDF can't be saved.
 */
public static byte[] mergePdfPages(File file, List<Integer> pageNumbers) throws IOException {
  // This method owns the document, so it closes it itself instead of delegating the close.
  try (PDDocument document = Loader.loadPDF(file)) {
    return createPdfFromExistingPdf(document, pageNumbers, false);
  }
}

/**
 * Merge specified pages of an already loaded document, asking for the document to be
 * closed afterwards.
 *
 * @param document The loaded PDF document.
 * @param pageNumbers List of page numbers to merge together.
 * @return The bytes of the merged PDF.
 * @throws IOException Throws if the merged PDF can't be produced.
 */
public static byte[] mergePdfPages(
    PDDocument document,
    List<Integer> pageNumbers
) throws IOException {
  final boolean closeOriginal = true;
  return mergePdfPages(document, pageNumbers, closeOriginal);
}

/**
 * Merge specified pages of an already loaded document.
 *
 * @param document The loaded PDF document.
 * @param pageNumbers List of page numbers to merge together.
 * @param closeOriginal Whether {@code document} should be closed once the merge is done.
 * @return The bytes of the merged PDF.
 * @throws IOException Throws if the merged PDF can't be produced.
 */
public static byte[] mergePdfPages(
    PDDocument document,
    List<Integer> pageNumbers,
    boolean closeOriginal
) throws IOException {
  byte[] mergedPdf = createPdfFromExistingPdf(document, pageNumbers, closeOriginal);
  return mergedPdf;
}
}
77 changes: 65 additions & 12 deletions src/main/java/com/mindee/pdf/PDFBoxApi.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import com.mindee.MindeeException;
import com.mindee.input.PageOptions;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
Expand All @@ -13,28 +14,31 @@
import java.util.stream.IntStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;

/**
* Allows performing various operations on PDFs.
*/
public final class PDFBoxApi implements PDFOperation {

@Override
public SplitPDF split(SplitQuery splitQuery) throws IOException {
public SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException {

if (!checkPdfOpen(splitQuery.getFile())) {
if (!checkPdfOpen(fileBytes)) {
throw new MindeeException("This document cannot be open and cannot be split.");
}

try (var originalDocument = Loader.loadPDF(splitQuery.getFile())) {
try (var originalDocument = Loader.loadPDF(fileBytes)) {
try (var splitDocument = new PDDocument()) {
int totalOriginalPages = countPages(splitQuery.getFile());
int totalOriginalPages = getNumberOfPages(fileBytes);

if (totalOriginalPages < splitQuery.getPageOptions().getOnMinPages()) {
return new SplitPDF(splitQuery.getFile(), totalOriginalPages);
if (totalOriginalPages < pageOptions.getOnMinPages()) {
return new SplitPDF(fileBytes, totalOriginalPages);
}

var pageRange = getPageRanges(splitQuery.getPageOptions(), totalOriginalPages);
var pageRange = getPageRanges(pageOptions, totalOriginalPages);
pageRange
.stream()
.filter(i -> i < totalOriginalPages)
Expand All @@ -43,12 +47,65 @@ public SplitPDF split(SplitQuery splitQuery) throws IOException {
try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
splitDocument.save(outputStream);
byte[] splitPdf = outputStream.toByteArray();
return new SplitPDF(splitPdf, countPages(splitPdf));
return new SplitPDF(splitPdf, getNumberOfPages(splitPdf));
}
}
}
}

/**
 * Count the pages of a PDF.
 *
 * <p>The document is closed on every exit path.
 *
 * @param fileBytes The PDF file contents.
 * @return The number of pages in the document.
 * @throws IOException Throws if the bytes can't be loaded as a PDF.
 */
@Override
public int getNumberOfPages(byte[] fileBytes) throws IOException {
  try (var document = Loader.loadPDF(fileBytes)) {
    return document.getNumberOfPages();
  }
}

/**
 * Render a single page of a PDF as a "jpg" page image.
 *
 * <p>The document is closed on every exit path, including when rendering fails.
 *
 * @param fileBytes The PDF file contents.
 * @param filename The name of the source file, passed on to the resulting page image.
 * @param pageNumber The one-based number of the page to render.
 * @return The rendered page image, carrying the zero-based page index.
 * @throws IOException Throws if the PDF can't be loaded or the page can't be rendered.
 */
@Override
public PdfPageImage pdfPageToImage(
    byte[] fileBytes,
    String filename,
    int pageNumber
) throws IOException {
  // Callers count pages from 1; the page lookup and renderer below use a zero-based index.
  int index = pageNumber - 1;
  BufferedImage imageBuffer;
  try (PDDocument document = Loader.loadPDF(fileBytes)) {
    imageBuffer = pdfPageToImageBuffer(index, document, new PDFRenderer(document));
  }
  return new PdfPageImage(imageBuffer, index, filename, "jpg");
}

/**
 * Render every page of a PDF as a "jpg" page image.
 *
 * <p>The document is closed on every exit path, including when rendering a page fails.
 *
 * @param fileBytes The PDF file contents.
 * @param filename The name of the source file, passed on to each resulting page image.
 * @return One page image per page, in page order, each carrying its zero-based page index.
 * @throws IOException Throws if the PDF can't be loaded or a page can't be rendered.
 */
@Override
public List<PdfPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException {
  try (PDDocument document = Loader.loadPDF(fileBytes)) {
    var pdfRenderer = new PDFRenderer(document);
    int pageCount = document.getNumberOfPages();
    List<PdfPageImage> pdfPageImages = new ArrayList<>(pageCount);
    for (int i = 0; i < pageCount; i++) {
      var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
      pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg"));
    }
    return pdfPageImages;
  }
}

/**
 * Render one page to an RGB image, choosing the DPI from the page's bounding-box area.
 *
 * <p>An area (width times height) below 200000 renders at 300 DPI, below 300000 at
 * 250 DPI, and anything else at 200 DPI.
 *
 * @param index The zero-based index of the page to render.
 * @param document The document containing the page.
 * @param pdfRenderer The renderer used to produce the image.
 * @return The rendered page.
 * @throws IOException Throws if the page can't be rendered.
 */
private BufferedImage pdfPageToImageBuffer(
    int index,
    PDDocument document,
    PDFRenderer pdfRenderer
) throws IOException {
  PDRectangle pageBox = document.getPage(index).getBBox();
  float area = pageBox.getWidth() * pageBox.getHeight();
  int dpi = area < 200000
    ? 300
    : (area < 300000 ? 250 : 200);
  return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB);
}

private List<Integer> getPageRanges(PageOptions pageOptions, Integer numberOfPages) {

Set<Integer> pages = Optional
Expand Down Expand Up @@ -81,8 +138,4 @@ private boolean checkPdfOpen(byte[] documentFile) {
}
return opens;
}

private int countPages(byte[] documentFile) throws IOException {
return PDFUtils.getNumberOfPages(documentFile);
}
}
Loading
Loading