Skip to content

Commit 939eff7

Browse files
committed
TIKA-4363: refactor
1 parent 7a784b3 commit 939eff7

File tree

1 file changed

+7
-7
lines changed
  • tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf

1 file changed

+7
-7
lines changed

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -139,7 +139,7 @@ public void close() {
139139
throw new TikaException("Unable to extract PDF content", e);
140140
}
141141
}
142-
if (pdfMarkedContent2XHTML.exceptions.size() > 0) {
142+
if (!pdfMarkedContent2XHTML.exceptions.isEmpty()) {
143143
//throw the first
144144
throw new TikaException("Unable to extract PDF content",
145145
pdfMarkedContent2XHTML.exceptions.get(0));
@@ -192,15 +192,15 @@ private static void findPages(COSBase kidsObj, List<ObjectRef> pageRefs) {
192192
}
193193

194194
@Override
195-
protected void processPages(PDPageTree pages) throws IOException {
195+
protected void processPages(PDPageTree pageTree) throws IOException {
196196

197197
//this is a 0-indexed list of object refs for each page
198198
//we need this to map the mcids later...
199199
//TODO: is there a better way of getting these/doing the mapping?
200200

201201
List<ObjectRef> pageRefs = new ArrayList<>();
202202
//STEP 1: get the page refs
203-
findPages(pdDocument.getPages().getCOSObject().getDictionaryObject(COSName.KIDS), pageRefs);
203+
findPages(pageTree.getCOSObject().getDictionaryObject(COSName.KIDS), pageRefs);
204204
//confirm the right number of pages was found
205205
if (pageRefs.size() != pdDocument.getNumberOfPages()) {
206206
throw new IOException(new TikaException(
@@ -215,7 +215,7 @@ protected void processPages(PDPageTree pages) throws IOException {
215215
Map<String, HtmlTag> roleMap = loadRoleMap(structureTreeRoot.getRoleMap());
216216

217217
//STEP 3: load all of the text, mapped to MCIDs
218-
Map<MCID, String> paragraphs = loadTextByMCID(pageRefs);
218+
Map<MCID, String> paragraphs = loadTextByMCID(pageTree, pageRefs);
219219

220220
//STEP 4: now recurse the structure tree root and output the structure
221221
//and the text bits from paragraphs
@@ -254,7 +254,7 @@ protected void processPages(PDPageTree pages) throws IOException {
254254
//TODO: figure out when we're crossing page boundaries during the recursion
255255
// step above and do the page by page processing then...rather than dumping this
256256
// all here.
257-
for (PDPage page : pdDocument.getPages()) {
257+
for (PDPage page : pageTree) {
258258
startPage(page);
259259
endPage(page);
260260
}
@@ -408,10 +408,10 @@ private HtmlTag getTag(String name, Map<String, HtmlTag> roleMap) {
408408
return roleMap.get(name);
409409
}
410410

411-
private Map<MCID, String> loadTextByMCID(List<ObjectRef> pageRefs) throws IOException {
411+
private Map<MCID, String> loadTextByMCID(PDPageTree pageTree, List<ObjectRef> pageRefs) throws IOException {
412412
int pageCount = 1;
413413
Map<MCID, String> paragraphs = new HashMap<>();
414-
for (PDPage page : pdDocument.getPages()) {
414+
for (PDPage page : pageTree) {
415415
ObjectRef pageRef = pageRefs.get(pageCount - 1);
416416
PDFMarkedContentExtractor ex = new PDFMarkedContentExtractor();
417417
try {

0 commit comments

Comments (0)