Skip to content

Commit a4c2c1c

Browse files
committed
PVQ-4179 Fixed checking MCIDs in nested tags and tags with variable content marks
1 parent 3255a1b commit a4c2c1c

File tree

8 files changed

+244
-39
lines changed

8 files changed

+244
-39
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,11 @@ Operations:
3434
Arguments:
3535
-i <file> : Path to a single PDF file to validate.
3636
-d <folder> : Path to a directory of PDF files to validate (will process all PDFs in the folder).
37+
38+
Return Codes:
39+
success with no duplicate MCIDs: 0 - no duplicate MCIDs
40+
success with found duplicate MCIDs: count of invalid duplicate MCIDs (maximum 100)
41+
error: 101 and higher - an error occurred; check the message in System.err
3742
```
3843
3944
## Run the CLI Commands

config.json

Lines changed: 100 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,106 @@
2222
],
2323
"returnCodes": [
2424
0,
25-
1
25+
1,
26+
2,
27+
3,
28+
4,
29+
5,
30+
6,
31+
7,
32+
8,
33+
9,
34+
10,
35+
11,
36+
12,
37+
13,
38+
14,
39+
15,
40+
16,
41+
17,
42+
18,
43+
19,
44+
20,
45+
21,
46+
22,
47+
23,
48+
24,
49+
25,
50+
26,
51+
27,
52+
28,
53+
29,
54+
30,
55+
31,
56+
32,
57+
33,
58+
34,
59+
35,
60+
36,
61+
37,
62+
38,
63+
39,
64+
40,
65+
41,
66+
42,
67+
43,
68+
44,
69+
45,
70+
46,
71+
47,
72+
48,
73+
49,
74+
50,
75+
51,
76+
52,
77+
53,
78+
54,
79+
55,
80+
56,
81+
57,
82+
58,
83+
59,
84+
60,
85+
61,
86+
62,
87+
63,
88+
64,
89+
65,
90+
66,
91+
67,
92+
68,
93+
69,
94+
70,
95+
71,
96+
72,
97+
73,
98+
74,
99+
75,
100+
76,
101+
77,
102+
78,
103+
79,
104+
80,
105+
81,
106+
82,
107+
83,
108+
84,
109+
85,
110+
86,
111+
87,
112+
88,
113+
89,
114+
90,
115+
91,
116+
92,
117+
93,
118+
94,
119+
95,
120+
96,
121+
97,
122+
98,
123+
99,
124+
100
26125
],
27126
"stdout": "${output_txt}",
28127
"program": "java -jar \"${action_path}/net.pdfix.validate-pdf-0.0.0.jar\" duplicate-mcid -i \"${input_pdf}\"",

pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
<dependency>
4242
<groupId>net.pdfix</groupId>
4343
<artifactId>net.pdfix.pdfixlib</artifactId>
44-
<version>8.4.3</version>
44+
<version>8.7.3</version>
4545
</dependency>
4646
</dependencies>
4747

@@ -122,7 +122,7 @@
122122
<groupId>net.pdfix</groupId>
123123
<artifactId>net.pdfix.pdfixlib</artifactId>
124124
<version>${project.version}</version>
125-
<file>${pom.basedir}/lib/net.pdfix.pdfixlib-8.4.3.jar</file>
125+
<file>${pom.basedir}/lib/net.pdfix.pdfixlib-8.7.3.jar</file>
126126
<packaging>jar</packaging>
127127
<generatePom>true</generatePom>
128128
</configuration>

resources/test1.pdf

751 KB
Binary file not shown.

src/main/java/net/pdfix/App.java

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import java.io.File;
44
import java.io.FileInputStream;
5+
import java.io.FileNotFoundException;
56
import java.io.IOException;
67
import java.nio.file.Files;
78
import java.nio.file.Paths;
@@ -15,6 +16,13 @@ public class App {
1516
private static String VERSION = "1.0.0";
1617
private static String APP_NAME = "Validate PDF Accessibility";
1718

19+
// Success codes: 0-100 for the count of duplicate MCIDs found
20+
// Error codes: 101+ to avoid conflict with duplicate MCID counts
21+
private static final int ERROR_GENERAL = 101;
22+
23+
// Maximum count we can return as exit code is 100
24+
private static final int MAX_EXIT_CODE = 100; // maximum number of errors returned as an exit code
25+
1826
private static void displayVersion() {
1927
Properties properties = new Properties();
2028
try {
@@ -65,13 +73,13 @@ private static List<File> collectFiles(String directoryPath) {
6573
return fileList;
6674
}
6775

68-
private static void processFile(File file) throws Exception {
76+
private static int processFile(File file) throws Exception {
6977
// Process single file
7078
System.out.println("File: " + file.getPath() + "");
7179

7280
if (!isPDFFile(file.getAbsolutePath())) {
7381
System.out.println("Not a PDF file");
74-
return;
82+
return 0;
7583
}
7684

7785
int count = FindDuplicateMcid.checkDuplicateMcid(file.getAbsolutePath());
@@ -80,6 +88,7 @@ private static void processFile(File file) throws Exception {
8088
} else {
8189
System.out.println(String.format("Total %d duplicate MCID(s) found", count));
8290
}
91+
return count;
8392
}
8493

8594
private static String OP_DUPLICATE_MCID = "OP_DUPLICATE_MCID";
@@ -152,21 +161,25 @@ public int compare(File f1, File f2) {
152161
}
153162
});
154163

164+
int count = 0;
165+
155166
// Process each file
156167
for (File file : fileList) {
157168
System.out.println("===============================================================================");
158169
try {
159170
if (op == OP_DUPLICATE_MCID) {
160-
processFile(file);
171+
count += processFile(file);
161172
}
162173
} catch (Exception e) {
163-
System.out.println(e.getLocalizedMessage());
174+
System.err.println(e.getLocalizedMessage());
164175
}
165176
System.out.println("===============================================================================\n");
166177
}
167178
System.out.println("Process complete");
179+
System.exit(Math.min(count, MAX_EXIT_CODE));
168180
} catch (Exception e) {
169-
System.out.println(e.getLocalizedMessage());
181+
System.err.println(e.getLocalizedMessage());
182+
System.exit(ERROR_GENERAL);
170183
}
171184
}
172185
}

src/main/java/net/pdfix/FindDuplicateMcid.java

Lines changed: 58 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,17 @@
11
package net.pdfix;
22

3+
import java.io.File;
4+
import java.io.FileNotFoundException;
35
import java.nio.channels.NonReadableChannelException;
46
import java.util.ArrayList;
57
import java.util.List;
68

79
import net.pdfix.pdfixlib.*;
810

911
public class FindDuplicateMcid {
12+
private static int kReportTypeDplicateMcid = 1;
13+
private static int kReportTypeArtifactMcid = 2;
14+
1015
// Helper function to get a readable object type
1116
private static String getNiceObjType(PdfPageObjectType type) {
1217
switch (type) {
@@ -42,8 +47,13 @@ private static String getObjContent(PdsPageObject obj) {
4247
return info.toString();
4348
}
4449

45-
public static void reportMcid(int pageNum, PdsPageObject obj, int index, int mcid) {
46-
System.out.println("Duplicate MCID Found:");
50+
public static void reportMcid(int pageNum, PdsPageObject obj, int index, int mcid, int reportType) {
51+
// report type
52+
if (reportType == kReportTypeDplicateMcid) {
53+
System.out.println("Error: Duplicate MCID found");
54+
} else if (reportType == kReportTypeArtifactMcid) {
55+
System.out.println("Warning: Artifact with MCID found");
56+
}
4757
String objType = getNiceObjType(obj.GetObjectType());
4858
String objBBox = getObjBBox(obj);
4959
String objContent = getObjContent(obj);
@@ -63,11 +73,40 @@ public static void reportMcid(int pageNum, PdsPageObject obj, int index, int mci
6373
System.out.println(info.toString());
6474
}
6575

76+
private static Boolean compareContentMarkMCID(PdsPageObject obj1, PdsPageObject obj2) {
77+
if (obj1 == obj2) {
78+
return true;
79+
}
80+
if ((obj1 == null) || (obj2 == null)) {
81+
return false;
82+
}
83+
PdsContentMark cm1 = obj1.GetContentMark();
84+
PdsContentMark cm2 = obj2.GetContentMark();
85+
86+
// compare content mark index with MCID
87+
if (cm1.GetTagMcid() != cm2.GetTagMcid()) {
88+
return false;
89+
}
90+
91+
// compare content mark names; names at each index must be equal
92+
for (int i = 0; i <= cm1.GetTagMcid(); i++) {
93+
if (cm1.GetTagName(i).compareTo(cm2.GetTagName(i)) != 0) {
94+
return false;
95+
}
96+
}
97+
return true;
98+
}
99+
66100
// Check for duplicate MCIDs in a PDF file. Return the number of dulicate mcids
67101
// found
68102
public static int checkDuplicateMcid(String path) throws Exception {
69103
Pdfix pdfix = new Pdfix();
70104

105+
File file = new File(path);
106+
if (!file.exists()) {
107+
throw new FileNotFoundException(path);
108+
}
109+
71110
PdfDoc doc = pdfix.OpenDoc(path, "");
72111
if (doc == null) {
73112
throw new RuntimeException(pdfix.GetError());
@@ -89,34 +128,37 @@ public static int checkDuplicateMcid(String path) throws Exception {
89128
}
90129

91130
int lastMcid = -1;
131+
PdsPageObject lastObject = null;
92132
List<Integer> mcids = new ArrayList<Integer>();
93-
PdsPageObject lastObj = null;
94133
for (int j = 0; j < content.GetNumObjects(); j++) {
95134
PdsPageObject obj = content.GetObject(j);
135+
PdsContentMark contentMark = obj.GetContentMark();
96136
int mcid = obj.GetMcid();
97-
if ((mcid != -1) && (mcid == lastMcid)) {
98-
// content marks must be equal for equal mcid
99-
if (lastObj != null) {
100-
if (obj.GetNumEqualTags(lastObj) != obj.GetContentMark().GetNumTags()) {
101-
reportMcid(i, obj, j, mcid);
102-
found++;
103-
}
104-
}
105-
} else if (mcid != lastMcid) {
137+
Boolean isArtifact = (contentMark.GetTagArtifact() != -1);
138+
139+
// reports following options:
140+
// Error: duplicate MCID in tagged content (second MCID occurrence can be in tagged content or artifact)
141+
// Warning: MCID set for Artifact (it may be used in tag tree)
142+
143+
if ((mcid != lastMcid) || ((mcid != -1) && (lastObject != null) && (!compareContentMarkMCID(obj, lastObject)))) {
106144
lastMcid = mcid;
107145
if (mcid == -1) {
108146
continue;
109147
}
110-
111148
if (mcids.contains(mcid)) {
112-
reportMcid(i, obj, j, mcid);
149+
reportMcid(i, obj, j, mcid, kReportTypeDplicateMcid);
113150
found++;
114151
}
115152
mcids.add(mcid);
116153
}
117-
lastObj = obj;
154+
if (isArtifact && (mcid != -1)) {
155+
if (mcid != -1) {
156+
reportMcid(i, obj, j, mcid, kReportTypeArtifactMcid);
157+
}
158+
lastMcid = -1;
159+
}
160+
lastObject = obj;
118161
}
119-
120162
page.Release();
121163
}
122164

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,37 @@
11
package net.pdfix;
22

3+
import java.util.AbstractMap;
4+
import java.util.ArrayList;
5+
import java.util.List;
6+
37
import org.junit.jupiter.api.Test;
48

59
public class FindDuplicateMcidTest {
610

7-
// Test validation of dulicate MCID
8-
// test.pdf contains a duplicate MCID '0' with content 'Multi-Platform PDF Library SDK'
11+
// Test validation of duplicate MCID
12+
// test.pdf contains a duplicate MCID '0' with content 'Multi-Platform PDF
13+
// Library SDK'
914
// on top of the page
1015

1116
@Test
1217
void testDuplicateMcid() throws Exception {
18+
19+
// list of files to test. Each item contains path to the file and number of
20+
// expected errors found
21+
List<AbstractMap.SimpleEntry<String, Integer>> testFiles = new ArrayList<>();
22+
testFiles.add(new AbstractMap.SimpleEntry<>("/resources/test.pdf", 1));
23+
testFiles.add(new AbstractMap.SimpleEntry<>("/resources/test1.pdf", 40));
24+
1325
String basePath = System.getProperty("user.dir"); // path to current folder
14-
String pdfPath = basePath + "/resources/test.pdf";
15-
int numberOfDuplicities = FindDuplicateMcid.checkDuplicateMcid(pdfPath);
16-
if (numberOfDuplicities != 5) {
17-
throw new Exception("testDuplicateMcid Failed - Expected 5 duplicate MCIDs, found " + numberOfDuplicities);
26+
27+
for (AbstractMap.SimpleEntry<String, Integer> entry : testFiles) {
28+
String pdfPath = basePath + entry.getKey();
29+
System.out.println(pdfPath);
30+
int numberOfDuplicities = FindDuplicateMcid.checkDuplicateMcid(pdfPath);
31+
if (numberOfDuplicities != entry.getValue()) {
32+
throw new Exception(String.format("testDuplicateMcid Failed - Expected %d duplicate MCIDs, found %d",
33+
entry.getValue(), numberOfDuplicities));
34+
}
1835
}
1936
}
2037
}

0 commit comments

Comments
 (0)