Skip to content

Commit 2f63758

Browse files
committed
DocClassification samples (#174)
1 parent cdd7980 commit 2f63758

File tree

7 files changed

+222
-2
lines changed

7 files changed

+222
-2
lines changed

Samples/DataExtractionTest/GO/DataExtraction_test.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,60 @@ func GenericKeyValueTest() (err error) {
302302
return nil
303303
}
304304

305+
//---------------------------------------------------------------------------------------
306+
// The following sample illustrates how to extract document classes from PDF documents.
307+
//---------------------------------------------------------------------------------------
308+
309+
// DocClassifierTest runs three document-classification samples through
// DataExtractionModuleExtractData using the DataExtractionModuleE_DocClassification engine:
// Invoice.pdf written straight to a JSON file, Scientific_Publication.pdf returned as a
// JSON string and saved with WriteTextToFile, and Email.pdf classified with a minimum
// confidence threshold of 0.7. Input and output locations are built from the inputPath
// and outputPath values declared outside this function.
//
// It returns nil when the classification module is not available (a notice is printed
// instead) and nil after the three samples complete.
func DocClassifierTest() (err error) {
310+
// NOTE(review): catch is defined outside this view; presumably it recovers a panic
// and stores it in the named result err - confirm against its definition.
defer catch(&err)
311+
312+
// Test if the add-on is installed. When it is not, print download/setup instructions
// and return nil without attempting any extraction (a missing module is not reported
// as an error to the caller).
313+
if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_DocClassification) {
314+
fmt.Println("")
315+
fmt.Println("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
316+
fmt.Println("-----------------------------------------------------------------------------")
317+
fmt.Println("The Data Extraction suite is an optional add-on, available for download")
318+
fmt.Println("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
319+
fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
320+
fmt.Println("using the PDFNetAddResourceSearchPath() function.")
321+
fmt.Println("")
322+
return nil
323+
}
324+
325+
// Simple example: classify pages and have the module write the result to a JSON file
// (both the input path and the output path are passed to the call).
326+
fmt.Println("Classify pages as a JSON file")
327+
328+
inputFile := inputPath + "Invoice.pdf"
329+
outputFile := outputPath + "Invoice_Classified.json"
330+
DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_DocClassification)
331+
332+
fmt.Println("Result saved in " + outputFile)
333+
334+
// Classify pages as a JSON string: no output path is passed to the call here, so the
// result is taken from the return value and written to outputFile by WriteTextToFile.
335+
fmt.Println("Classify pages as a JSON string")
336+
337+
inputFile = inputPath + "Scientific_Publication.pdf"
338+
outputFile = outputPath + "Scientific_Publication_Classified.json"
// The returned value is type-asserted to string; a non-string result would panic here.
339+
json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_DocClassification).(string)
340+
WriteTextToFile(outputFile, json)
341+
342+
fmt.Println("Result saved in " + outputFile)
343+
344+
// Example with customized options: the same file-output call as the first example,
// with a DataExtractionOptions value passed as the final argument.
345+
fmt.Println("Classify pages with customized options")
346+
347+
inputFile = inputPath + "Email.pdf"
348+
outputFile = outputPath + "Email_Classified.json"
349+
options := NewDataExtractionOptions()
350+
// Classes that don't meet the minimum confidence threshold of 70% (0.7) will not be listed in the output JSON
351+
options.SetMinimumConfidenceThreshold(0.7)
352+
DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_DocClassification, options)
353+
354+
fmt.Println("Result saved in " + outputFile)
355+
356+
return nil
357+
}
358+
305359
//---------------------------------------------------------------------------------------
306360

307361
func TestDataExtraction(t *testing.T) {
@@ -335,13 +389,22 @@ func TestDataExtraction(t *testing.T) {
335389
fmt.Println(fmt.Errorf("Unable to extract form fields data, error: %s", err))
336390
}
337391

392+
//-----------------------------------------------------------------------------------
393+
338394
err = GenericKeyValueTest()
339395
if err != nil {
340396
fmt.Println(fmt.Errorf("Unable to extract key-value pairs, error: %s", err))
341397
}
342398

343399
//-----------------------------------------------------------------------------------
344400

401+
err = DocClassifierTest()
402+
if err != nil {
403+
fmt.Println(fmt.Errorf("Unable to extract document classifications, error: %s", err))
404+
}
405+
406+
//-----------------------------------------------------------------------------------
407+
345408
PDFNetTerminate()
346409
fmt.Println("Done.")
347410
}

Samples/DataExtractionTest/PHP/DataExtractionTest.php

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ function main()
212212
}
213213

214214
//////////////////////////////////////////////////////////////////////////
215-
// The following sample illustrates how to extract document structure from PDF documents.
215+
// The following sample illustrates how to extract key-value pairs from PDF documents.
216216
//////////////////////////////////////////////////////////////////////////
217217

218218
// Test if the add-on is installed
@@ -239,7 +239,7 @@ function main()
239239
// Example with customized options:
240240
// Extract Keys & Values from pages 2-4, excluding ads
241241
$options = new DataExtractionOptions();
242-
$options->setPages("2-4");
242+
$options->SetPages("2-4");
243243

244244
$p2ExclusionZones = new RectCollection();
245245
// Exclude the ad on page 2
@@ -267,6 +267,58 @@ function main()
267267
}
268268
}
269269

270+
//////////////////////////////////////////////////////////////////////////
271+
// The following sample illustrates how to extract document classes from PDF documents.
272+
//////////////////////////////////////////////////////////////////////////
273+
274+
// Test if the add-on is installed
275+
if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocClassification)) {
276+
echo(nl2br("\n"));
277+
echo(nl2br("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.\n"));
278+
echo(nl2br("-----------------------------------------------------------------------------\n"));
279+
echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
280+
echo(nl2br("at https://docs.apryse.com/documentation/core/info/modules/. If you have already\n"));
281+
echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
282+
echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
283+
echo(nl2br("\n"));
284+
}
285+
else {
286+
try {
287+
// Simple example: classify pages as a JSON file
288+
echo(nl2br("Classify pages as a JSON file\n"));
289+
290+
$outputFile = $outputPath."Invoice_Classified.json";
291+
DataExtractionModule::ExtractData($inputPath."Invoice.pdf", $outputFile, DataExtractionModule::e_DocClassification);
292+
293+
echo(nl2br("Result saved in " . $outputFile . "\n"));
294+
295+
///////////////////////////////////////////////////////
296+
// Classify pages as a JSON string
297+
echo(nl2br("Classify pages as a JSON string\n"));
298+
299+
$outputFile = $outputPath."Scientific_Publication_Classified.json";
300+
$json = DataExtractionModule::ExtractData($inputPath."Scientific_Publication.pdf", DataExtractionModule::e_DocClassification);
301+
WriteTextToFile($outputFile, $json);
302+
303+
echo(nl2br("Result saved in " . $outputFile . "\n"));
304+
305+
///////////////////////////////////////////////////////
306+
// Example with customized options:
307+
echo(nl2br("Classify pages with customized options\n"));
308+
309+
$options = new DataExtractionOptions();
310+
// Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
311+
$options->SetMinimumConfidenceThreshold(0.7);
312+
$outputFile = $outputPath."Email_Classified.json";
313+
DataExtractionModule::ExtractData($inputPath."Email.pdf", $outputFile, DataExtractionModule::e_DocClassification, $options);
314+
315+
echo(nl2br("Result saved in " . $outputFile . "\n"));
316+
}
317+
catch(Exception $e) {
318+
echo(nl2br("Unable to extract document structure data, error: " . $e->getMessage() . "\n"));
319+
}
320+
}
321+
270322
//-----------------------------------------------------------------------------------
271323

272324
PDFNet::Terminate();

Samples/DataExtractionTest/PYTHON/DataExtractionTest.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,55 @@ def main():
252252
print("Unable to extract key-value data, error: " + str(e))
253253

254254

255+
#-----------------------------------------------------------------------------------
256+
# The following sample illustrates how to extract document classes from PDF documents.
257+
#-----------------------------------------------------------------------------------
258+
259+
# Test if the add-on is installed
260+
if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_DocClassification):
261+
print("")
262+
print("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
263+
print("-----------------------------------------------------------------------------")
264+
print("The Data Extraction suite is an optional add-on, available for download")
265+
print("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
266+
print("downloaded this module, ensure that the SDK is able to find the required files")
267+
print("using the PDFNet.AddResourceSearchPath() function.")
268+
print("")
269+
else:
270+
try:
271+
# Simple example: classify pages as a JSON file
272+
print("Classify pages as a JSON file")
273+
274+
outputFile = outputPath + "Invoice_Classified.json"
275+
DataExtractionModule.ExtractData(inputPath + "Invoice.pdf", outputFile, DataExtractionModule.e_DocClassification)
276+
277+
print("Result saved in " + outputFile)
278+
279+
#------------------------------------------------------
280+
# Classify pages as a JSON string
281+
print("Classify pages as a JSON string")
282+
283+
outputFile = outputPath + "Scientific_Publication_Classified.json"
284+
json = DataExtractionModule.ExtractData(inputPath + "Scientific_Publication.pdf", DataExtractionModule.e_DocClassification)
285+
WriteTextToFile(outputFile, json)
286+
287+
print("Result saved in " + outputFile)
288+
289+
#------------------------------------------------------
290+
# Example with customized options:
291+
print("Classify pages with customized options")
292+
293+
options = DataExtractionOptions()
294+
# Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
295+
options.SetMinimumConfidenceThreshold(0.7)
296+
outputFile = outputPath + "Email_Classified.json"
297+
DataExtractionModule.ExtractData(inputPath + "Email.pdf", outputFile, DataExtractionModule.e_DocClassification, options)
298+
299+
print("Result saved in " + outputFile)
300+
301+
except Exception as e:
302+
print("Unable to extract document structure data, error: " + str(e))
303+
255304
PDFNet.Terminate()
256305
print("Done.")
257306

Samples/DataExtractionTest/RUBY/DataExtractionTest.rb

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,11 @@ def main()
200200
end
201201
end
202202

203+
#-----------------------------------------------------------------------------------
204+
# The following sample illustrates how to extract key-value pairs from PDF documents.
205+
#-----------------------------------------------------------------------------------
206+
207+
# Test if the add-on is installed
203208
if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_GenericKeyValue) then
204209
puts ""
205210
puts "Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available."
@@ -244,6 +249,57 @@ def main()
244249
end
245250
end
246251

252+
#-----------------------------------------------------------------------------------
253+
# The following sample illustrates how to extract document classes from PDF documents.
254+
#-----------------------------------------------------------------------------------
255+
256+
# Test if the add-on is installed
257+
if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_DocClassification) then
258+
puts ""
259+
puts "Unable to run Data Extraction: PDFTron SDK Structured Output module not available."
260+
puts "-----------------------------------------------------------------------------"
261+
puts "The Data Extraction suite is an optional add-on, available for download"
262+
puts "at https://docs.apryse.com/documentation/core/info/modules/. If you have already"
263+
puts "downloaded this module, ensure that the SDK is able to find the required files"
264+
puts "using the PDFNet.AddResourceSearchPath() function."
265+
puts ""
266+
else
267+
begin
268+
# Simple example: classify pages as a JSON file
269+
puts "Classify pages as a JSON file"
270+
271+
outputFile = $outputPath + "Invoice_Classified.json"
272+
DataExtractionModule.ExtractData($inputPath + "Invoice.pdf", outputFile, DataExtractionModule::E_DocClassification)
273+
274+
puts "Result saved in " + outputFile
275+
276+
#------------------------------------------------------
277+
# Classify pages as a JSON string
278+
puts "Classify pages as a JSON string"
279+
280+
outputFile = $outputPath + "Scientific_Publication_Classified.json"
281+
json = DataExtractionModule.ExtractData($inputPath + "Scientific_Publication.pdf", DataExtractionModule::E_DocClassification)
282+
File.open(outputFile, 'w') { |file| file.write(json) }
283+
284+
puts "Result saved in " + outputFile
285+
286+
#------------------------------------------------------
287+
# Example with customized options:
288+
puts "Classify pages with customized options"
289+
290+
options = DataExtractionOptions.new()
291+
# Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
292+
options.SetMinimumConfidenceThreshold(0.7)
293+
outputFile = $outputPath + "Email_Classified.json"
294+
DataExtractionModule.ExtractData($inputPath + "Email.pdf", outputFile, DataExtractionModule::E_DocClassification, options)
295+
296+
puts "Result saved in " + outputFile
297+
298+
rescue => error
299+
puts "Unable to extract document structure data, error: " + error.message
300+
end
301+
end
302+
247303
#-----------------------------------------------------------------------------------
248304

249305
PDFNet.Terminate

Samples/TestFiles/Email.pdf

188 KB
Binary file not shown.

Samples/TestFiles/Invoice.pdf

488 KB
Binary file not shown.
Samples/TestFiles/Scientific_Publication.pdf

269 KB
Binary file not shown.

0 commit comments

Comments
 (0)