Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions Samples/DataExtractionTest/GO/DataExtraction_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,60 @@ func GenericKeyValueTest() (err error) {
return nil
}

//---------------------------------------------------------------------------------------
// The following sample illustrates how to extract document classes from PDF documents.
//---------------------------------------------------------------------------------------

// DocClassifierTest runs three document-classification samples through the
// data extraction module: writing the result straight to a JSON file,
// receiving the result as a JSON string and saving it with WriteTextToFile,
// and writing to a JSON file with a minimum confidence threshold applied.
//
// When the classification module is reported as unavailable, an installation
// notice is printed and nil is returned without running any sample.
//
// The named result err is handed to the deferred catch helper, which is
// defined elsewhere in this file (presumably it converts a panic raised by
// the SDK bindings into err; confirm against its definition).
func DocClassifierTest() (err error) {
	defer catch(&err)

	// Test if the add-on is installed
	if available := DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_DocClassification); !available {
		notice := []string{
			"",
			"Unable to run Data Extraction: PDFTron SDK Structured Output module not available.",
			"-----------------------------------------------------------------------------",
			"The Data Extraction suite is an optional add-on, available for download",
			"at https://docs.apryse.com/documentation/core/info/modules/. If you have already",
			"downloaded this module, ensure that the SDK is able to find the required files",
			"using the PDFNetAddResourceSearchPath() function.",
			"",
		}
		for _, line := range notice {
			fmt.Println(line)
		}
		return nil
	}

	// Sample 1: the module writes the classification result directly to a JSON file.
	fmt.Println("Classify pages as a JSON file")

	srcPDF := inputPath + "Invoice.pdf"
	dstJSON := outputPath + "Invoice_Classified.json"
	DataExtractionModuleExtractData(srcPDF, dstJSON, DataExtractionModuleE_DocClassification)

	fmt.Println("Result saved in " + dstJSON)

	// Sample 2: the module returns the result as a JSON string, which is then
	// persisted by the WriteTextToFile helper.
	fmt.Println("Classify pages as a JSON string")

	srcPDF = inputPath + "Scientific_Publication.pdf"
	dstJSON = outputPath + "Scientific_Publication_Classified.json"
	classification := DataExtractionModuleExtractData(srcPDF, DataExtractionModuleE_DocClassification).(string)
	WriteTextToFile(dstJSON, classification)

	fmt.Println("Result saved in " + dstJSON)

	// Sample 3: same as sample 1, but with customized extraction options.
	fmt.Println("Classify pages with customized options")

	srcPDF = inputPath + "Email.pdf"
	dstJSON = outputPath + "Email_Classified.json"
	opts := NewDataExtractionOptions()
	// Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
	opts.SetMinimumConfidenceThreshold(0.7)
	DataExtractionModuleExtractData(srcPDF, dstJSON, DataExtractionModuleE_DocClassification, opts)

	fmt.Println("Result saved in " + dstJSON)

	return nil
}

//---------------------------------------------------------------------------------------

func TestDataExtraction(t *testing.T) {
Expand Down Expand Up @@ -335,13 +389,22 @@ func TestDataExtraction(t *testing.T) {
fmt.Println(fmt.Errorf("Unable to extract form fields data, error: %s", err))
}

//-----------------------------------------------------------------------------------

err = GenericKeyValueTest()
if err != nil {
fmt.Println(fmt.Errorf("Unable to extract key-value pairs, error: %s", err))
}

//-----------------------------------------------------------------------------------

err = DocClassifierTest()
if err != nil {
fmt.Println(fmt.Errorf("Unable to extract document classifications, error: %s", err))
}

//-----------------------------------------------------------------------------------

PDFNetTerminate()
fmt.Println("Done.")
}
56 changes: 54 additions & 2 deletions Samples/DataExtractionTest/PHP/DataExtractionTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ function main()
}

//////////////////////////////////////////////////////////////////////////
// The following sample illustrates how to extract document structure from PDF documents.
// The following sample illustrates how to extract key-value pairs from PDF documents.
//////////////////////////////////////////////////////////////////////////

// Test if the add-on is installed
Expand All @@ -239,7 +239,7 @@ function main()
// Example with customized options:
// Extract Keys & Values from pages 2-4, excluding ads
$options = new DataExtractionOptions();
$options->setPages("2-4");
$options->SetPages("2-4");

$p2ExclusionZones = new RectCollection();
// Exclude the ad on page 2
Expand Down Expand Up @@ -267,6 +267,58 @@ function main()
}
}

//////////////////////////////////////////////////////////////////////////
// The following sample illustrates how to extract document classes from PDF documents.
//////////////////////////////////////////////////////////////////////////

// Test if the add-on is installed
if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocClassification)) {
echo(nl2br("\n"));
echo(nl2br("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.\n"));
echo(nl2br("-----------------------------------------------------------------------------\n"));
echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
echo(nl2br("at https://docs.apryse.com/documentation/core/info/modules/. If you have already\n"));
echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
echo(nl2br("\n"));
}
else {
try {
// Simple example: classify pages as a JSON file
echo(nl2br("Classify pages as a JSON file\n"));

$outputFile = $outputPath."Invoice_Classified.json";
DataExtractionModule::ExtractData($inputPath."Invoice.pdf", $outputFile, DataExtractionModule::e_DocClassification);

echo(nl2br("Result saved in " . $outputFile . "\n"));

///////////////////////////////////////////////////////
// Classify pages as a JSON string
echo(nl2br("Classify pages as a JSON string\n"));

$outputFile = $outputPath."Scientific_Publication_Classified.json";
$json = DataExtractionModule::ExtractData($inputPath."Scientific_Publication.pdf", DataExtractionModule::e_DocClassification);
WriteTextToFile($outputFile, $json);

echo(nl2br("Result saved in " . $outputFile . "\n"));

///////////////////////////////////////////////////////
// Example with customized options:
echo(nl2br("Classify pages with customized options\n"));

$options = new DataExtractionOptions();
// Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
$options->SetMinimumConfidenceThreshold(0.7);
$outputFile = $outputPath."Email_Classified.json";
DataExtractionModule::ExtractData($inputPath."Email.pdf", $outputFile, DataExtractionModule::e_DocClassification, $options);

echo(nl2br("Result saved in " . $outputFile . "\n"));
}
catch(Exception $e) {
echo(nl2br("Unable to extract document structure data, error: " . $e->getMessage() . "\n"));
}
}

//-----------------------------------------------------------------------------------

PDFNet::Terminate();
Expand Down
49 changes: 49 additions & 0 deletions Samples/DataExtractionTest/PYTHON/DataExtractionTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,55 @@ def main():
print("Unable to extract key-value data, error: " + str(e))


#-----------------------------------------------------------------------------------
# The following sample illustrates how to extract document classes from PDF documents.
#-----------------------------------------------------------------------------------

# Test if the add-on is installed
if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_DocClassification):
print("")
print("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
print("-----------------------------------------------------------------------------")
print("The Data Extraction suite is an optional add-on, available for download")
print("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
print("downloaded this module, ensure that the SDK is able to find the required files")
print("using the PDFNet.AddResourceSearchPath() function.")
print("")
else:
try:
# Simple example: classify pages as a JSON file
print("Classify pages as a JSON file")

outputFile = outputPath + "Invoice_Classified.json"
DataExtractionModule.ExtractData(inputPath + "Invoice.pdf", outputFile, DataExtractionModule.e_DocClassification)

print("Result saved in " + outputFile)

#------------------------------------------------------
# Classify pages as a JSON string
print("Classify pages as a JSON string")

outputFile = outputPath + "Scientific_Publication_Classified.json"
json = DataExtractionModule.ExtractData(inputPath + "Scientific_Publication.pdf", DataExtractionModule.e_DocClassification)
WriteTextToFile(outputFile, json)

print("Result saved in " + outputFile)

#------------------------------------------------------
# Example with customized options:
print("Classify pages with customized options")

options = DataExtractionOptions()
# Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
options.SetMinimumConfidenceThreshold(0.7)
outputFile = outputPath + "Email_Classified.json"
DataExtractionModule.ExtractData(inputPath + "Email.pdf", outputFile, DataExtractionModule.e_DocClassification, options)

print("Result saved in " + outputFile)

except Exception as e:
print("Unable to extract document structure data, error: " + str(e))

PDFNet.Terminate()
print("Done.")

Expand Down
56 changes: 56 additions & 0 deletions Samples/DataExtractionTest/RUBY/DataExtractionTest.rb
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,11 @@ def main()
end
end

#-----------------------------------------------------------------------------------
# The following sample illustrates how to extract key-value pairs from PDF documents.
#-----------------------------------------------------------------------------------

# Test if the add-on is installed
if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_GenericKeyValue) then
puts ""
puts "Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available."
Expand Down Expand Up @@ -244,6 +249,57 @@ def main()
end
end

#-----------------------------------------------------------------------------------
# The following sample illustrates how to extract document classes from PDF documents.
#-----------------------------------------------------------------------------------

# Test if the add-on is installed
if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_DocClassification) then
puts ""
puts "Unable to run Data Extraction: PDFTron SDK Structured Output module not available."
puts "-----------------------------------------------------------------------------"
puts "The Data Extraction suite is an optional add-on, available for download"
puts "at https://docs.apryse.com/documentation/core/info/modules/. If you have already"
puts "downloaded this module, ensure that the SDK is able to find the required files"
puts "using the PDFNet.AddResourceSearchPath() function."
puts ""
else
begin
# Simple example: classify pages as a JSON file
puts "Classify pages as a JSON file"

outputFile = $outputPath + "Invoice_Classified.json"
DataExtractionModule.ExtractData($inputPath + "Invoice.pdf", outputFile, DataExtractionModule::E_DocClassification)

puts "Result saved in " + outputFile

#------------------------------------------------------
# Classify pages as a JSON string
puts "Classify pages as a JSON string"

outputFile = $outputPath + "Scientific_Publication_Classified.json"
json = DataExtractionModule.ExtractData($inputPath + "Scientific_Publication.pdf", DataExtractionModule::E_DocClassification)
File.open(outputFile, 'w') { |file| file.write(json) }

puts "Result saved in " + outputFile

#------------------------------------------------------
# Example with customized options:
puts "Classify pages with customized options"

options = DataExtractionOptions.new()
# Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
options.SetMinimumConfidenceThreshold(0.7)
outputFile = $outputPath + "Email_Classified.json"
DataExtractionModule.ExtractData($inputPath + "Email.pdf", outputFile, DataExtractionModule::E_DocClassification, options)

puts "Result saved in " + outputFile

rescue => error
puts "Unable to extract document structure data, error: " + error.message
end
end

#-----------------------------------------------------------------------------------

PDFNet.Terminate
Expand Down
Binary file added Samples/TestFiles/Email.pdf
Binary file not shown.
Binary file added Samples/TestFiles/Invoice.pdf
Binary file not shown.
Binary file added Samples/TestFiles/Scientific_Publication.pdf
Binary file not shown.